PerlMol - Example: Polar surface area
Calculation of polar surface area based on fragment contributions (TPSA)
Sent by Konrad Koehler, Karo Bio AB
Originally a C program by Peter Ertl, Novartis, translated to Perl by Eric Allen, using DayPerl.
The program reads a list of SMILES strings from stdin, and writes SMILES and the calculated TPSA on stdout.
Files
- psa.pl - the program
- input.smi - a few sample SMILES strings
- output.smi - the results of running ./psa.pl -f input.smi
Code
#!/usr/bin/perl # # http://www.daylight.com/support/contrib/ertl/tpsa.pl # This script is a conversion of Peter Ertl's c program tpsa.c into Perl. # I added a few additional features, like the ability to use files as input # and output. -- Eric Allen # # +++++++++++++++++++++++++ # # Calculation of polar surface area based on fragment contributions (TPSA) # Peter Ertl, Novartis Pharma AG, Basel, August 2000 # peter.ertl@pharma.novartis.com || peter.ertl@altavista.net # The program uses the SMILES Daylight Toolkit module. # # For description of the methodology see : # P. Ertl, B. Rohde, P. Selzer # Fast Calculation of Molecular Polar Surface Area as a Sum of Fragment-based # Contributions and Its Application to the Prediction of Drug Transport # Properties, J.Med.Chem. 43, 3714-3717, 2000 # # The program reads a list of SMILES strings from stdin, and writes SMILES # and the calculated TPSA on stdout. # The program expects "problematic" nitrogens in pentavalent form, i.e. nitro # groups as -N(=O)=O, N-oxides as c1ccccn1=O, and azides as -N=N#N. # # This contrib program comes with absolutely no warranty of any kind. # # +++++++++++++++++++++++++ # # Revision History # # Date Who What # ----------- ------------------ -------------------------------------------- # 08-MAR-2001 Eric Allen Converted c program to Perl. # 13-MAY-2005 Konrad Koehler Replaced Daylight Dependencies with PerlMol modules. # use strict; # KFK: Commented out the following line and replaced by the following 9 lines: #use DayPerl; use Chemistry::Mol; use Chemistry::Atom; use Chemistry::Pattern; use Chemistry::File::SMILES; use Chemistry::File::SMARTS; use Chemistry::Bond::Find; use Chemistry::Ring; use Chemistry::Ring::Find ':all'; use Chemistry::Ring 'aromatize_mol'; my %inputArguments = @ARGV; my $calculatedOne = 0; my $usageString = "usage:\n\tcalculatepsa.pl 'smiles string'\nor\n\tcalculatepsa.pl -f filename [-o outputfilename]\n\n"; my $line; my $smiles; my $id; my $psa; my $message; my $filename; my $outfilename; my $writeToFile = 0; if($#ARGV > 0) { # means we have multiple arguements to deal with if(defined($inputArguments{-f})) { open INFILE,"<$inputArguments{-f}" or die "Could not open input file $inputArguments{-f}.\n"; } else { die "$usageString"; } if(defined($inputArguments{-o})) { open OUTFILE,">$inputArguments{-o}" or die "Could not open output file $inputArguments{-o}.\n"; $writeToFile = 1; } while($smiles = <INFILE>) { chomp $smiles; $message = ''; ($psa,$message) = &CalculatePSA($smiles); if($writeToFile) { if(length($message) > 0) { print OUTFILE "$smiles $message\n"; } else { print OUTFILE "$smiles\t$psa\n"; } } else { if(length($message) > 0) { print "$smiles $message\n"; } else { print "$smiles\t$psa\n"; } } } close INFILE; if($writeToFile) { close OUTFILE; } } elsif($#ARGV == 0) { # means we have a single smiles string to use $smiles = $ARGV[0]; chomp $smiles; ($psa,$message) = &CalculatePSA($smiles); if(length($message) > 0) { print "$smiles $message\n"; } else { print "$smiles\t$psa\n"; } } else { $smiles = <STDIN>; chomp $smiles; while(length($smiles) > 0) { $calculatedOne = 1; ($psa,$message) = &CalculatePSA($smiles); if(length($message) > 0) { print "$smiles $message\n"; } else { print "$smiles\t$psa\n"; } $smiles = <STDIN>; chomp $smiles; } unless($calculatedOne) { print "$usageString"; } } ################# # CalculatePSA ################# sub CalculatePSA { my ($smiles) = @_; my $atomic_number_nitrogen = 7; my $atomic_number_oxygen = 8; my $max_ring_count = 3; my $psa = 0; my $message; my $molecule; my $atoms; my $rings; my $bonds; my $oneAtom; my $oneBond; my $atomicNumber; my $isIn3Ring; my $hydrogenCount; my $formalCharge; my $isAromatic; my $oneRing; my $ringAtoms; my $oneRingAtom; my $resetRings; my $numberOfNeighbors; my $singleBondCount; my $doubleBondCount; my $tripleBondCount; my $aromaticBondCount; my $bondType; my $polarity; my $deAllocated; my $NTriple = 23.79; my $NHDouble = 23.85; my $NH2Single = 26.02; my $NH2PlusDouble = 25.59; my $NH3PlusSingle = 27.64; my $NSingleDouble = 12.36; my $NDoubleTriple = 13.60; my $NHSingleSingle = 12.03; my $NHSingleSingle3Ring = 21.94; my $NPlusSingleTriple = 4.36; my $NHPlusSingleDouble = 13.97; my $NH2PlusSingleSingle = 16.61; my $nAromaticAromatic = 12.89; my $nHAromaticAromatic = 15.79; my $nHPlusAromaticAromatic = 14.14; my $NSingleSingleSingle = 3.24; my $NSingleSingleSingle3Ring = 3.01; my $NSingleDoubleDouble = 11.68; my $NPlusSingleSingleDouble = 3.01; my $NPlusSingleSingleSingle = 4.44; my $nAromaticAromaticAromatic = 4.41; my $nSingleAromaticAromatic = 4.93; my $nDoubleAromaticAromatic = 8.39; my $nPlusAromaticAromaticAromatic = 4.10; my $nPlusSingleAromaticAromatic = 3.88; my $NPlusSingleSingleSingleSingle = 0.00; my $nonStandardNBasePolarity = 30.5; my $neighborNBasePolarity = 8.2; my $hydrogenBasePolarity = 1.5; my $ODouble = 17.07; my $OHSingle = 20.23; my $OMinusSingle = 23.06; my $OSingleSingle = 9.23; my $OSingleSingle3Ring = 12.53; my $oAromaticAromatic = 13.14; my $nonStandardOBasePolarity = 28.5; my $neighborOBasePolarity = 8.6; # KFK: replaced the following line with the next two lines: # $molecule = dt_smilin($smiles); $molecule = Chemistry::Mol->parse($smiles, format => 'smiles'); aromatize_mol($molecule); # <--- AROMATIZE!!! unless(defined($molecule)) { $message = "SMILES string invalid"; return ($psa,$message); } # KFK: replaced the next two lines with the following line: #$atoms = dt_stream($molecule,TYP_ATOM); #$rings = dt_stream($molecule,TYP_CYCLE); # KFK: added the next block of code to recplace the following block: my @atoms = $molecule->atoms; foreach (@atoms) { $oneAtom = $_; $oneAtom->collapse_hydrogens; $atomicNumber = $oneAtom->Z; if(($atomicNumber == $atomic_number_nitrogen) || ($atomicNumber == $atomic_number_oxygen)) { $hydrogenCount = $oneAtom->implicit_hydrogens; $formalCharge = $oneAtom->formal_charge; if ($formalCharge == "") { $formalCharge = 0; } $isAromatic = $oneAtom->aromatic; if (find_ring($oneAtom, size => '3') == "") { $isIn3Ring = 0; } else { $isIn3Ring = 1; } $numberOfNeighbors = $oneAtom->neighbors($oneAtom); $singleBondCount = 0; $doubleBondCount = 0; $tripleBondCount = 0; $aromaticBondCount = 0; my @bonds = $oneAtom->bonds($oneAtom); foreach (@bonds) { $oneBond = $_; my $bo = $oneBond->order; if ($bo == 1) { $singleBondCount++; } if ($bo == 2) { $doubleBondCount++; } if ($bo == 3) { $tripleBondCount++; } if ($oneBond->aromatic) { $aromaticBondCount++; } } # while($oneAtom = dt_next($atoms)) # { # $atomicNumber = dt_number($oneAtom); # if(($atomicNumber == $atomic_number_nitrogen) || ($atomicNumber == $atomic_number_oxygen)) # { # $hydrogenCount = dt_hcount($oneAtom); # $formalCharge = dt_charge($oneAtom); # $isAromatic = dt_aromatic($oneAtom); # $isIn3Ring = 0; # RINGCHECK: # while($oneRing = dt_next($rings)) # { # if(dt_count($oneRing,TYP_ATOM) > $max_ring_count) # { # next RINGCHECK; # } # $ringAtoms = dt_stream($oneRing,TYP_ATOM); # while($oneRingAtom = dt_next($ringAtoms)) # { # if($oneAtom == $oneRingAtom) # { # $isIn3Ring = 1; # last RINGCHECK; # } # } # } # $resetRings = dt_reset($rings); # # $numberOfNeighbors = 0; # $singleBondCount = 0; # $doubleBondCount = 0; # $tripleBondCount = 0; # $aromaticBondCount = 0; # $bonds = dt_stream($oneAtom,TYP_BOND); # while($oneBond = dt_next($bonds)) # { # $numberOfNeighbors++; # $bondType = dt_bondtype($oneBond); # if($bondType == DX_BTY_SINGLE) # { # $singleBondCount++; # } # elsif($bondType == DX_BTY_DOUBLE) # { # $doubleBondCount++; # } # elsif($bondType == DX_BTY_TRIPLE) # { # $tripleBondCount++; # } # elsif($bondType == DX_BTY_AROMAT) # { # $aromaticBondCount++; # } # } # $polarity = -1; if($atomicNumber == $atomic_number_nitrogen) { if($numberOfNeighbors == 1) { if(($hydrogenCount == 0) && ($tripleBondCount == 1) && ($formalCharge == 0)) { # N# $polarity = $NTriple; } elsif(($hydrogenCount == 1) && ($doubleBondCount == 1) && ($formalCharge == 0)) { # [NH]= $polarity = $NHDouble; } elsif(($hydrogenCount == 2) && ($singleBondCount == 1) && ($formalCharge == 0)) { # [NH2]- $polarity = $NH2Single; } elsif(($hydrogenCount == 2) && ($doubleBondCount == 1) && ($formalCharge == 1)) { # [NH2+]= $polarity = $NH2PlusDouble; } elsif(($hydrogenCount == 3) && ($singleBondCount == 1) && ($formalCharge == 1)) { # [NH3+]- $polarity = $NH3PlusSingle; } } elsif($numberOfNeighbors == 2) { if(($hydrogenCount == 0) && ($singleBondCount == 1) && ($doubleBondCount == 1) && ($formalCharge == 0)) { $polarity = $NSingleDouble; } elsif(($hydrogenCount == 0) && ($tripleBondCount == 1) && ($doubleBondCount == 1) && ($formalCharge == 0)) { $polarity = $NDoubleTriple; } elsif(($hydrogenCount == 1) && ($singleBondCount == 2) && ($formalCharge == 0) && !($isIn3Ring)) { $polarity = $NHSingleSingle; } elsif(($hydrogenCount == 1) && ($singleBondCount == 2) && ($formalCharge == 0) && ($isIn3Ring)) { $polarity = $NHSingleSingle3Ring; } elsif(($hydrogenCount == 0) && ($tripleBondCount == 1) && ($singleBondCount == 1) && ($formalCharge == 1)) { $polarity = $NPlusSingleTriple; } elsif(($hydrogenCount == 1) && ($doubleBondCount == 1) && ($singleBondCount == 1) && ($formalCharge == 1)) { $polarity = $NHPlusSingleDouble; } elsif(($hydrogenCount == 2) && ($singleBondCount == 2) && ($formalCharge == 1)) { $polarity = $NH2PlusSingleSingle; } elsif(($hydrogenCount == 0) && ($aromaticBondCount == 2) && ($formalCharge == 0)) { $polarity = $nAromaticAromatic; } elsif(($hydrogenCount == 1) && ($aromaticBondCount == 2) && ($formalCharge == 0)) { $polarity = $nHAromaticAromatic; } elsif(($hydrogenCount == 1) && ($aromaticBondCount == 2) && ($formalCharge == 1)) { $polarity = $nHPlusAromaticAromatic; } } elsif($numberOfNeighbors == 3) { if(($hydrogenCount == 0) && ($singleBondCount == 3) && ($formalCharge == 0) && !($isIn3Ring)) { $polarity = $NSingleSingleSingle; } elsif(($hydrogenCount == 0) && ($singleBondCount == 3) && ($formalCharge == 0) && ($isIn3Ring)) { $polarity = $NSingleSingleSingle3Ring; } elsif(($hydrogenCount == 0) && ($singleBondCount == 1) && ($doubleBondCount == 2) && ($formalCharge == 0)) { $polarity = $NSingleDoubleDouble; } elsif(($hydrogenCount == 0) && ($singleBondCount == 2) && ($doubleBondCount == 1) && ($formalCharge == 1)) { $polarity = $NPlusSingleSingleDouble; } elsif(($hydrogenCount == 1) && ($singleBondCount == 3) && ($formalCharge == 1)) { $polarity = $NPlusSingleSingleSingle; } elsif(($hydrogenCount == 0) && ($aromaticBondCount == 3) && ($formalCharge == 0)) { $polarity = $nAromaticAromaticAromatic; } elsif(($hydrogenCount == 0) && ($singleBondCount == 1) && ($aromaticBondCount == 2) && ($formalCharge == 0)) { $polarity = $nSingleAromaticAromatic; } elsif(($hydrogenCount == 0) && ($aromaticBondCount == 3) && ($formalCharge == 1)) { $polarity = $nPlusAromaticAromaticAromatic; } elsif(($hydrogenCount == 0) && ($singleBondCount == 1) && ($aromaticBondCount == 2) && ($formalCharge == 1)) { $polarity = $nPlusSingleAromaticAromatic; } } elsif($numberOfNeighbors == 4) { if(($hydrogenCount == 0) && ($singleBondCount == 4) && ($formalCharge == 1)) { $polarity = $NPlusSingleSingleSingleSingle; } } if($polarity < 0) # for an N with non-standard valence { $polarity = $nonStandardNBasePolarity - ($numberOfNeighbors * $neighborNBasePolarity) + ($hydrogenCount * $hydrogenBasePolarity); } if($polarity < 0) { $polarity = 0; } } elsif($atomicNumber == $atomic_number_oxygen) { if($numberOfNeighbors == 1) { if(($hydrogenCount == 0) && ($doubleBondCount == 1) && ($formalCharge == 0)) { $polarity = $ODouble; } elsif(($hydrogenCount == 1) && ($singleBondCount == 1) && ($formalCharge == 0)) { $polarity = $OHSingle; } elsif(($hydrogenCount == 0) && ($singleBondCount == 1) && ($formalCharge == -1)) { $polarity = $OMinusSingle; } } elsif($numberOfNeighbors == 2) { if(($hydrogenCount == 0) && ($singleBondCount == 2) && ($formalCharge == 0) && !($isIn3Ring)) { $polarity = $OSingleSingle; } elsif(($hydrogenCount == 0) && ($singleBondCount == 2) && ($formalCharge == 0) && ($isIn3Ring)) { $polarity = $OSingleSingle3Ring; } elsif(($hydrogenCount == 0) && ($aromaticBondCount == 2) && ($formalCharge == 0)) { $polarity = $oAromaticAromatic; } } if($polarity < 0) # O with non standard polarity { $polarity = $nonStandardOBasePolarity - ($numberOfNeighbors * $neighborOBasePolarity) + ($hydrogenCount * $hydrogenBasePolarity); } if($polarity < 0) { $polarity = 0; } } $psa += $polarity; } } # KFK: commented out the following deallocation calls which are no-longer needed: # $deAllocated = dt_dealloc($atoms); # $deAllocated = dt_dealloc($bonds); # $deAllocated = dt_dealloc($rings); # $deAllocated = dt_dealloc($oneAtom); # $deAllocated = dt_dealloc($oneBond); # $deAllocated = dt_dealloc($oneRing); # $deAllocated = dt_dealloc($ringAtoms); # $deAllocated = dt_dealloc($oneRingAtom); # $deAllocated = dt_dealloc($molecule); return $psa,$message; } ################# # #################