PerlMol - Example: Peptide builder

Peptide builder sample script

By Ivan Tubert-Brohman, upon a suggestion by Ramón Garduño-Vázquez

Builds a 3D structure for a peptide given a sequence (the structure is essentialy linear; with some minor tweaks it could solve the protein-folding problem ;-)

It uses mdl molfiles. I know that for peptides PDB would make more sense, but for some reason I already had a few aminoacid structures in my drive as MDL molfiles and was too lazy to deal with PDB. Some things would have to be changed for it to generate a proper PDB file with the residue sequence numbers and such.

# usage: ./pep.pl <SEQUENCE>
# prints an mdl molfile to stdout

# curent version only supports the residues A,G,L,V

Files

peptide_builder.pl - the program
ala.mol - alanine with 3D coordinates
gly.mol - glycine with 3D coordinates
leu.mol - leucine with 3D coordinates
val.mol - valine with 3D coordinates
lava.mol - expected program output for ./peptide_builder LAVA

Code

#!/home/ivan/bin/perl
# usage: ./pep.pl <SEQUENCE>
# prints an mdl molfile to stdout

# curent version only supports the residues A,G,L,V
# the residues are assumed to be in the files listed below in %files,
# which must have the atoms N, CA, and C as the first three atoms in the 
# file

use strict;
use warnings;
#use diagnostics;

use Chemistry::File::MDLMol;
use Chemistry::File::SMARTS;
use Chemistry::InternalCoords::Builder qw(build_zmat);

{
    # the files with the template aminoacids (3D coords)
    my %files = (
        A => 'ala.mol',
        V => 'val.mol',
        G => 'gly.mol',
        L => 'leu.mol',
    );

    # This defines the essential part of an aminoacid
    my $patt = Chemistry::Pattern->parse('C((=O)CN([C,H])[H])O[H]', 
        format => 'smarts');

    my %aa;

    my ($c, $o, $ca, $n, $hn);

    # return a molecule object from an aminoacid given its one-letter code
    sub amino_acid {
        my ($code) = @_;
        return $aa{$code} if $aa{$code};
        my $file = $files{$code} or die "unknown code '$code', stopping ";
        $aa{$code} = Chemistry::Mol->read($file);
    }

    # start a new chain from a residue (a Chemistry::Mol object).
    # returns the new chain (a Chemistry::Mol object)
    sub start_chain {
        my ($res) = @_;

        # clone because we don't want to destroy the template
        my $chain = safe_clone($res);

        # find out which atom is which
        $patt->match($chain) or die "didn't match, stopping ";
        my ($oh, $ho, $hnb);
        ($c, $o, $ca, $n, $hn, $hnb, $oh, $ho) = $patt->atom_map;

        # chop off the carboxyl OH
        $oh->delete; $ho->delete;

        # generate internal coordinates. 
        build_zmat($chain, bfs => 0);
        $chain;
    }

    # add a new residue (a Chemistry::Mol object) to a chain
    sub add_residue {
        my ($chain, $res) = @_;

        # clone because we don't want to destroy the template
        $res = safe_clone($res);

        # find out which atom is which
        $patt->match($res) or die "didn't match, stopping ";
        my @map = $patt->atom_map;

        # delete the hydroxyl and an HN atom
        $_->delete for splice @map, 5;

        # generate internal coordinates. 
        # The bfs => 0 is needed to make sure
        # that the first three atoms are N, CA, and C, so that we can use 
        # them reliably for positioning the new residue with respect to
        # the previous one
        build_zmat($res, bfs => 0);

        # add the new residue
        $chain->combine($res);

        # since combine clones $res, we want to find the atoms of interest
        # in the new part of the chain, by "translating" the atom_map
        my ($c2, $o2, $ca2, $n2, $hn2) = 
            map { $chain->by_id($_->id) } @map;

        # create the peptide bond
        $chain->new_bond(atoms => [$n2, $c], order => 1);

        # add some roughly reasonable internal coordinates to position the
        # new residue
        $n2 ->internal_coords($c, 1.5, $ca, 120, $o, 180);
        $ca2->internal_coords($n2, 1.5, $c, 120, $ca, 180);
        $c2 ->internal_coords($ca2, 1.5, $n2, 120, $c, 180);

        # make sure that the N is planar
        $hn2->internal_coords($n2, 1.1, $c, 120, $o, 180)
            if $hn2->symbol eq 'H'; # make sure it's not Proline

        # save the new terminal residue's atoms of interest
        ($c, $o, $ca, $n, $hn) = ($c2, $o2, $ca2, $n2, $hn2);
    }

    sub end_chain {
        my ($chain) = @_;

        # add the terminal hydroxyl group
        my $ox = $chain->new_atom(
            symbol => 'O', 
            internal_coords => [ $c, 1.5, $ca, 120, $o, 180 ]
        );
        my $h = $chain->new_atom(
            symbol => 'H', 
            internal_coords => [ $ox, 1.1, $c, 105, $o, 0 ]
        );
        $chain->new_bond(atoms => [$ox, $c], order => 1);
        $chain->new_bond(atoms => [$ox, $h], order => 1);

        # finally, calculate the cartesian coordinates for all the atoms in
        # the chain
        $_->internal_coords->add_cartesians for $chain->atoms;
    }

}

# this is like Chemistry::Mol::clone, but assigns a fresh ID to the cloned
# molecule, atoms, and bonds. This should be added to Chemistry::Mol
sub safe_clone {
    my ($mol) = @_;
    my $clone = $mol->clone;
    for ($clone, $clone->atoms, $clone->bonds) {
        # this uses the undocumented nextID method in Chemistry::Mol,
        # Chemistry::Atom, and Chemistry::Bond
        $_->id($_->nextID);
    }
    $clone;
}

# takes a sequence string such as "LAVA" and returns a Chemistry::Mol object
sub build_sequence {
    my ($s) = @_;
    $s or die "no sequence, stopping ";
    my @seq = split //, $s;

    my $code = shift @seq;
    my $res = amino_acid($code);

    my $chain = start_chain($res);
    $chain->name($s);

    for my $code (@seq) {
        $res = amino_acid($code);
        add_residue($chain, $res);
    }
    end_chain($chain);

    return $chain;
}


##############################

# generate the structure and print it out as an MDL molfile
my $chain = build_sequence(shift);
print $chain->print(format => 'mdl');