PDBx/mmCIF Dictionary Resources

About This Example

This example shows how the (sequence) information contained in a CIF file can be readily accessed and transformed into another format. This particular example implements a FASTA converter, which reads the monomer sequences in the entity_poly_seq category and translates them into the single-letter FASTA format. As there are single-letter codifications already in the CIF file, in the entity_poly category, this example also shows how one might easily compare their translations with the codifications already in the CIF file.

Build Instructions

FILES: FASTA.py, 5HVP.cif

Save FASTA.py and save the CIF data file. Run python FASTA.py /path/to/file.cif

Functions of Note

from pdbx.reader.PdbxContainers import ContainerBase
from pdbx.reader.PdbxContainers import DataCategory

getObj(self, name) Returns the DataCategory object specified by name.
hasAttribute(self, attributeName) Returns a bool indicating whether or not the category has the attribute attributeName.
getValue(self, attributeName=None, rowIndex=None) Returns the value of the attribute attributeName at row index rowIndex.
getRowCount(self)Returns the number of rows in the category table.

Basic Sample Output

Output of the command: python FASTA.py 5HVP.cif

For entity #1:

The codified version in the CIF file is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF

The codified version obtained from translating entity_poly_seq is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF

The two are equivalent.
---
For entity #2:

The codified version in the CIF file is as follows:
XVVXAX

The codified version obtained from translating entity_poly_seq is as follows:
XVVXAX

The two are equivalent.
---
more 5HVP.fasta
>HIV-1 PROTEASE
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
>ACETYL-*PEPSTATIN
XVVXAX

Example Source Code

"""
 FASTA.py

 For some CIF file, generate a FASTA file based on the monomer sequences
 stored in the struct_poly_seq category table. Optionally compare our
 codification with the one stored in the struct_poly category table.

 Lines with superscriptions contain footnoted references or explanations.
"""

from os.path import splitext
from pdbx.reader.PdbxReader import PdbxReader
from pdbx.reader.PdbxContainers import *
from sys import argv, exit

# Amino acid FASTA codification dictionary
codification = { "ALA" : 'A',
                 "CYS" : 'C',
                 "ASP" : 'D',
                 "GLU" : 'E',
                 "PHE" : 'F',
                 "GLY" : 'G',
                 "HIS" : 'H',
                 "ILE" : 'I',
                 "LYS" : 'K',
                 "LEU" : 'L',
                 "MET" : 'M',
                 "ASN" : 'N',
                 "PYL" : 'O',
                 "PRO" : 'P',
                 "GLN" : 'Q',
                 "ARG" : 'R',
                 "SER" : 'S',
                 "THR" : 'T',
                 "SEC" : 'U',
                 "VAL" : 'V',
                 "TRP" : 'W',
                 "TYR" : 'Y' }

# Compare our translations against the codifications stored in the CIF file
compare = True

# Check for improper usage
if len(argv) != 2 :
    exit("Usage: python FASTA.py /path/to/file.cif")

# Open the CIF file
cif = open(argv[1])

# A list to be propagated with data blocks
data = []

# Create a PdbxReader object 
pRd = PdbxReader(cif)

# Read the CIF file, propagating the data list
pRd.read(data)

# Close the CIF file, as it is no longer needed
cif.close()

# Retrieve the first data block
block = data[0]

# Retrieve the entity category table, which contains information that will be used in the FASTA header¹
entity = block.getObj("entity")

# Holds non-mandatory entity attributes that could serve as FASTA header lines, ordered preferentially
candidates = ["pdbx_description", "details", "type"]

# The entity category table attribute to be used to fill the FASTA header
headerDescriptor = ""

# Set the header descriptor
for c in candidates :
    if entity.hasAttribute(c) :
        headerDescriptor = c
        break

# If none of the optional descriptors are present, just use the entity id
if not headerDescriptor: headerDescriptor = "id"

# Retrieve the entity_poly_seq category table, which containers the monomer sequences for entities²
entity_poly_seq = block.getObj("entity_poly_seq")

# Use the CIF file pathname to generate the FASTA file (.fasta) pathname
(file, ext) = splitext(argv[1])
fastaFileName = file + ".fasta"

# Open the FASTA file for writing
fasta = open(fastaFileName, "w")

# A container to hold each codified entity for optional comparison
codifiedEntities = []

# Track the current entity
currentEntity = 1

# Track the column width
columnWidth = 0

# Codification of the current entity
codifiedEntity = ""

# Write the first header to the FASTA file
fasta.write(">%s\n" % entity.getValue(headerDescriptor, currentEntity - 1))

# Iterate over every row in entity_poly_seq, each containing an entity monomer
for index in range(entity_poly_seq.getRowCount()) :

    # Obtain the ID of the entity monomer described by this row
    tempEntity = (int)(entity_poly_seq.getValue("entity_id", index))
    
    # If we are dealing with a new entity
    if currentEntity != tempEntity :

        # Write out the current entity's FASTA codification
        fasta.write(codifiedEntity)
        
        # Store the current entity's FASTA codification
        codifiedEntities.append(codifiedEntity)

        # Set the new entity ID
        currentEntity = tempEntity

        # Write out the new header
        fasta.write("\n>%s\n" % entity.getValue(headerDescriptor, currentEntity - 1))

        columnWidth = 0
        codifiedEntity = ""

    # If we have hit the maximum column width
    if columnWidth == 80 :

        # Move to a new line and reset the width
        codifiedEntity += "\n"
        columnWidth = 0

    # Retrieve the monomer stored in this row
    monomer = entity_poly_seq.getValue("mon_id", index)

    # If the monomer is an amino acid
    if len(monomer) == 3 :
	
        # If it's in the codification dictionary, add it
        if codification.has_key(monomer) :
            codifiedEntity += codification[monomer]
			
        # Otherwise, use the default value "X"
        else :
            codifiedEntity += "X"

    # If it's a nucleic acid, there is nothing to codify
    else :
        codifiedEntity += monomer

    columnWidth += 1

# Write out and store the last entity
fasta.write(codifiedEntity)
codifiedEntities.append(codifiedEntity)

# Close the FASTA file
fasta.close()

# Optional comparison against the codified sequences stored in entity_poly
if compare :

    # Retrieve the table corresponding to the entity_poly category, which contains one letter
    # codified canonical sequences for entities, against which we can compare our conversions³
    entity_poly = block.getObj("entity_poly")

    # Compare every codification with the one in the CIF file
    for i in range(len(codifiedEntities)) :
        code = entity_poly.getValue("pdbx_seq_one_letter_code_can", i)
        print "For entity #%d: " % (i + 1)
        print "\nThe codified version in the CIF file is as follows: " 
        print code
        print "\nThe codified version obtained from translating entity_poly_seq is as follows: "
        print codifiedEntities[i]
        equality = (code == codifiedEntities[i]) and "equivalent" or "inequivalent"
        print "\nThe two are %s." % (equality) 
        print "---"