This example shows how the (sequence) information contained in a CIF file can be readily accessed and transformed into another format. This particular example implements a FASTA converter, which reads the monomer sequences in the entity_poly_seq category and translates them into the single-letter FASTA format. As there are single-letter codifications already in the CIF file, in the entity_poly category, this example also shows how one might easily compare their translations with the codifications already in the CIF file.
Save FASTA.py and save the CIF data file. Run python FASTA.py /path/to/file.cif
from pdbx.reader.PdbxContainers import ContainerBase from pdbx.reader.PdbxContainers import DataCategory
getObj(self, name) Returns the DataCategory object specified by name.hasAttribute(self, attributeName) Returns a bool indicating whether or not the category has the attribute attributeName.getValue(self, attributeName=None, rowIndex=None) Returns the value of the attribute attributeName at row index rowIndex.getRowCount(self)Returns the number of rows in the category table.Output of the command: python FASTA.py 5HVP.cif
For entity #1:
The codified version in the CIF file is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
The codified version obtained from translating entity_poly_seq is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
The two are equivalent.
---
For entity #2:
The codified version in the CIF file is as follows:
XVVXAX
The codified version obtained from translating entity_poly_seq is as follows:
XVVXAX
The two are equivalent.
---
more 5HVP.fasta
>HIV-1 PROTEASE
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
>ACETYL-*PEPSTATIN
XVVXAX
"""
FASTA.py
For some CIF file, generate a FASTA file based on the monomer sequences
stored in the struct_poly_seq category table. Optionally compare our
codification with the one stored in the struct_poly category table.
Lines with superscriptions contain footnoted references or explanations.
"""
from os.path import splitext
from pdbx.reader.PdbxReader import PdbxReader
from pdbx.reader.PdbxContainers import *
from sys import argv, exit
# Amino acid FASTA codification dictionary
codification = { "ALA" : 'A',
"CYS" : 'C',
"ASP" : 'D',
"GLU" : 'E',
"PHE" : 'F',
"GLY" : 'G',
"HIS" : 'H',
"ILE" : 'I',
"LYS" : 'K',
"LEU" : 'L',
"MET" : 'M',
"ASN" : 'N',
"PYL" : 'O',
"PRO" : 'P',
"GLN" : 'Q',
"ARG" : 'R',
"SER" : 'S',
"THR" : 'T',
"SEC" : 'U',
"VAL" : 'V',
"TRP" : 'W',
"TYR" : 'Y' }
# Compare our translations against the codifications stored in the CIF file
compare = True
# Check for improper usage
if len(argv) != 2 :
exit("Usage: python FASTA.py /path/to/file.cif")
# Open the CIF file
cif = open(argv[1])
# A list to be propagated with data blocks
data = []
# Create a PdbxReader object
pRd = PdbxReader(cif)
# Read the CIF file, propagating the data list
pRd.read(data)
# Close the CIF file, as it is no longer needed
cif.close()
# Retrieve the first data block
block = data[0]
# Retrieve the entity category table, which contains information that will be used in the FASTA header1
entity = block.getObj("entity")
# Holds non-mandatory entity attributes that could serve as FASTA header lines, ordered preferentially
candidates = ["pdbx_description", "details", "type"]
# The entity category table attribute to be used to fill the FASTA header
headerDescriptor = ""
# Set the header descriptor
for c in candidates :
if entity.hasAttribute(c) :
headerDescriptor = c
break
# If none of the optional descriptors are present, just use the entity id
if not headerDescriptor: headerDescriptor = "id"
# Retrieve the entity_poly_seq category table, which containers the monomer sequences for entities2
entity_poly_seq = block.getObj("entity_poly_seq")
# Use the CIF file pathname to generate the FASTA file (.fasta) pathname
(file, ext) = splitext(argv[1])
fastaFileName = file + ".fasta"
# Open the FASTA file for writing
fasta = open(fastaFileName, "w")
# A container to hold each codified entity for optional comparison
codifiedEntities = []
# Track the current entity
currentEntity = 1
# Track the column width
columnWidth = 0
# Codification of the current entity
codifiedEntity = ""
# Write the first header to the FASTA file
fasta.write(">%s\n" % entity.getValue(headerDescriptor, currentEntity - 1))
# Iterate over every row in entity_poly_seq, each containing an entity monomer
for index in range(entity_poly_seq.getRowCount()) :
# Obtain the ID of the entity monomer described by this row
tempEntity = (int)(entity_poly_seq.getValue("entity_id", index))
# If we are dealing with a new entity
if currentEntity != tempEntity :
# Write out the current entity's FASTA codification
fasta.write(codifiedEntity)
# Store the current entity's FASTA codification
codifiedEntities.append(codifiedEntity)
# Set the new entity ID
currentEntity = tempEntity
# Write out the new header
fasta.write("\n>%s\n" % entity.getValue(headerDescriptor, currentEntity - 1))
columnWidth = 0
codifiedEntity = ""
# If we have hit the maximum column width
if columnWidth == 80 :
# Move to a new line and reset the width
codifiedEntity += "\n"
columnWidth = 0
# Retrieve the monomer stored in this row
monomer = entity_poly_seq.getValue("mon_id", index)
# If the monomer is an amino acid
if len(monomer) == 3 :
# If it's in the codification dictionary, add it
if codification.has_key(monomer) :
codifiedEntity += codification[monomer]
# Otherwise, use the default value "X"
else :
codifiedEntity += "X"
# If it's a nucleic acid, there is nothing to codify
else :
codifiedEntity += monomer
columnWidth += 1
# Write out and store the last entity
fasta.write(codifiedEntity)
codifiedEntities.append(codifiedEntity)
# Close the FASTA file
fasta.close()
# Optional comparison against the codified sequences stored in entity_poly
if compare :
# Retrieve the table corresponding to the entity_poly category, which contains one letter
# codified canonical sequences for entities, against which we can compare our conversions3
entity_poly = block.getObj("entity_poly")
# Compare every codification with the one in the CIF file
for i in range(len(codifiedEntities)) :
code = entity_poly.getValue("pdbx_seq_one_letter_code_can", i)
print "For entity #%d: " % (i + 1)
print "\nThe codified version in the CIF file is as follows: "
print code
print "\nThe codified version obtained from translating entity_poly_seq is as follows: "
print codifiedEntities[i]
equality = (code == codifiedEntities[i]) and "equivalent" or "inequivalent"
print "\nThe two are %s." % (equality)
print "---"