This example shows how the (sequence) information contained in a CIF file can be readily accessed and transformed into another format. This particular example implements a FASTA converter, which reads the monomer sequences in the entity_poly_seq category and translates them into the single-letter FASTA format. As there are single-letter codifications already in the CIF file, in the entity_poly category, this example also shows how one might easily compare their translations with the codifications already in the CIF file.
Save FASTA.C to/path/to/cifparse-obj-vX.X-prod-src/parser-test-app-vX.X/src/
Save the CIF file anywhere, e.g.,/path/to/cifparse-obj-vX.X-prod-src/parser-test-app-vX.X/bin/
Add FASTA.ext to the BASE_MAIN_FILES list in the Makefile in/path/to/cifparse-obj.vX.X-prod-src/parser-test-app-vX.X
Executemake
in the same directory as the Makefilecd
to bin, where the executable has been made, and run./FASTA /path/to/5HVP.cif
, which generates/path/to/5HVP.fasta
#include "CifFile.h"
CifFile::GetBlockNames(vector<string>& blockNames)
. #include "ISTable.h"
./FASTA 5HVP.cif
For entity #1: The codified version in the CIF file is as follows: PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT PVNIIGRNLLTQIGCTLNF The codified version obtained from translating entity_poly_seq is as follows: PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT PVNIIGRNLLTQIGCTLNF The two are equivalent. --- For entity #2: The codified version in the CIF file is as follows: XVVXAX The codified version obtained from translating entity_poly_seq is as follows: XVVXAX The two are equivalent. ---more 5HVP.fasta
>HIV-1 PROTEASE PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT PVNIIGRNLLTQIGCTLNF >ACETYL-*PEPSTATIN XVVXAX
/************************* * FASTA.C * * For some CIF file, generate a FASTA file based on the monomer sequences * stored in the struct_poly_seq category table. Optionally compare our * codification with the one stored in the struct_poly category table. * * Lines with superscriptions contain footnoted references or explanations. *************************/ #include <fstream> #include <cstring> #include <iostream> #include <map> #include <string> #include <vector> #include "CifFile.h" #include "CifParserBase.h" #include "ISTable.h" #include <boost/assign/list_of.hpp> #define COL_WIDTH_MAX 80 // Holds amino acid codification information std::map<string, char> codification; char identifyMonomer(const string& monomer); void prepareCodes(); void showUsage(); int main(int argc, char **argv) { if (argc != 2) { showUsage(); } // The name of the CIF file string cifFileName = argv[1]; // Parsing diagnostics string diagnostics; // Compare our codifications with those contained in the CIF file bool compare (true); // Create CIF file and parser objects CifFile *cifFileP = new CifFile; CifParser *cifParserP = new CifParser(cifFileP); // Parse the CIF file cifParserP->Parse(cifFileName, diagnostics); // Delete the CIF parser, as it is no longer needed delete cifParserP; // Display any diagnostics if (!diagnostics.empty()) { std::cout << "Diagnostics: " << std::endl << diagnostics << std::endl; } // Fill the codification map prepareCodes(); // Get the first data block name in the CIF file string firstBlockName = cifFileP->GetFirstBlockName(); // Retrieve the first data block Block &block = cifFileP->GetBlock(firstBlockName); // Retrieve the entity category table, which contains information that will be used in the FASTA1 ISTable& entity = block.GetTable("entity"); // Holds non-mandatory entity attributes that could serve as FASTA header lines, ordered preferentially string candidates[] = {"pdbx_description", "details", "type"}; // The entity category table attribute to be used to fill the FASTA header string headerDescriptor; // Find the first present candidate for (unsigned int i = 0; i < 3; ++i) { if (entity.IsColumnPresent(candidates[i])) { headerDescriptor = candidates[i]; break; } } // If none of the optional descriptors are present, just use the entity id if(headerDescriptor.empty()) { headerDescriptor = "id"; } // Retrieve the entity_poly_seq category group table, which contains the monomer sequences for entities2 ISTable& entity_poly_seq = block.GetTable("entity_poly_seq"); // Use the CIF file name to generate the FASTA file name size_t fileExtPos = cifFileName.find(".cif"); string outFileName = cifFileName.substr(0, fileExtPos) + ".fasta"; // Create and open the FASTA file std::ofstream outFile; outFile.open(outFileName.c_str()); // Container to hold each codified entity for optional comparison vector<string> codifiedEntities; // Keep track of the current column width and current entity id unsigned int columnWidth (0), entityID (1); // Holds the codification of the current entity string codifiedEntity; // Write the header to the FASTA file outFile << ">" << entity(entityID - 1, headerDescriptor) << std::endl; for (unsigned int i = 0; i < entity_poly_seq.GetNumRows(); ++i, ++columnWidth) { // Obtain the ID of the entity described by this row unsigned int currentEntityID = atoi(entity_poly_seq(i, "entity_id").c_str()); // If we are dealing with a new entity if (currentEntityID != entityID) { // Write out the current entity's FASTA codification outFile << codifiedEntity; // Store the current entity's FASTA codification codifiedEntities.push_back(codifiedEntity); // Set the new entity ID entityID = currentEntityID; // Write out the new header outFile << "\n>" << entity(entityID - 1, headerDescriptor) << std::endl; columnWidth = 0; codifiedEntity.clear(); } // If we have hit the maximum column width if (columnWidth == COL_WIDTH_MAX) { // Move to a new line and reset the width codifiedEntity += "\n"; columnWidth = 0; } // Retrieve the monomer stored in this row string monomer = entity_poly_seq(i, "mon_id"); // Identify the monomer and add its codification codifiedEntity.push_back(identifyMonomer(monomer)); } // Write out and store the last entity outFile << codifiedEntity; codifiedEntities.push_back(codifiedEntity); // Close the FASTA file as we have no more monomers to read outFile.close(); // Optional comparison against the codified sequences stored in entity_poly if (compare) { // Retrieve the table corresponding to the entity_poly category, which contains one-letter // codified canonical sequences for entities, against which we can compare our conversions3 ISTable& entity_poly = block.GetTable("entity_poly"); for (unsigned int i = 0; i < codifiedEntities.size(); ++i) { string temp = entity_poly(i, "pdbx_seq_one_letter_code_can"); std::cout << "For entity #" << i + 1 << ":" << std::endl << std::endl; std::cout << "The codified version in the CIF file is as follows: " << std::endl << temp << std::endl << std::endl; std::cout << "The codified version obtained from translating entity_poly_seq is as follows: " << std::endl << codifiedEntities[i] << std::endl << std::endl; int comparison = codifiedEntities[i].compare(temp); string equality = (!comparison) ? "equivalent" : "inequivalent"; std::cout << "The two are " << equality + "." << std::endl << "---------------" << std::endl; } } return 0; } char identifyMonomer(const string& monomer) { char code; // If we are dealing with an amino acid if (monomer.length() == 3) { // Find its codification in the map, using 'X' if it is not found code = (codification.find(monomer) != codification.end()) ? codification[monomer] : 'X'; } // If we are dealing with a nucleic acid, there is nothing to codify else { code = monomer[0]; } return code; } void prepareCodes() { codification = boost::assign::map_list_of ("ALA", 'A') ("CYS", 'C') ("ASP", 'D') ("GLU", 'E') ("PHE", 'F') ("GLY", 'G') ("HIS", 'H') ("ILE", 'I') ("LYS", 'K') ("LEU", 'L') ("MET", 'M') ("ASN", 'N') ("PYL", 'O') ("PRO", 'P') ("GLN", 'Q') ("ARG", 'R') ("SER", 'S') ("THR", 'T') ("SEC", 'U') ("VAL", 'V') ("TRP", 'W') ("TYR", 'Y'); } void showUsage() { std::cout << "Usage: ./FASTA /path/to/file.cif" << std::endl; exit(1); }