Spaces:
No application file
No application file
| # Copyright 2012 by Eric Talevich. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for accessing sequences in PDB and mmCIF files.""" | |
| import collections | |
| import warnings | |
| from Bio import BiopythonParserWarning | |
| from Bio.Data.PDBData import protein_letters_3to1 | |
| from Bio.Data.PDBData import protein_letters_3to1_extended | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import SequenceIterator | |
# Three-letter residue name -> one-letter code. The standard table is laid
# down first and the extended table is overlaid on top of it, so an extended
# entry wins if both tables define the same name.
_aa3to1_dict = {**protein_letters_3to1, **protein_letters_3to1_extended}
def _res2aacode(residue, undef_code="X"):
    """Translate a residue into its one-letter amino acid code.

    ``residue`` may be a three-letter residue name given as a string, or any
    object exposing that name through a ``resname`` attribute. Names that are
    missing from the lookup table yield ``undef_code`` ("X" by default).
    """
    name = residue if isinstance(residue, str) else residue.resname
    return _aa3to1_dict.get(name, undef_code)
def AtomIterator(pdb_id, structure):
    """Yield one SeqRecord per chain of the first model in a structure.

    Shared back end for the atom-based sequence parsers: once a Bio.PDB
    parser has loaded a Bio.PDB.Structure.Structure object, the residue
    sequence is interpreted the same way whatever the input format was.

    Arguments:
     - pdb_id - identifier used as the prefix of every record id
     - structure - the loaded structure; only ``structure[0]`` is read

    Chains are visited in sorted chain-id order. Residues whose upper-cased
    name has no one-letter code are discarded, and a chain left without any
    residues produces no record. Jumps in the residue numbering are padded
    with one "X" per missing position. If the numbering runs backwards after
    a jump, a BiopythonParserWarning is issued and the remainder of that
    chain is dropped from the sequence.

    Every record carries the annotations "molecule_type", "model", "chain",
    "start" and "end".

    See Bio.SeqIO.PdbIO.PdbAtomIterator and Bio.SeqIO.PdbIO.CifAtomIterator
    for details.
    """
    first_model = structure[0]
    for chn_id, chain in sorted(first_model.child_dict.items()):
        # HETATM modified-residue policy: keep a residue only if its name
        # translates to a known one-letter code, otherwise discard it.
        kept = [
            res
            for res in chain.get_unpacked_list()
            if _res2aacode(res.get_resname().upper()) != "X"
        ]
        if not kept:
            continue

        numbering = [res.id[1] for res in kept]

        # A break is any neighbouring pair whose numbers are neither equal
        # nor consecutive. Each entry is (index of the residue following the
        # break, number before the break, number after the break).
        breaks = [
            (idx, before, after)
            for idx, (before, after) in enumerate(
                zip(numbering, numbering[1:]), start=1
            )
            if after != before + 1 and after != before
        ]

        # Walk the breaks, copying each contiguous run and padding the hole
        # that follows it. With no breaks the loop body never runs and the
        # ``else`` clause copies the whole chain.
        letters = []
        cursor = 0
        for idx, before, after in breaks:
            if after <= before:
                warnings.warn(
                    "Ignoring out-of-order residues after a gap",
                    BiopythonParserWarning,
                )
                # Keep the run before the break, drop the out-of-order part
                # (presumably modified or hetatm residues, e.g. 3BEG)
                letters.extend(_res2aacode(res) for res in kept[cursor:idx])
                break
            letters.extend(_res2aacode(res) for res in kept[cursor:idx])
            cursor = idx
            letters.append("X" * (after - before - 1))
        else:
            # Final run (or the entire chain when there were no breaks)
            letters.extend(_res2aacode(res) for res in kept[cursor:])

        # ENH - include the model number in the record id when the structure
        # holds more than one model?
        record_id = f"{pdb_id}:{chn_id}"
        record = SeqRecord(
            Seq("".join(letters)), id=record_id, description=record_id
        )
        # TODO: Test PDB files with DNA and RNA too:
        record.annotations["molecule_type"] = "protein"
        record.annotations["model"] = first_model.id
        record.annotations["chain"] = chain.id
        record.annotations["start"] = int(numbering[0])
        record.annotations["end"] = int(numbering[-1])
        yield record
class PdbSeqresIterator(SequenceIterator):
    """Parser for PDB files."""

    def __init__(self, source):
        """Return SeqRecord objects for each chain in a PDB file.

        Arguments:
         - source - input stream opened in text mode, or a path to a file

        The sequences are derived from the SEQRES lines in the
        PDB file header, not the atoms of the 3D structure.

        Specifically, these PDB records are handled: SEQRES, DBREF, DBREF1
        and DBREF2. SEQADV and MODRES records are not interpreted yet.

        See: http://www.wwpdb.org/documentation/format23/sect3.html

        This gets called internally via Bio.SeqIO for the SEQRES based interpretation
        of the PDB file format:

        >>> from Bio import SeqIO
        >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
        ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
        ...     print(record.dbxrefs)
        ...
        Record id 1A8O:A, chain A
        ['UNP:P12497', 'UNP:POL_HV1N5']

        Equivalently,

        >>> with open("PDB/1A8O.pdb") as handle:
        ...     for record in PdbSeqresIterator(handle):
        ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
        ...         print(record.dbxrefs)
        ...
        Record id 1A8O:A, chain A
        ['UNP:P12497', 'UNP:POL_HV1N5']

        Note the chain is recorded in the annotations dictionary, and any PDB DBREF
        lines are recorded in the database cross-references list.
        """
        super().__init__(source, mode="t", fmt="PDB")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Iterate over the records in the PDB file.

        Yields one SeqRecord per chain identifier seen in SEQRES records, in
        sorted chain-id order. Chains that also have DBREF (or DBREF1/DBREF2)
        records get their id, name, description and dbxrefs from those
        records; other chains use the bare chain identifier as id.

        Raises ValueError if the handle yields no lines at all, if a DBREF2
        record appears before any DBREF/DBREF1 record, or if the PDB id or
        chain of a DBREF2 record does not match the preceding record.
        """
        chains = collections.defaultdict(list)
        metadata = collections.defaultdict(list)

        rec_name = None
        # Stays None until a DBREF or DBREF1 record has been read. Both of
        # those records also bind chn_id, database and db_id_code, so a
        # non-None pdb_id guarantees the DBREF2 branch has all it needs.
        pdb_id = None
        for line in handle:
            rec_name = line[0:6].strip()
            if rec_name == "SEQRES":
                # NB: We only actually need chain ID and the residues here;
                # commented bits are placeholders from the wwPDB spec.
                # Serial number of the SEQRES record for the current chain.
                # Starts at 1 and increments by one each line.
                # Reset to 1 for each chain.
                # ser_num = int(line[8:10])
                # Chain identifier. This may be any single legal character,
                # including a blank which is used if there is only one chain.
                chn_id = line[11]
                # Number of residues in the chain (repeated on every record)
                # num_res = int(line[13:17])
                residues = [_res2aacode(res) for res in line[19:].split()]
                chains[chn_id].extend(residues)
            elif rec_name == "DBREF":
                # ID code of this entry (PDB ID)
                pdb_id = line[7:11]
                # Chain identifier.
                chn_id = line[12]
                # Initial sequence number of the PDB sequence segment.
                # seq_begin = int(line[14:18])
                # Initial insertion code of the PDB sequence segment.
                # icode_begin = line[18]
                # Ending sequence number of the PDB sequence segment.
                # seq_end = int(line[20:24])
                # Ending insertion code of the PDB sequence segment.
                # icode_end = line[24]
                # Sequence database name.
                database = line[26:32].strip()
                # Sequence database accession code.
                db_acc = line[33:41].strip()
                # Sequence database identification code.
                db_id_code = line[42:54].strip()
                # Initial sequence number of the database segment.
                # db_seq_begin = int(line[55:60])
                # Insertion code of initial residue of the segment, if PDB is the
                # reference.
                # db_icode_begin = line[60]
                # Ending sequence number of the database segment.
                # db_seq_end = int(line[62:67])
                # Insertion code of the ending residue of the segment, if PDB is the
                # reference.
                # db_icode_end = line[67]
                metadata[chn_id].append(
                    {
                        "pdb_id": pdb_id,
                        "database": database,
                        "db_acc": db_acc,
                        "db_id_code": db_id_code,
                    }
                )
            elif rec_name == "DBREF1":
                # First half of a two-line reference; the entry is only
                # stored once the matching DBREF2 supplies the accession.
                # ID code of this entry (PDB ID)
                pdb_id = line[7:11]
                # Chain identifier.
                chn_id = line[12]
                # Sequence database name.
                database = line[26:32].strip()
                # Sequence database identification code.
                db_id_code = line[47:67].strip()
            elif rec_name == "DBREF2":
                # A DBREF2 with nothing before it used to fail on an unbound
                # local variable; report it as a malformed file instead.
                if pdb_id is None:
                    raise ValueError("DBREF2 record found before any DBREF1 record")
                # Ensure ID code and chain are consistent:
                if pdb_id != line[7:11] or chn_id != line[12]:
                    raise ValueError("DBREF2 identifiers do not match")
                # Sequence database accession code.
                db_acc = line[18:40].strip()
                metadata[chn_id].append(
                    {
                        "pdb_id": pdb_id,
                        "database": database,
                        "db_acc": db_acc,
                        "db_id_code": db_id_code,
                    }
                )
            # ENH: 'SEQADV' 'MODRES'

        # rec_name is still None only if the loop body never ran.
        if rec_name is None:
            raise ValueError("Empty file.")

        for chn_id, residues in sorted(chains.items()):
            record = SeqRecord(Seq("".join(residues)))
            record.annotations = {"chain": chn_id}
            # TODO: Test PDB files with DNA and RNA too:
            record.annotations["molecule_type"] = "protein"
            if chn_id in metadata:
                # The first reference for the chain names the record; every
                # reference contributes two cross-references.
                m = metadata[chn_id][0]
                record.id = record.name = f"{m['pdb_id']}:{chn_id}"
                record.description = f"{m['database']}:{m['db_acc']} {m['db_id_code']}"
                for melem in metadata[chn_id]:
                    record.dbxrefs.extend(
                        [
                            f"{melem['database']}:{melem['db_acc']}",
                            f"{melem['database']}:{melem['db_id_code']}",
                        ]
                    )
            else:
                record.id = chn_id
            yield record
def PdbAtomIterator(source):
    """Return SeqRecord objects for each chain in a PDB file.

    Argument source is a file-like object or a path to a file.

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    The structure is loaded with Bio.PDB.PDBParser and handed to
    AtomIterator, which builds the per-chain sequence: residues whose name
    has no one-letter code are left out, and where residues are missing from
    the structure, as indicated by residue numbering, the sequence is filled
    in with 'X' characters to match the size of the missing region.

    AtomIterator places this chain specific information in the annotations:

    record.annotations["molecule_type"] = "protein"
    record.annotations["model"] = Model ID
    record.annotations["chain"] = Chain ID
    record.annotations["start"] = Number of the first residue kept
    record.annotations["end"] = Number of the last residue kept

    The parsed PDB header dictionary (which is the same for all records) is
    then merged into each record's annotations. If the header holds no ID
    code, a BiopythonParserWarning is issued and "????" is used instead.

    This gets called internally via Bio.SeqIO for the atom based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest
    # Imported here rather than at module level, to avoid/delay the NumPy
    # dependency in SeqIO
    from Bio.PDB import PDBParser

    parsed = PDBParser().get_structure(None, source)
    header = parsed.header
    pdb_id = header["idcode"]
    if not pdb_id:
        warnings.warn(
            "'HEADER' line not found; can't determine PDB ID.", BiopythonParserWarning
        )
        pdb_id = "????"

    for rec in AtomIterator(pdb_id, parsed):
        # The PDB header was loaded as a dictionary, so reuse all of it
        rec.annotations.update(header)
        # ENH - add letter annotations -- per-residue info, e.g. numbers
        yield rec
# mmCIF item names read by CifSeqresIterator. That function forces each of
# these items to be a list (empty when the item is absent from the file).

# Per-residue sequence items: one entry per residue of each chain.
PDBX_POLY_SEQ_SCHEME_FIELDS = (
    "_pdbx_poly_seq_scheme.asym_id",  # Chain ID
    "_pdbx_poly_seq_scheme.mon_id",  # Residue type
)

# Database reference items, keyed by "_struct_ref.id".
STRUCT_REF_FIELDS = (
    "_struct_ref.id",  # ID of this reference
    "_struct_ref.db_name",  # Name of the database
    "_struct_ref.db_code",  # Code for this entity
    "_struct_ref.pdbx_db_accession",  # DB accession ID of ref
)

# Items linking a chain of this structure to one of the references above.
STRUCT_REF_SEQ_FIELDS = (
    "_struct_ref_seq.ref_id",  # Pointer to _struct_ref
    "_struct_ref_seq.pdbx_PDB_id_code",  # PDB ID of this structure
    "_struct_ref_seq.pdbx_strand_id",  # Chain ID of the reference
)
def CifSeqresIterator(source):
    """Return SeqRecord objects for each chain in an mmCIF file.

    Argument source is a file-like object or a path to a file.

    The sequences are derived from the _pdbx_poly_seq_scheme entries in the
    mmCIF file, not the atoms of the 3D structure.

    Specifically, these mmCIF categories are read: _pdbx_poly_seq_scheme,
    _struct_ref and _struct_ref_seq. The _pdbx_poly_seq_scheme items supply
    the residues of each chain; each _struct_ref_seq entry ties a chain to
    the _struct_ref entry holding its database cross-reference.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any mmCIF
    _struct_ref_seq entries are recorded in the database cross-references list.
    """
    # Imported here rather than at module level, to avoid/delay the NumPy
    # dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    records = MMCIF2Dict(source)

    # Make sure every item we read is a list (See #1533): a missing item
    # becomes an empty list and a bare value becomes a one-element list.
    wanted = PDBX_POLY_SEQ_SCHEME_FIELDS + STRUCT_REF_SEQ_FIELDS + STRUCT_REF_FIELDS
    for key in wanted:
        value = records.get(key, [])
        records[key] = value if isinstance(value, list) else [value]

    # Collect the one-letter sequence of every chain.
    chains = collections.defaultdict(list)
    for chain_key, monomer in zip(
        records["_pdbx_poly_seq_scheme.asym_id"],
        records["_pdbx_poly_seq_scheme.mon_id"],
    ):
        chains[chain_key].append(_res2aacode(monomer))

    # Index the _struct_ref entries by their id field.
    # The key names here mirror those used for PDB files.
    struct_refs = {
        ref_id: {"database": db_name, "db_id_code": db_code, "db_acc": db_acc}
        for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"],
        )
    }

    # For every _struct_ref_seq entry, resolve the _struct_ref it points at
    # and file the combined entry under the chain it describes.
    metadata = collections.defaultdict(list)
    for ref_id, pdb_id, strand in zip(
        records["_struct_ref_seq.ref_id"],
        records["_struct_ref_seq.pdbx_PDB_id_code"],
        records["_struct_ref_seq.pdbx_strand_id"],
    ):
        entry = {"pdb_id": pdb_id, **struct_refs[ref_id]}
        metadata[strand].append(entry)

    for chn_id in sorted(chains):
        rec = SeqRecord(Seq("".join(chains[chn_id])))
        # TODO: Test PDB files with DNA and RNA too:
        rec.annotations = {"chain": chn_id, "molecule_type": "protein"}
        if chn_id not in metadata:
            rec.id = chn_id
            yield rec
            continue
        # The first reference of the chain names the record; every reference
        # contributes two cross-references.
        refs = metadata[chn_id]
        first = refs[0]
        label = f"{first['pdb_id']}:{chn_id}"
        rec.id = label
        rec.name = label
        rec.description = f"{first['database']}:{first['db_acc']} {first['db_id_code']}"
        for ref in refs:
            rec.dbxrefs.append(f"{ref['database']}:{ref['db_acc']}")
            rec.dbxrefs.append(f"{ref['database']}:{ref['db_id_code']}")
        yield rec
def CifAtomIterator(source):
    """Return SeqRecord objects for each chain in an mmCIF file.

    Argument source is a file-like object or a path to a file.

    The sequences are derived from the 3D structure (_atom_site.* fields)
    in the mmCIF file.

    The structure is loaded with Bio.PDB.MMCIFParser.MMCIFParser and handed
    to AtomIterator, which builds the per-chain sequence: residues whose name
    has no one-letter code are left out, and where residues are missing from
    the structure, as indicated by residue numbering, the sequence is filled
    in with 'X' characters to match the size of the missing region.

    AtomIterator places this chain specific information in the annotations:

    record.annotations["molecule_type"] = "protein"
    record.annotations["model"] = Model ID
    record.annotations["chain"] = Chain ID
    record.annotations["start"] = Number of the first residue kept
    record.annotations["end"] = Number of the last residue kept

    If the parsed header holds no ID code, a BiopythonParserWarning is issued
    and "????" is used in the record ids instead.

    This gets called internally via Bio.SeqIO for the atom based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest
    # Imported here rather than at module level, to avoid/delay the NumPy
    # dependency in SeqIO
    from Bio.PDB.MMCIFParser import MMCIFParser

    parsed = MMCIFParser().get_structure(None, source)
    identifier = parsed.header["idcode"]
    if not identifier:
        warnings.warn("Could not determine the PDB ID.", BiopythonParserWarning)
        identifier = "????"
    yield from AtomIterator(identifier, parsed)
if __name__ == "__main__":
    # When executed as a script, hand over to Biopython's doctest helper
    # (presumably it runs the examples embedded in the docstrings above).
    from Bio._utils import run_doctest

    run_doctest(verbose=0)