Spaces:
No application file
No application file
| # Copyright 2001 by Gavin E. Crooks. All rights reserved. | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """ASTRAL RAF (Rapid Access Format) Sequence Maps. | |
| The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES | |
| records (representing the sequence of the molecule used in an experiment) to | |
| the ATOM records (representing the atoms experimentally observed). | |
| This data is derived from the Protein Data Bank CIF files. Known errors in the | |
| CIF files are corrected manually, with the original PDB file serving as the | |
| final arbiter in case of discrepancies. | |
| Residues are referenced by residue ID. This consists of a the PDB residue | |
| sequence number (up to 4 digits) and an optional PDB insertion code (an | |
| ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" | |
| See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html | |
| Dictionary `protein_letters_3to1_extended` provides a mapping from the | |
| 3-letter amino acid codes found in PDB files to 1-letter codes. The 3-letter | |
| codes include chemically modified residues. | |
| """ | |
| from copy import copy | |
| from Bio.Data.PDBData import protein_letters_3to1_extended | |
| from Bio.SCOP.Residues import Residues | |
| def normalize_letters(one_letter_code): | |
| """Convert RAF one-letter amino acid codes into IUPAC standard codes. | |
| Letters are uppercased, and "." ("Unknown") is converted to "X". | |
| """ | |
| if one_letter_code == ".": | |
| return "X" | |
| else: | |
| return one_letter_code.upper() | |
| class SeqMapIndex(dict): | |
| """An RAF file index. | |
| The RAF file itself is about 50 MB. This index provides rapid, random | |
| access of RAF records without having to load the entire file into memory. | |
| The index key is a concatenation of the PDB ID and chain ID. e.g | |
| "2drcA", ``"155c_"``. RAF uses an underscore to indicate blank | |
| chain IDs. | |
| """ | |
| def __init__(self, filename): | |
| """Initialize the RAF file index. | |
| Arguments: | |
| - filename -- The file to index | |
| """ | |
| dict.__init__(self) | |
| self.filename = filename | |
| with open(self.filename) as f: | |
| position = 0 | |
| while True: | |
| line = f.readline() | |
| if not line: | |
| break | |
| key = line[0:5] | |
| if key is not None: | |
| self[key] = position | |
| position = f.tell() | |
| def __getitem__(self, key): | |
| """Return an item from the indexed file.""" | |
| position = dict.__getitem__(self, key) | |
| with open(self.filename) as f: | |
| f.seek(position) | |
| line = f.readline() | |
| record = SeqMap(line) | |
| return record | |
| def getSeqMap(self, residues): | |
| """Get the sequence map for a collection of residues. | |
| Arguments: | |
| - residues -- A Residues instance, or a string that can be | |
| converted into a Residues instance. | |
| """ | |
| if isinstance(residues, str): | |
| residues = Residues(residues) | |
| pdbid = residues.pdbid | |
| frags = residues.fragments | |
| if not frags: | |
| frags = (("_", "", ""),) # All residues of unnamed chain | |
| seqMap = None | |
| for frag in frags: | |
| chainid = frag[0] | |
| if chainid in ["", "-", " ", "_"]: | |
| chainid = "_" | |
| id = pdbid + chainid | |
| sm = self[id] | |
| # Cut out fragment of interest | |
| start = 0 | |
| end = len(sm.res) | |
| if frag[1]: | |
| start = int(sm.index(frag[1], chainid)) | |
| if frag[2]: | |
| end = int(sm.index(frag[2], chainid)) + 1 | |
| sm = sm[start:end] | |
| if seqMap is None: | |
| seqMap = sm | |
| else: | |
| seqMap += sm | |
| return seqMap | |
| class SeqMap: | |
| """An ASTRAL RAF (Rapid Access Format) Sequence Map. | |
| This is a list like object; You can find the location of particular residues | |
| with index(), slice this SeqMap into fragments, and glue fragments back | |
| together with extend(). | |
| Attributes: | |
| - pdbid -- The PDB 4 character ID | |
| - pdb_datestamp -- From the PDB file | |
| - version -- The RAF format version. e.g. 0.01 | |
| - flags -- RAF flags. (See release notes for more information.) | |
| - res -- A list of Res objects, one for each residue in this sequence map | |
| """ | |
| def __init__(self, line=None): | |
| """Initialize the class.""" | |
| self.pdbid = "" | |
| self.pdb_datestamp = "" | |
| self.version = "" | |
| self.flags = "" | |
| self.res = [] | |
| if line: | |
| self._process(line) | |
| def _process(self, line): | |
| """Parse a RAF record into a SeqMap object (PRIVATE).""" | |
| header_len = 38 | |
| line = line.rstrip() # no trailing whitespace | |
| if len(line) < header_len: | |
| raise ValueError("Incomplete header: " + line) | |
| self.pdbid = line[0:4] | |
| chainid = line[4:5] | |
| self.version = line[6:10] | |
| # Raf format versions 0.01 and 0.02 are identical for practical purposes | |
| if self.version != "0.01" and self.version != "0.02": | |
| raise ValueError("Incompatible RAF version: " + self.version) | |
| self.pdb_datestamp = line[14:20] | |
| self.flags = line[21:27] | |
| for i in range(header_len, len(line), 7): | |
| f = line[i : i + 7] | |
| if len(f) != 7: | |
| raise ValueError("Corrupt Field: (" + f + ")") | |
| r = Res() | |
| r.chainid = chainid | |
| r.resid = f[0:5].strip() | |
| r.atom = normalize_letters(f[5:6]) | |
| r.seqres = normalize_letters(f[6:7]) | |
| self.res.append(r) | |
| def index(self, resid, chainid="_"): | |
| """Return the index of the SeqMap for the given resid and chainid.""" | |
| for i in range(0, len(self.res)): | |
| if self.res[i].resid == resid and self.res[i].chainid == chainid: | |
| return i | |
| raise KeyError("No such residue " + chainid + resid) | |
| def __getitem__(self, index): | |
| """Extract a single Res object from the SeqMap.""" | |
| if not isinstance(index, slice): | |
| raise NotImplementedError | |
| s = copy(self) | |
| s.res = s.res[index] | |
| return s | |
| def append(self, res): | |
| """Append another Res object onto the list of residue mappings.""" | |
| self.res.append(res) | |
| def extend(self, other): | |
| """Append another SeqMap onto the end of self. | |
| Both SeqMaps must have the same PDB ID, PDB datestamp and | |
| RAF version. The RAF flags are erased if they are inconsistent. This | |
| may happen when fragments are taken from different chains. | |
| """ | |
| if not isinstance(other, SeqMap): | |
| raise TypeError("Can only extend a SeqMap with a SeqMap.") | |
| if self.pdbid != other.pdbid: | |
| raise TypeError("Cannot add fragments from different proteins") | |
| if self.version != other.version: | |
| raise TypeError("Incompatible rafs") | |
| if self.pdb_datestamp != other.pdb_datestamp: | |
| raise TypeError("Different pdb dates!") | |
| if self.flags != other.flags: | |
| self.flags = "" | |
| self.res += other.res | |
| def __iadd__(self, other): | |
| """In place addition of SeqMap objects.""" | |
| self.extend(other) | |
| return self | |
| def __add__(self, other): | |
| """Addition of SeqMap objects.""" | |
| s = copy(self) | |
| s.extend(other) | |
| return s | |
| def getAtoms(self, pdb_handle, out_handle): | |
| """Extract all relevant ATOM and HETATOM records from a PDB file. | |
| The PDB file is scanned for ATOM and HETATOM records. If the | |
| chain ID, residue ID (seqNum and iCode), and residue type match | |
| a residue in this sequence map, then the record is echoed to the | |
| output handle. | |
| This is typically used to find the coordinates of a domain, or other | |
| residue subset. | |
| Arguments: | |
| - pdb_handle -- A handle to the relevant PDB file. | |
| - out_handle -- All output is written to this file like object. | |
| """ | |
| # This code should be refactored when (if?) biopython gets a PDB parser | |
| # The set of residues that I have to find records for. | |
| resSet = {} | |
| for r in self.res: | |
| if r.atom == "X": # Unknown residue type | |
| continue | |
| chainid = r.chainid | |
| if chainid == "_": | |
| chainid = " " | |
| resid = r.resid | |
| resSet[(chainid, resid)] = r | |
| resFound = {} | |
| for line in pdb_handle: | |
| if line.startswith("ATOM ") or line.startswith("HETATM"): | |
| chainid = line[21:22] | |
| resid = line[22:27].strip() | |
| key = (chainid, resid) | |
| if key in resSet: | |
| res = resSet[key] | |
| atom_aa = res.atom | |
| resName = line[17:20] | |
| if resName in protein_letters_3to1_extended: | |
| if protein_letters_3to1_extended[resName] == atom_aa: | |
| out_handle.write(line) | |
| resFound[key] = res | |
| if len(resSet) != len(resFound): | |
| # for k in resFound: | |
| # del resSet[k] | |
| # print(resSet) | |
| raise RuntimeError( | |
| "Could not find at least one ATOM or HETATM" | |
| " record for each and every residue in this" | |
| " sequence map." | |
| ) | |
| class Res: | |
| """A single residue mapping from a RAF record. | |
| Attributes: | |
| - chainid -- A single character chain ID. | |
| - resid -- The residue ID. | |
| - atom -- amino acid one-letter code from ATOM records. | |
| - seqres -- amino acid one-letter code from SEQRES records. | |
| """ | |
| def __init__(self): | |
| """Initialize the class.""" | |
| self.chainid = "" | |
| self.resid = "" | |
| self.atom = "" | |
| self.seqres = "" | |
| def parse(handle): | |
| """Iterate over RAF file, giving a SeqMap object for each line. | |
| Arguments: | |
| - handle -- file-like object. | |
| """ | |
| for line in handle: | |
| yield SeqMap(line) | |