Spaces:
No application file
No application file
| # Copyright 2008-2015 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format. | |
| This module is for reading and writing PIR or NBRF format files as | |
| SeqRecord objects. | |
| You are expected to use this module via the Bio.SeqIO functions, or if | |
| the file contains a sequence alignment, optionally via Bio.AlignIO instead. | |
| This format was introduced for the Protein Information Resource (PIR), a | |
| project of the National Biomedical Research Foundation (NBRF). The PIR | |
| database itself is now part of UniProt. | |
| The file format is described online at: | |
| http://www.ebi.ac.uk/help/pir_frame.html | |
| http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) | |
| An example file in this format would be:: | |
| >P1;CRAB_ANAPL | |
| ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
| MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR | |
| SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH | |
| GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ | |
| SDVPERSIPI TREEKPAIAG AQRK* | |
| >P1;CRAB_BOVIN | |
| ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
| MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR | |
| PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV | |
| HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK | |
| QASGPERTIP ITREEKPAVT AAPKK* | |
| Or, an example of a multiple sequence alignment:: | |
| >P1;S27231 | |
| rhodopsin - northern leopard frog | |
| MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY | |
| VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG | |
| GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP | |
| EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES | |
| ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI | |
| YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* | |
| >P1;I51200 | |
| rhodopsin - African clawed frog | |
| MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF | |
| VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG | |
| GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP | |
| EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES | |
| LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI | |
| YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* | |
| >P1;JN0120 | |
| rhodopsin - Japanese lamprey | |
| MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF | |
| VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG | |
| GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP | |
| EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES | |
| ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL | |
| YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* | |
| As with the FASTA format, each record starts with a line beginning with ">" | |
| character. There is then a two letter sequence type (P1, F1, DL, DC, RL, | |
| RC, or XX), a semi colon, and the identification code. The second line is | |
| a free text description. The remaining lines contain the sequence itself, | |
| terminating in an asterisk. Space separated blocks of ten letters as shown | |
| above are typical. | |
| Sequence codes and their meanings: | |
| - P1 - Protein (complete) | |
| - F1 - Protein (fragment) | |
| - D1 - DNA (e.g. EMBOSS seqret output) | |
| - DL - DNA (linear) | |
| - DC - DNA (circular) | |
| - RL - RNA (linear) | |
| - RC - RNA (circular) | |
| - N3 - tRNA | |
| - N1 - Other functional RNA | |
| - XX - Unknown | |
| """ | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import _get_seq_string | |
| from .Interfaces import SequenceIterator | |
| from .Interfaces import SequenceWriter | |
# Maps each two-letter PIR sequence-type code to the value stored in a parsed
# record's "molecule_type" annotation.  A value of None means the annotation
# is left unset.  The keys double as the set of codes accepted by the parser
# and the writer.
_pir_mol_type = {
    "P1": "protein",  # protein (complete)
    "F1": "protein",  # protein (fragment)
    "D1": "DNA",  # DNA (e.g. EMBOSS seqret output)
    "DL": "DNA",  # DNA (linear)
    "DC": "DNA",  # DNA (circular)
    "RL": "RNA",  # RNA (linear)
    "RC": "RNA",  # RNA (circular)
    "N3": "RNA",  # tRNA
    "N1": "RNA",  # other functional RNA (listed in the module docstring)
    "XX": None,  # unknown
}
class PirIterator(SequenceIterator):
    """Parser for PIR files.

    Each record consists of a ``>XX;identifier`` header line, one free text
    description line, and then the sequence lines, the last of which ends
    with a ``*`` terminator.
    """

    def __init__(self, source):
        """Iterate over a PIR file and yield SeqRecord objects.

        source - file-like object or a path to a file.

        Examples
        --------
        >>> with open("NBRF/DMB_prot.pir") as handle:
        ...    for record in PirIterator(handle):
        ...        print("%s length %i" % (record.id, len(record)))
        HLA:HLA00489 length 263
        HLA:HLA00490 length 94
        HLA:HLA00491 length 94
        HLA:HLA00492 length 80
        HLA:HLA00493 length 175
        HLA:HLA01083 length 188

        """
        super().__init__(source, mode="t", fmt="Pir")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Iterate over the records in the PIR file.

        Yields one SeqRecord per record.  The two-letter sequence code is
        stored in ``record.annotations["PIR-type"]`` and, when the code maps
        to a molecule type, that is stored in
        ``record.annotations["molecule_type"]``.

        Raises ValueError if a header line does not start with a known
        ``>XX;`` prefix, or if a record's sequence does not end with the
        ``*`` terminator (including a record with no sequence at all).
        """
        # Skip any text before the first record (e.g. blank lines, comments)
        for line in handle:
            if line[0] == ">":
                break
        else:
            return  # Premature end of file, or just empty?

        while True:
            # Here ``line`` always holds the header line of the next record.
            pir_type = line[1:3]
            # Slice (not index) so that a truncated header such as ">P1" at
            # the end of the file gives the ValueError below, not IndexError.
            if pir_type not in _pir_mol_type or line[3:4] != ";":
                raise ValueError(
                    "Records should start with '>XX;' where XX is a valid sequence type"
                )
            identifier = line[4:].strip()
            description = handle.readline().strip()

            lines = []
            for line in handle:
                if line[0] == ">":
                    break
                # Remove trailing whitespace, and any internal spaces
                lines.append(line.rstrip().replace(" ", ""))
            else:
                # Ran out of input: this is the last record in the file.
                line = None
            seq = "".join(lines)
            # endswith() also copes with an empty sequence, where seq[-1]
            # would raise IndexError instead of the intended ValueError.
            if not seq.endswith("*"):
                # Note the * terminator is present on nucleotide sequences too,
                # it is not a stop codon!
                raise ValueError(
                    "Sequences in PIR files should include a * terminator!"
                )

            # Return the record and then continue...
            record = SeqRecord(
                Seq(seq[:-1]), id=identifier, name=identifier, description=description
            )
            record.annotations["PIR-type"] = pir_type
            if _pir_mol_type[pir_type]:
                record.annotations["molecule_type"] = _pir_mol_type[pir_type]
            yield record

            if line is None:
                return  # StopIteration
class PirWriter(SequenceWriter):
    """Class to write PIR format files."""

    def __init__(self, handle, wrap=60, record2title=None, code=None):
        """Create a PIR writer.

        Arguments:
         - handle - Handle to an output file, e.g. as returned
           by open(filename, "w")
         - wrap - Optional line length used to wrap sequence lines.
           Defaults to wrapping the sequence at 60 characters
           Use zero (or None) for no wrapping, giving a single
           long line for the sequence.
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           the record.id is used.
         - code - Optional two-letter sequence code; it must be one of
           the keys of _pir_mol_type (e.g. P1, F1, D1, DL, DC, RL, RC,
           N3 or XX).  By default None is used, which means auto
           detection based on the molecule type in the record annotation.

        Raises ValueError if wrap is negative.

        You can either use::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_file(myRecords)
            handle.close()

        Or, follow the sequential file writer system, for example::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_header() # does nothing for PIR files
            ...
            Multiple writer.write_record() and/or writer.write_records() calls
            ...
            writer.write_footer() # does nothing for PIR files
            handle.close()

        """
        super().__init__(handle)
        # self.wrap stays None (no wrapping) for wrap=0 or wrap=None.
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError("wrap should be None, 0, or a positive integer")
            self.wrap = wrap
        self.record2title = record2title
        self.code = code

    def write_record(self, record):
        """Write a single PIR record to the file.

        The output is a ``>code;title`` header line, a description line and
        then the sequence (wrapped at self.wrap characters if set) followed
        by the ``*`` terminator.

        Raises TypeError if the sequence code is not a key of _pir_mol_type.
        """
        if self.record2title:
            title = self.clean(self.record2title(record))
        else:
            title = self.clean(record.id)

        if record.name and record.description:
            description = self.clean(record.name + " - " + record.description)
        elif record.name and not record.description:
            description = self.clean(record.name)
        else:
            description = self.clean(record.description)

        if self.code:
            code = self.code
        else:
            # Auto-detect the code from the record's molecule type annotation.
            molecule_type = record.annotations.get("molecule_type")
            if molecule_type is None:
                code = "XX"
            elif "DNA" in molecule_type:
                code = "D1"
            elif "RNA" in molecule_type:
                code = "RL"
            elif "protein" in molecule_type:
                code = "P1"
            else:
                code = "XX"

        if code not in _pir_mol_type:
            # Join the keys explicitly: concatenating str with dict.keys()
            # fails with an unrelated TypeError message.
            raise TypeError(
                "Sequence code must be one of " + ", ".join(_pir_mol_type) + "."
            )
        # A line break in either field would corrupt the two-line record
        # header, so check both characters on both fields.
        assert "\n" not in title
        assert "\r" not in title
        assert "\n" not in description
        assert "\r" not in description

        self.handle.write(f">{code};{title}\n{description}\n")

        data = _get_seq_string(record)  # Catches sequence being None
        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            # The terminator goes directly after the last sequence chunk (or
            # on a line of its own when the sequence is empty).
            chunks = [
                data[i : i + self.wrap] for i in range(0, len(data), self.wrap)
            ]
            self.handle.write("\n".join(chunks) + "*\n")
        else:
            self.handle.write(data + "*\n")
if __name__ == "__main__":
    # Only when this file is executed directly as a script: call Biopython's
    # run_doctest helper (presumably runs the doctest examples embedded in
    # this module's docstrings - confirm against Bio._utils).
    from Bio._utils import run_doctest

    run_doctest(verbose=0)