Spaces:
No application file
No application file
# Copyright 2006-2017,2020 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
#
# This module is for reading and writing FASTA format files as SeqRecord
# objects.  The code is partly inspired by earlier Biopython modules,
# Bio.Fasta.* and the now removed module Bio.SeqIO.FASTA
"""Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.

You are expected to use this module via the Bio.SeqIO functions.
"""
| import warnings | |
| from Bio import BiopythonDeprecationWarning | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import _clean | |
| from .Interfaces import _get_seq_string | |
| from .Interfaces import SequenceIterator | |
| from .Interfaces import SequenceWriter | |
def SimpleFastaParser(handle):
    """Iterate over Fasta records as string tuples.

    Arguments:
     - handle - input stream opened in text mode, or any other iterable
       yielding the lines of a FASTA file as strings

    For each record a tuple of two strings is returned, the FASTA title
    line (without the leading '>' character, and with trailing whitespace
    removed), and the sequence (with line breaks, trailing whitespace,
    spaces and carriage returns removed). The title line is not divided
    up into an identifier (the first word) and comment or description.

    Any text before the first '>' title line is skipped. If there is no
    title line at all (e.g. an empty file) nothing is yielded.

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for values in SimpleFastaParser(handle):
    ...         print(values)
    ...
    ('alpha', 'ACGTA')
    ('beta', 'CGTC')
    ('gamma', 'CCGCC')
    ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
    ('delta', 'CGCGC')

    """
    # Use one shared iterator for both loops below. For a file object
    # iter() returns the object itself, so nothing changes; for a list of
    # lines it stops the second loop restarting from the first line.
    line_iter = iter(handle)

    # Skip any text before the first record (e.g. blank lines, comments).
    # str.startswith is used rather than line[0] so that an empty string
    # is simply skipped instead of raising IndexError.
    for line in line_iter:
        if line.startswith(">"):
            title = line[1:].rstrip()
            break
    else:
        # no break encountered - probably an empty file
        return

    # Main logic
    # Note, remove trailing whitespace, and any internal spaces
    # (and any embedded \r which are possible in mangled files
    # when not opened in universal read lines mode)
    lines = []
    for line in line_iter:
        if line.startswith(">"):
            # A new title line closes the record collected so far.
            yield title, "".join(lines).replace(" ", "").replace("\r", "")
            lines = []
            title = line[1:].rstrip()
            continue
        lines.append(line.rstrip())

    # The final record has no following title line to trigger its yield.
    yield title, "".join(lines).replace(" ", "").replace("\r", "")
def FastaTwoLineParser(handle):
    """Iterate over no-wrapping Fasta records as string tuples.

    Arguments:
     - handle - input stream opened in text mode

    Functionally the same as SimpleFastaParser but with a strict
    interpretation of the FASTA format as exactly two lines per
    record, the greater-than-sign identifier with description,
    and the sequence with no line wrapping.

    Any line wrapping will raise an exception, as will excess blank
    lines (other than the special case of a zero-length sequence
    as the second line of a record).

    Each record is yielded as a tuple of two strings: the title line
    without the leading '>' and trailing whitespace, and the sequence
    line with surrounding whitespace stripped.

    Raises ValueError if a title line does not start with '>', if a
    sequence line does start with '>', or if the input ends on a title
    line with no sequence line after it.

    Examples
    --------
    This file uses two lines per FASTA record:

    >>> with open("Fasta/aster_no_wrap.pro") as handle:
    ...     for title, seq in FastaTwoLineParser(handle):
    ...         print("%s = %s..." % (title, seq[:3]))
    ...
    gi|3298468|dbj|BAA31520.1| SAMIPF = GGH...

    This equivalent file uses line wrapping:

    >>> with open("Fasta/aster.pro") as handle:
    ...     for title, seq in FastaTwoLineParser(handle):
    ...         print("%s = %s..." % (title, seq[:3]))
    ...
    Traceback (most recent call last):
    ...
    ValueError: Expected FASTA record starting with '>' character. Perhaps this file is using FASTA line wrapping? Got: 'MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI'

    """
    idx = -1  # for empty file
    for idx, line in enumerate(handle):
        if idx % 2 == 0:  # title line
            # startswith (not line[0]) so that an empty string gives the
            # descriptive ValueError below rather than an IndexError.
            if not line.startswith(">"):
                raise ValueError(
                    "Expected FASTA record starting with '>' character. "
                    "Perhaps this file is using FASTA line wrapping? "
                    f"Got: '{line}'"
                )
            title = line[1:].rstrip()
        else:  # sequence line
            if line.startswith(">"):
                raise ValueError(
                    "Two '>' FASTA lines in a row. Missing sequence line "
                    "if this is strict two-line-per-record FASTA format. "
                    f"Have '>{title}' and '{line}'"
                )
            yield title, line.strip()

    if idx == -1:
        pass  # empty file
    elif idx % 2 == 0:  # on a title line
        raise ValueError(
            "Missing sequence line at end of file if this is strict "
            f"two-line-per-record FASTA format. Have title line '{line}'"
        )
    else:
        # Sanity check only: a '>' sequence line was already rejected above.
        assert not line.startswith(">"), "line[0] == '>' ; this should be impossible!"
class FastaIterator(SequenceIterator):
    """Parser for Fasta files."""

    def __init__(self, source, alphabet=None, title2ids=None):
        """Iterate over Fasta records as SeqRecord objects.

        Arguments:
         - source - input stream opened in text mode, or a path to a file
         - alphabet - optional alphabet, not used. Leave as None.
         - title2ids (DEPRECATED) - A function that, when given the title of
           the FASTA file (without the beginning >), will return the id, name
           and description (in that order) for the record as a tuple of
           strings. If this is not given, then the entire title line will be
           used as the description, and the first word as the id and name.

        Raises ValueError if alphabet is anything other than None, and
        issues a BiopythonDeprecationWarning if title2ids is given.

        By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
        with no custom handling of the title lines:

        >>> with open("Fasta/dups.fasta") as handle:
        ...     for record in FastaIterator(handle):
        ...         print(record.id)
        ...
        alpha
        beta
        gamma
        alpha
        delta

        However, you can supply a title2ids function to alter this (DEPRECATED):

        >>> def take_upper(title):
        ...     return title.split(None, 1)[0].upper(), "", title
        >>> with open("Fasta/dups.fasta") as handle:
        ...     for record in FastaIterator(handle, title2ids=take_upper):
        ...         print(record.id)
        ...
        ALPHA
        BETA
        GAMMA
        ALPHA
        DELTA

        Instead of title2ids, please use a generator function to modify the
        records:

        >>> def modify_records(records):
        ...     for record in records:
        ...         record.id = record.id.upper()
        ...         yield record
        ...
        >>> with open('Fasta/dups.fasta') as handle:
        ...     for record in modify_records(FastaIterator(handle)):
        ...         print(record.id)
        ...
        ALPHA
        BETA
        GAMMA
        ALPHA
        DELTA

        """
        if alphabet is not None:
            raise ValueError("The alphabet argument is no longer supported")
        if title2ids is not None:
            # The example inside the message is indented so that it is
            # valid Python when copied out of the warning text.
            warnings.warn(
                "The title2ids argument is deprecated. Instead, please use a "
                "generator function to modify records returned by the parser. "
                "For example, to change the record IDs to uppercase, and "
                "delete the description attribute, use\n"
                "\n"
                ">>> def modify_records(records):\n"
                "...     for record in records:\n"
                "...         record.id = record.id.upper()\n"
                "...         del record.description\n"
                "...         yield record\n"
                "...\n"
                ">>> with open('Fasta/dups.fasta') as handle:\n"
                "...     for record in modify_records(FastaIterator(handle)):\n"
                "...         print(record)\n"
                "\n",
                BiopythonDeprecationWarning,
            )
        self.title2ids = title2ids
        super().__init__(source, mode="t", fmt="Fasta")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects.

        Each (title, sequence) pair from SimpleFastaParser becomes one
        SeqRecord. With a title2ids callback its return value supplies the
        id, name and description; otherwise the first word of the title is
        used as both id and name, and the whole title as the description.
        """
        title2ids = self.title2ids
        # The branch is taken once, outside the loops, rather than per record.
        if title2ids:
            for title, sequence in SimpleFastaParser(handle):
                record_id, name, descr = title2ids(title)
                yield SeqRecord(
                    Seq(sequence), id=record_id, name=name, description=descr
                )
        else:
            for title, sequence in SimpleFastaParser(handle):
                try:
                    first_word = title.split(None, 1)[0]
                except IndexError:
                    # split() gave no words, so the title should be empty.
                    assert not title, repr(title)
                    # Should we use SeqRecord default for no ID?
                    first_word = ""
                yield SeqRecord(
                    Seq(sequence), id=first_word, name=first_word, description=title
                )
class FastaTwoLineIterator(SequenceIterator):
    """Parser for Fasta files with exactly two lines per record."""

    def __init__(self, source):
        """Iterate over two-line Fasta records (as SeqRecord objects).

        Arguments:
         - source - input stream opened in text mode, or a path to a file

        The FASTA format is interpreted strictly here: every record must
        occupy exactly two lines, i.e. the sequence is never line wrapped.

        Titles are split into ID/name/description in the same default way
        as the relaxed FASTA parser; no customisation is offered.
        """
        super().__init__(source, mode="t", fmt="FASTA")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects."""
        for header, residues in FastaTwoLineParser(handle):
            words = header.split(None, 1)
            if words:
                leading = words[0]
            else:
                # No words at all, so the title itself should be empty.
                assert not header, repr(header)
                # Should we use SeqRecord default for no ID?
                leading = ""
            yield SeqRecord(
                Seq(residues), id=leading, name=leading, description=header
            )
class FastaWriter(SequenceWriter):
    """Class to write Fasta format files (OBSOLETE).

    Please use the ``as_fasta`` function instead, or the top level
    ``Bio.SeqIO.write()`` function instead using ``format="fasta"``.
    """

    def __init__(self, target, wrap=60, record2title=None):
        """Create a Fasta writer (OBSOLETE).

        Arguments:
         - target - Output stream opened in text mode, or a path to a file.
         - wrap -   Optional line length used to wrap sequence lines.
           Defaults to wrapping the sequence at 60 characters
           Use zero (or None) for no wrapping, giving a single
           long line for the sequence.
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           a combination of the record.id and record.description
           is used.  If the record.description starts with the
           record.id, then just the record.description is used.

        Raises ValueError if wrap is negative.

        You can either use::

            handle = open(filename, "w")
            writer = FastaWriter(handle)
            writer.write_file(myRecords)
            handle.close()

        Or, follow the sequential file writer system, for example::

            handle = open(filename, "w")
            writer = FastaWriter(handle)
            writer.write_header() # does nothing for Fasta files
            ...
            Multiple writer.write_record() and/or writer.write_records() calls
            ...
            writer.write_footer() # does nothing for Fasta files
            handle.close()

        """
        super().__init__(target)
        # Zero and None are both falsy and mean "do not wrap"; only a
        # negative value is rejected.
        if wrap:
            if wrap < 1:
                raise ValueError(
                    f"wrap must be a positive line length, "
                    f"or zero/None for no wrapping, got {wrap!r}"
                )
        self.wrap = wrap
        self.record2title = record2title

    def write_record(self, record):
        """Write a single Fasta record to the file.

        The title line is taken from record2title if one was supplied,
        otherwise it is built from record.id and record.description. The
        sequence follows, wrapped at self.wrap characters per line or on
        one line when wrapping is disabled.
        """
        if self.record2title:
            title = self.clean(self.record2title(record))
        else:
            record_id = self.clean(record.id)
            description = self.clean(record.description)
            if description and description.split(None, 1)[0] == record_id:
                # The description includes the id at the start
                title = description
            elif description:
                title = f"{record_id} {description}"
            else:
                title = record_id

        # Sanity checks: a line break in the title would corrupt the file.
        assert "\n" not in title
        assert "\r" not in title
        self.handle.write(f">{title}\n")

        data = _get_seq_string(record)  # Catches sequence being None

        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            # An empty sequence produces no sequence line at all here.
            for i in range(0, len(data), self.wrap):
                self.handle.write(data[i : i + self.wrap] + "\n")
        else:
            # Unwrapped output always has a sequence line, even if blank.
            self.handle.write(data + "\n")
class FastaTwoLineWriter(FastaWriter):
    """Class to write 2-line per record Fasta format files (OBSOLETE).

    The sequence is written on a single line with no wrapping, so an
    empty sequence still produces a (blank) sequence line.

    Please use the ``as_fasta_2line`` function instead, or the top level
    ``Bio.SeqIO.write()`` function instead using ``format="fasta-2line"``.
    """

    def __init__(self, handle, record2title=None):
        """Create a 2-line per record Fasta writer (OBSOLETE).

        Arguments:
         - handle - Handle to an output file, e.g. as returned
           by open(filename, "w")
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           a combination of the record.id and record.description
           is used.  If the record.description starts with the
           record.id, then just the record.description is used.

        You can either use::

            handle = open(filename, "w")
            writer = FastaTwoLineWriter(handle)
            writer.write_file(myRecords)
            handle.close()

        Or, follow the sequential file writer system, for example::

            handle = open(filename, "w")
            writer = FastaTwoLineWriter(handle)
            writer.write_header() # does nothing for Fasta files
            ...
            Multiple writer.write_record() and/or writer.write_records() calls
            ...
            writer.write_footer() # does nothing for Fasta files
            handle.close()

        """
        # Passing None for wrap disables line wrapping in the parent writer.
        super().__init__(handle, None, record2title)
def as_fasta(record):
    """Turn a SeqRecord into a FASTA formatted string.

    The result is a '>' title line followed by the sequence wrapped at
    60 characters per line. The title is the record's description when
    that already begins with the record's id, otherwise the id followed
    by the description (or the id alone if there is no description).

    This is used internally by the SeqRecord's .format("fasta")
    method and by the SeqIO.write(..., ..., "fasta") function.
    """
    identifier = _clean(record.id)
    descr = _clean(record.description)
    if not descr:
        header = identifier
    elif descr.split(None, 1)[0] == identifier:
        # The description includes the id at the start
        header = descr
    else:
        header = f"{identifier} {descr}"

    assert "\n" not in header
    assert "\r" not in header

    sequence = _get_seq_string(record)  # Catches sequence being None
    assert "\n" not in sequence
    assert "\r" not in sequence

    wrapped = [
        sequence[start : start + 60] + "\n" for start in range(0, len(sequence), 60)
    ]
    return f">{header}\n" + "".join(wrapped)
def as_fasta_2line(record):
    """Turn a SeqRecord into a two-line FASTA formatted string.

    The result is a '>' title line followed by the whole sequence on one
    unwrapped line. The title is built as for the wrapped FASTA output:
    the description if it already starts with the id, else the id
    followed by the description, else the id alone.

    This is used internally by the SeqRecord's .format("fasta-2line")
    method and by the SeqIO.write(..., ..., "fasta-2line") function.
    """
    identifier = _clean(record.id)
    descr = _clean(record.description)
    if not descr:
        header = identifier
    elif descr.split(None, 1)[0] == identifier:
        # The description includes the id at the start
        header = descr
    else:
        header = f"{identifier} {descr}"

    for forbidden in ("\n", "\r"):
        assert forbidden not in header

    sequence = _get_seq_string(record)  # Catches sequence being None
    for forbidden in ("\n", "\r"):
        assert forbidden not in sequence

    return f">{header}\n{sequence}\n"
if __name__ == "__main__":
    # Run as a script: execute the doctests embedded in this module.
    from Bio import _utils

    _utils.run_doctest(verbose=0)