Spaces:
No application file
No application file
| # Copyright 2004 by Cymon J. Cox and Frank Kauff. All rights reserved. | |
| # Copyright 2008 by Michiel de Hoon. All rights reserved. | |
| # Revisions copyright 2009 by Cymon J. Cox. All rights reserved. | |
| # Revisions copyright 2009 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Parser for PHD files output by PHRED and used by PHRAP and CONSED. | |
| This module can be used directly, which will return Record objects | |
| containing all the original data in the file. | |
| Alternatively, using Bio.SeqIO with the "phd" format will call this module | |
| internally. This will give SeqRecord objects for each contig sequence. | |
| """ | |
| from Bio import Seq | |
# Comment keywords for which every new Record gets an entry (keyed by the
# lower-cased keyword, initialised to None) in its ``comments`` dictionary.
CKEYWORDS = [
    "CHROMAT_FILE",
    "ABI_THUMBPRINT",
    "PHRED_VERSION",
    "CALL_METHOD",
    "QUALITY_LEVELS",
    "TIME",
    "TRACE_ARRAY_MIN_INDEX",
    "TRACE_ARRAY_MAX_INDEX",
    "TRIM",
    "TRACE_PEAK_AREA_RATIO",
    "CHEM",
    "DYE",
]
class Record:
    """Hold information from a PHD file."""

    def __init__(self):
        """Initialize the class."""
        # Text that follows the BEGIN_SEQUENCE marker.
        self.file_name = ""
        # One entry per known comment keyword (lower case), None until parsed.
        self.comments = {keyword.lower(): None for keyword in CKEYWORDS}
        # One tuple of whitespace-separated fields per line of the DNA section.
        self.sites = []
        # Full sequence, and the part selected by the "trim" comment (if any).
        self.seq = ""
        self.seq_trimmed = ""
def read(source):
    """Read one PHD record from the file and return it as a Record object.

    Argument source is a file-like object opened in text mode, or a path
    to a file.

    This function reads PHD file data line by line from the source, and
    returns a single Record object. If the source contains no
    BEGIN_SEQUENCE line at all, None is returned.

    Blank lines following the record are ignored. A ValueError is raised
    if any other content follows the first record, i.e. if more than one
    record is found in the file.
    """
    handle = _open(source)
    try:
        record = _read(handle)
        # Only non-blank content after the first record counts as "more
        # data"; a file that merely ends in extra newlines must not be
        # misreported as containing a second record.
        if any(line.strip() for line in handle):
            raise ValueError("More than one PHD record found")
        return record
    finally:
        # Close the handle only if it was opened here from a path; a handle
        # supplied by the caller stays open.
        if handle is not source:
            handle.close()
def parse(source):
    """Iterate over a file yielding multiple PHD records.

    Argument source is a file-like object opened in text mode, or a path
    to a file.

    Records are read one at a time, line by line, and yielded as Record
    objects until no further record can be found.

    Typical usage::

        records = parse(handle)
        for record in records:
            # do something with the record object

    """
    handle = _open(source)
    try:
        record = _read(handle)
        while record:
            yield record
            record = _read(handle)
    finally:
        # A handle passed in by the caller is left open; one opened here
        # from a path is closed.
        if handle is not source:
            handle.close()
| # Everything below is considered private | |
def _open(source):
    """Return a text-mode handle for source (a path or an open handle).

    If open() accepts source it is treated as a path and the newly opened
    file is returned. If open() raises TypeError, source is assumed to be
    a handle already and is returned unchanged, provided a zero-length
    read yields an empty str; otherwise a ValueError is raised.
    """
    try:
        return open(source)
    except TypeError:
        # Not something open() understands, so treat it as a handle.
        # Reading zero characters gives "" in text mode (b"" in binary).
        in_text_mode = source.read(0) == ""
        if not in_text_mode:
            raise ValueError("PHD files must be opened in text mode.") from None
        return source
def _read(handle):
    """Parse the next PHD record from handle, or return None if there is none.

    Lines are consumed from handle up to and including the END_SEQUENCE
    line of the record that is returned. A ValueError is raised if one of
    the BEGIN_COMMENT, END_COMMENT, BEGIN_DNA or END_SEQUENCE markers
    cannot be found, or if a line in the DNA section does not have two or
    three whitespace-separated fields.
    """
    # Skip ahead to the record header; running out of lines here simply
    # means there is no (further) record.
    for header in handle:
        if header.startswith("BEGIN_SEQUENCE"):
            break
    else:
        return None  # No record found

    record = Record()
    # The file name is whatever follows the 15 characters "BEGIN_SEQUENCE ".
    record.file_name = header[15:].rstrip()

    # any() stops pulling lines at the first match, which leaves the handle
    # positioned on the line right after the marker.
    if not any(text.startswith("BEGIN_COMMENT") for text in handle):
        raise ValueError("Failed to find BEGIN_COMMENT line")

    text_keys = (
        "chromat_file",
        "phred_version",
        "call_method",
        "chem",
        "dye",
        "time",
        "basecaller_version",
        "trace_processor_version",
    )
    integer_keys = (
        "abi_thumbprint",
        "quality_levels",
        "trace_array_min_index",
        "trace_array_max_index",
    )
    for raw in handle:
        entry = raw.strip()
        if not entry:
            continue
        if entry == "END_COMMENT":
            break
        keyword, value = entry.split(":", 1)
        keyword = keyword.lower()
        value = value.strip()
        if keyword in text_keys:
            parsed = value
        elif keyword in integer_keys:
            parsed = int(value)
        elif keyword == "trace_peak_area_ratio":
            parsed = float(value)
        elif keyword == "trim":
            start, stop, probability = value.split()
            parsed = (int(start), int(stop), float(probability))
        else:
            # Keywords not listed above are not stored.
            continue
        record.comments[keyword] = parsed
    else:
        raise ValueError("Failed to find END_COMMENT line")

    if not any(text.startswith("BEGIN_DNA") for text in handle):
        raise ValueError("Failed to find BEGIN_DNA line")

    for raw in handle:
        if raw.startswith("END_DNA"):
            break
        # Line is: "site quality peak_location"
        # Peak location is optional according to
        # David Gordon (the Consed author)
        fields = raw.split()
        if len(fields) not in (2, 3):
            raise ValueError(
                "DNA line must contain a base and quality "
                "score, and optionally a peak location."
            )
        record.sites.append(tuple(fields))

    if not any(text.startswith("END_SEQUENCE") for text in handle):
        raise ValueError("Failed to find END_SEQUENCE line")

    # The sequence is the first field of every DNA line, in file order.
    bases = "".join(site[0] for site in record.sites)
    record.seq = Seq.Seq(bases)
    trim = record.comments["trim"]
    if trim is not None:
        # The first two trim values are used directly as slice bounds.
        first, last = trim[:2]
        record.seq_trimmed = record.seq[first:last]
    return record
# When executed as a script, run the doctests via Biopython's helper.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()