Spaces:
No application file
No application file
| # Copyright 2017-2019 Damien Goutte-Gattat. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the "xdna" file format. | |
| The Xdna binary format is generated by Christian Marck's DNA Strider program | |
| and also used by Serial Cloner. | |
| """ | |
| import warnings | |
| from re import match | |
| from struct import pack | |
| from struct import unpack | |
| from Bio import BiopythonWarning | |
| from Bio.Seq import Seq | |
| from Bio.SeqFeature import ExactPosition | |
| from Bio.SeqFeature import SimpleLocation | |
| from Bio.SeqFeature import SeqFeature | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import SequenceIterator | |
| from .Interfaces import SequenceWriter | |
| _seq_types = { | |
| 0: None, | |
| 1: "DNA", | |
| 2: "DNA", | |
| 3: "RNA", | |
| 4: "protein", | |
| } | |
| _seq_topologies = {0: "linear", 1: "circular"} | |
| def _read(handle, length): | |
| """Read the specified number of bytes from the given handle.""" | |
| data = handle.read(length) | |
| if len(data) < length: | |
| raise ValueError("Cannot read %d bytes from handle" % length) | |
| return data | |
def _read_pstring(handle):
    """Read a Pascal string from ``handle`` and return it as text.

    A Pascal string is a single length byte followed by that many bytes of
    payload. The payload is decoded as ASCII.
    """
    (size,) = unpack(">B", _read(handle, 1))
    (payload,) = unpack("%ds" % size, _read(handle, size))
    return payload.decode("ASCII")
def _read_pstring_as_integer(handle):
    """Read a Pascal string from ``handle`` and return it converted by ``int``."""
    text = _read_pstring(handle)
    return int(text)
def _read_overhang(handle):
    """Read an overhang specification.

    In a Xdna file an overhang is stored as:

    - a Pascal string holding the overhang length as text, whose sign
      gives the nature of the overhang:

      - zero: no overhang,
      - negative: 3' overhang,
      - positive: 5' overhang;

    - the overhang sequence itself (absent when the length is zero).

    Examples:

    - 0x01 0x30: no overhang ("0", as a P-string)
    - 0x01 0x32 0x41 0x41: 5' AA overhang (P-string "2", then "AA")
    - 0x02 0x2D 0x31 0x43: 3' C overhang (P-string "-1", then "C")

    Returns a tuple (length, sequence), or (None, None) when there is no
    overhang. The sequence is returned as raw bytes.
    """
    length = _read_pstring_as_integer(handle)
    if length == 0:
        return (None, None)
    # The sign only encodes the overhang type; the byte count is |length|.
    return (length, _read(handle, abs(length)))
| def _parse_feature_description(desc, qualifiers): | |
| """Parse the description field of a Xdna feature. | |
| The 'description' field of a feature sometimes contains several | |
| GenBank-like qualifiers, separated by carriage returns (CR, 0x0D). | |
| """ | |
| # Split the field's value in CR-separated lines, skipping empty lines | |
| for line in [x for x in desc.split("\x0D") if len(x) > 0]: | |
| # Is it a qualifier="value" line? | |
| m = match('^([^=]+)="([^"]+)"?$', line) | |
| if m: | |
| # Store the qualifier as provided | |
| qual, value = m.groups() | |
| qualifiers[qual] = [value] | |
| elif '"' not in line: # Reject ill-formed qualifiers | |
| # Store the entire line as a generic note qualifier | |
| qualifiers["note"] = [line] | |
def _read_feature(handle, record):
    """Read a single sequence feature and append it to ``record.features``.

    The feature is stored as five Pascal strings (name, description, type,
    start, end), four flag bytes and one last Pascal string.
    """
    label = _read_pstring(handle)
    description = _read_pstring(handle)
    feature_type = _read_pstring(handle) or "misc_feature"
    first = _read_pstring_as_integer(handle)
    second = _read_pstring_as_integer(handle)

    # Feature flags (4 bytes):
    #   byte 1: strand (0 = reverse strand, 1 = forward strand);
    #   byte 2: whether to display the feature;
    #   byte 3: unknown meaning;
    #   byte 4: whether to draw an arrow when displaying the feature.
    # Only the strand is used here.
    forward, _display, _arrow = unpack(">BBxB", _read(handle, 4))

    # Reverse-strand features are stored with their coordinates swapped.
    if forward:
        strand, start, end = 1, first, second
    else:
        strand, start, end = -1, second, first

    # The trailing Pascal string usually holds a comma-separated triplet of
    # numbers in the 0-255 range, presumably the RGB display colour of the
    # feature. It is read only to advance the handle, then discarded.
    _read_pstring(handle)

    # Xdna coordinates are 1-based while Biopython counts from 0, hence
    # the -1 shift applied to the start.
    location = SimpleLocation(start - 1, end, strand=strand)

    qualifiers = {"label": [label]} if label else {}
    _parse_feature_description(description, qualifiers)

    record.features.append(
        SeqFeature(location, type=feature_type, qualifiers=qualifiers)
    )
class XdnaIterator(SequenceIterator):
    """Parser for Xdna files."""

    def __init__(self, source):
        """Parse a Xdna file and return a SeqRecord object.

        Argument source is a file-like object in binary mode or a path to a
        file.

        Note that this is an "iterator" in name only, since a Xdna file
        always contains a single sequence.
        """
        super().__init__(source, mode="b", fmt="Xdna")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        The fixed-size 112-byte header is read here, so an empty or
        truncated file raises ValueError right away instead of on the
        first iteration of the returned generator.
        """
        header = handle.read(112)
        if not header:
            raise ValueError("Empty file.")
        if len(header) < 112:
            raise ValueError("Improper header, cannot read 112 bytes from handle")
        return self.iterate(handle, header)

    def iterate(self, handle, header):
        """Parse the file and generate SeqRecord objects.

        Raises ValueError if the header announces an unsupported version or
        an unknown sequence type, or if the file is shorter than the header
        says.
        """
        # The "neg_length" field is the length of the part of the sequence
        # located before the nucleotide considered as the "origin"
        # (nucleotide number 1, which in DNA Strider is not always the first
        # nucleotide). The value is unpacked but not used here.
        fields = unpack(">BBB25xII60xI12x", header)
        version, seq_type, topology, length, _neg_length, com_length = fields
        if version != 0:
            raise ValueError("Unsupported XDNA version")
        if seq_type not in _seq_types:
            raise ValueError("Unknown sequence type")

        # The sequence and the comment are found in all XDNA files
        sequence = _read(handle, length).decode("ASCII")
        comment = _read(handle, com_length).decode("ASCII")

        # The first "word" of the comment doubles as the record name and id
        name = comment.split(" ")[0]

        record = SeqRecord(Seq(sequence), description=comment, name=name, id=name)
        molecule_type = _seq_types[seq_type]
        if molecule_type:
            record.annotations["molecule_type"] = molecule_type
        if topology in _seq_topologies:
            record.annotations["topology"] = _seq_topologies[topology]

        # One more byte after the comment means the optional annotation
        # section is present.
        if handle.read(1):
            # Both overhangs are read only to advance the handle; they are
            # not stored in the record.
            _read_overhang(handle)  # right-side overhang
            _read_overhang(handle)  # left-side overhang

            # A single byte gives the number of features that follow
            (num_features,) = unpack(">B", _read(handle, 1))
            for _ in range(num_features):
                _read_feature(handle, record)

        yield record
class XdnaWriter(SequenceWriter):
    """Write files in the Xdna format."""

    def __init__(self, target):
        """Initialize an Xdna writer object.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.

        """
        super().__init__(target, mode="wb")

    def write_file(self, records):
        """Write the specified record to a Xdna file.

        Note that the function expects a list (or iterable) of records
        as per the SequenceWriter interface, but the list should contain
        only one record as the Xdna format is a mono-record format.

        Returns 1, the number of records written. Raises ValueError if
        ``records`` does not hold exactly one record.
        """
        records = iter(records)
        try:
            record = next(records)
        except StopIteration:
            raise ValueError("Must have one sequence") from None
        try:
            next(records)
        except StopIteration:
            pass
        else:
            raise ValueError("More than one sequence found")

        self._has_truncated_strings = False

        # Sequence type byte: the first matching marker wins, 0 otherwise
        molecule_type = record.annotations.get("molecule_type")
        seqtype = 0
        if molecule_type is not None:
            for marker, code in (("DNA", 1), ("RNA", 3), ("protein", 4)):
                if marker in molecule_type:
                    seqtype = code
                    break

        is_circular = record.annotations.get("topology", "linear") == "circular"
        topology = 1 if is_circular else 0

        # The comment field carries both the record's id and description.
        # The id is only prepended when the description does not already
        # start with it, to avoid duplicating it.
        comment = record.description
        if not comment.startswith(record.id):
            comment = f"{record.id} {record.description}"

        # Write header
        header = pack(
            ">BBB25xII60xI11xB",
            0,  # version
            seqtype,
            topology,
            len(record),
            0,  # negative length
            len(comment),
            255,  # end of header
        )
        self.handle.write(header)

        # Actual sequence and comment
        self.handle.write(bytes(record.seq))
        self.handle.write(comment.encode("ASCII"))

        self.handle.write(pack(">B", 0))  # Annotation section marker
        self._write_pstring("0")  # right-side overhang
        self._write_pstring("0")  # left-side overhang

        # Write features
        # Features with fuzzy locations cannot be represented in the Xdna
        # format, so only those whose two ends are exactly of type
        # ExactPosition are kept.
        features = [
            feat
            for feat in record.features
            if type(feat.location.start) is ExactPosition
            and type(feat.location.end) is ExactPosition
        ]
        drop = len(record.features) - len(features)
        if drop > 0:
            warnings.warn(
                f"Dropping {drop} features with fuzzy locations", BiopythonWarning
            )

        # The number of features is stored on a single byte, so no more
        # than 255 of them can be written.
        if len(features) > 255:
            drop = len(features) - 255
            warnings.warn(
                f"Too many features, dropping the last {drop}", BiopythonWarning
            )
            features = features[:255]

        self.handle.write(pack(">B", len(features)))
        for feature in features:
            quals = feature.qualifiers
            self._write_pstring(quals.get("label", [""])[0])

            # Every qualifier value becomes one CR-separated name="value"
            # line. The label is already written as the feature name, and
            # translations are left out.
            lines = []
            for qname, values in quals.items():
                if qname in ("label", "translation"):
                    continue
                lines.extend(f'{qname}="{val}"' for val in values)
            self._write_pstring("\x0D".join(lines))

            self._write_pstring(feature.type)

            start = int(feature.location.start) + 1  # 1-based coordinates
            end = int(feature.location.end)
            if feature.location.strand == -1:
                # Reverse-strand features are stored with swapped coordinates
                start, end = end, start
                strand = 0
            else:
                strand = 1
            self._write_pstring(str(start))
            self._write_pstring(str(end))

            # Flag bytes: strand, then the constant values 1, 0, 1
            self.handle.write(pack(">BBBB", strand, 1, 0, 1))
            self._write_pstring("127,127,127")

        if self._has_truncated_strings:
            warnings.warn(
                "Some annotations were truncated to 255 characters", BiopythonWarning
            )

        return 1

    def _write_pstring(self, s):
        """Write the given string as a Pascal string.

        Strings longer than 255 characters are cut down to their first 255
        characters, and the truncation is recorded on the writer.
        """
        text = s
        if len(text) > 255:
            self._has_truncated_strings = True
            text = text[:255]
        # Length byte first, then the ASCII payload
        self.handle.write(pack(">B", len(text)))
        self.handle.write(text.encode("ASCII"))