Spaces:
No application file
No application file
| # Copyright 2019 Damien Goutte-Gattat. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the "gck" file format. | |
| The GCK binary format is generated by the Gene Construction Kit software | |
| from Textco BioSoftware, Inc. | |
| """ | |
| from struct import unpack | |
| from Bio.Seq import Seq | |
| from Bio.SeqFeature import SimpleLocation | |
| from Bio.SeqFeature import SeqFeature | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import SequenceIterator | |
| def _read(handle, length): | |
| """Read the specified number of bytes from the given handle.""" | |
| data = handle.read(length) | |
| if len(data) < length: | |
| raise ValueError(f"Cannot read {length} bytes from handle") | |
| return data | |
def _read_packet(handle):
    """Read one length-prefixed packet and return ``(data, length)``.

    Parts of a GCK file are made of "packets": a 4-byte big-endian
    unsigned integer giving the size of the payload, followed by the
    payload itself.

    Packets carry no type tag. What a packet contains is determined
    solely by its position within the GCK file.
    """
    (size,) = unpack(">I", _read(handle, 4))
    payload = _read(handle, size)
    return (payload, size)
def _read_pstring(handle):
    """Read a Pascal string and return it as text.

    A Pascal string is a single length byte followed by that many bytes
    of string data, decoded here as ASCII.
    """
    (size,) = unpack(">B", _read(handle, 1))
    raw = _read(handle, size)
    return raw.decode("ASCII")
def _read_p4string(handle):
    """Read a 32-bit Pascal string and return it as text.

    Like a Pascal string, except that the length prefix is a 4-byte
    big-endian unsigned integer. That is the same layout as a packet,
    so the raw bytes are fetched with ``_read_packet`` and then decoded
    as ASCII.
    """
    payload, _ = _read_packet(handle)
    return payload.decode("ASCII")
def _parse(handle):
    """Parse a GCK file from a binary handle and yield one SeqRecord.

    The file is read strictly sequentially: header, sequence packet,
    an unknown packet, features packet (followed by the feature names
    and comments), restriction sites packet (followed by the site names
    and comments), another unknown packet, the version blocks (followed
    by their comments), a fixed-size unknown block, the construct name
    and finally a flags block holding the circularity byte.

    This is a generator, so nothing is read (and no error is raised)
    until the first record is requested.

    Raises ValueError if the file is empty, truncated, or if the
    lengths stored in the file are inconsistent with each other.
    """
    # Skip file header
    # GCK files start with a 24-byte header. Bytes 4 and 8 seem to
    # always be 12, maybe this could act as a magic cookie. Bytes
    # 17-20 and 21-24 contain variable values of unknown meaning.
    # An empty file is reported separately from a short header.
    data = handle.read(24)
    if not data:
        raise ValueError("Empty file.")
    if len(data) < 24:
        raise ValueError("Improper header, cannot read 24 bytes from handle")

    # Read the actual sequence data
    packet, length = _read_packet(handle)
    # The body of the sequence packet starts with a 32-bit integer
    # representing the length of the sequence. Check the packet is big
    # enough to hold it, so that a malformed file is reported as a
    # ValueError rather than leaking struct.error from unpack().
    if length < 4:
        raise ValueError("Sequence packet too short to contain a sequence length")
    seq_length = unpack(">I", packet[:4])[0]
    # This length should not be larger than the length of the
    # sequence packet.
    if seq_length > length - 4:
        raise ValueError("Conflicting sequence length values")
    sequence = packet[4:].decode("ASCII")
    record = SeqRecord(Seq(sequence))

    # Skip unknown packet
    _read_packet(handle)

    # Read features packet
    packet, length = _read_packet(handle)
    # The packet starts with a 6-byte header: sequence length (32 bits)
    # and number of features (16 bits).
    if length < 6:
        raise ValueError("Features packet too short to contain its header")
    (seq_length, num_features) = unpack(">IH", packet[:6])
    # Check that length in the features packet matches the actual
    # length of the sequence
    if seq_length != len(sequence):
        raise ValueError("Conflicting sequence length values")
    # Each feature is stored in a 92-byte structure.
    if length - 6 != num_features * 92:
        raise ValueError("Features packet size inconsistent with number of features")

    for i in range(num_features):
        offset = 6 + i * 92
        feature_data = packet[offset : offset + 92]

        # There's probably more stuff to unpack in that structure,
        # but those values are the only ones I understand.
        (
            start,
            end,
            type_code,
            strand_code,
            has_name,
            has_comment,
            version,
        ) = unpack(">II6xH14xB17xII35xB", feature_data)

        # A strand value of 1 means the reverse strand. Other possible
        # values are 0 (no strand specified), 2 (forward strand), and
        # 3 (both strands). All are treated as a forward strand.
        strand = -1 if strand_code == 1 else 1
        location = SimpleLocation(start, end, strand=strand)

        # It looks like any value > 0 indicates a CDS...
        feature_type = "CDS" if type_code > 0 else "misc_feature"

        # Each feature may have a name and a comment, which are then
        # stored immediately after the features packet. Names are
        # stored as Pascal strings (1 length byte followed by the
        # string itself), comments are stored as "32-bit Pascal strings"
        # (4 length bytes followed by the string).
        # These strings must be consumed from the handle even for
        # features that end up being discarded below.
        qualifiers = {}
        if has_name > 0:
            qualifiers["label"] = [_read_pstring(handle)]
        if has_comment > 0:
            qualifiers["note"] = [_read_p4string(handle)]

        # Each feature may exist in several "versions". We keep only
        # the most recent version.
        if version > 0:
            continue

        feature = SeqFeature(location, type=feature_type, qualifiers=qualifiers)
        record.features.append(feature)

    # Read restriction sites packet
    # We are not interested in restriction sites, but we must still read
    # that packet so that we can skip the names and comments for each
    # site, which are stored after that packet in a similar way as for
    # the features above.
    packet, length = _read_packet(handle)
    if length < 6:
        raise ValueError("Sites packet too short to contain its header")
    (_seq_length, num_sites) = unpack(">IH", packet[:6])
    # Each site is stored in an 88-byte structure
    if length - 6 != num_sites * 88:
        raise ValueError("Sites packet size inconsistent with number of sites")

    for i in range(num_sites):
        offset = 6 + i * 88
        site_data = packet[offset : offset + 88]

        # Only the name/comment flags matter here; the first two values
        # are unpacked but not used.
        (_start, _end, has_name, has_comment) = unpack(">II24xII48x", site_data)

        # Skip names and comments
        if has_name:
            _read_pstring(handle)
        if has_comment:
            _read_p4string(handle)

    # Skip unknown packet
    _read_packet(handle)

    # Next in the file are "version packets".
    # However they are not properly formatted "packets" as they are not
    # preceded by an integer giving their size. Instead we have a
    # short integer indicating how many versions are there, and then
    # as many 260-byte blocks as we have versions.
    num_versions = unpack(">H", _read(handle, 2))[0]
    versions = _read(handle, num_versions * 260)
    for i in range(num_versions):
        offset = i * 260
        version_data = versions[offset : offset + 260]

        # Each version may have a comment, which is then stored
        # after all the "version packets". The flag is the last
        # 32-bit integer of the block.
        has_comment = unpack(">I", version_data[-4:])[0]
        if has_comment > 0:
            _read_p4string(handle)

    # Skip unknown fixed-size block
    # Whatever this block contains, it is not preceded by any length
    # indicator, so I hope its size is indeed constant in all files...
    _read(handle, 706)

    # Read the construct's name; its first space-separated word is
    # used as both the record name and the record id.
    name = _read_pstring(handle)
    record.name = record.id = name.split(" ")[0]
    record.description = name

    # Circularity byte
    # There may be other flags in that block, but their meaning
    # is unknown to me. Only the 17th byte is examined.
    flags = _read(handle, 17)
    circularity = unpack(">16xB", flags)[0]
    record.annotations["topology"] = "circular" if circularity > 0 else "linear"

    yield record
class GckIterator(SequenceIterator):
    """Parser for GCK files."""

    def __init__(self, source):
        """Set up iteration over the SeqRecord stored in a GCK file.

        The base iterator is initialized with binary mode and the
        "GCK" format name.
        """
        super().__init__(source, mode="b", fmt="GCK")

    def parse(self, handle):
        """Begin parsing ``handle`` and return a SeqRecord generator.

        A GCK file can only hold a single sequence, so the returned
        generator always produces exactly one record.
        """
        return _parse(handle)