Spaces:
No application file
No application file
| # Copyright 2009-2020 by Peter Cock. All rights reserved. | |
| # Based on code contributed and copyright 2009 by Jose Blanca (COMAV-UPV). | |
| # | |
| # This code is part of the Biopython distribution and governed by its | |
| # license. Please see the LICENSE file that should have been included | |
| # as part of this package. | |
| """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format. | |
| SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for | |
| Biomedical Research and the Wellcome Trust Sanger Institute. SFF was also used | |
| as the native output format from early versions of Ion Torrent's PGM platform | |
| as well. You are expected to use this module via the Bio.SeqIO functions under | |
| the format name "sff" (or "sff-trim" as described below). | |
| For example, to iterate over the records in an SFF file, | |
| >>> from Bio import SeqIO | |
| >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"): | |
| ... print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| ... | |
| E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT... | |
| E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA... | |
| E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC... | |
| E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT... | |
| E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA... | |
| E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC... | |
| E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA... | |
| E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG... | |
| E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC... | |
| E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA... | |
| Each SeqRecord object will contain all the annotation from the SFF file, | |
| including the PHRED quality scores. | |
| >>> print("%s %i" % (record.id, len(record))) | |
| E3MFGYR02F7Z7G 219 | |
| >>> print("%s..." % record.seq[:10]) | |
| tcagAATCAT... | |
| >>> print("%r..." % (record.letter_annotations["phred_quality"][:10])) | |
| [22, 21, 23, 28, 26, 15, 12, 21, 28, 21]... | |
| Notice that the sequence is given in mixed case, the central upper case region | |
| corresponds to the trimmed sequence. This matches the output of the Roche | |
| tools (and the 3rd party tool sff_extract) for SFF to FASTA. | |
| >>> print(record.annotations["clip_qual_left"]) | |
| 4 | |
| >>> print(record.annotations["clip_qual_right"]) | |
| 134 | |
| >>> print(record.seq[:4]) | |
| tcag | |
| >>> print("%s...%s" % (record.seq[4:20], record.seq[120:134])) | |
| AATCATCCACTTTTTA...CAAAACACAAACAG | |
| >>> print(record.seq[134:]) | |
| atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn | |
| The annotations dictionary also contains any adapter clip positions | |
| (usually zero), and information about the flows. e.g. | |
| >>> len(record.annotations) | |
| 12 | |
| >>> print(record.annotations["flow_key"]) | |
| TCAG | |
| >>> print(record.annotations["flow_values"][:10]) | |
| (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) | |
| >>> print(len(record.annotations["flow_values"])) | |
| 400 | |
| >>> print(record.annotations["flow_index"][:10]) | |
| (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) | |
| >>> print(len(record.annotations["flow_index"])) | |
| 219 | |
| Note that to convert from a raw reading in flow_values to the corresponding | |
| homopolymer stretch estimate, the value should be rounded to the nearest 100: | |
| >>> print("%r..." % [int(round(value, -2)) // 100 | |
| ... for value in record.annotations["flow_values"][:10]]) | |
| ... | |
| [1, 0, 1, 0, 0, 1, 0, 1, 0, 2]... | |
| If a read name is exactly 14 alphanumeric characters, the annotations | |
| dictionary will also contain meta-data about the read extracted by | |
| interpreting the name as a 454 Sequencing System "Universal" Accession | |
| Number. Note that if a read name happens to be exactly 14 alphanumeric | |
| characters but was not generated automatically, these annotation records | |
| will contain nonsense information. | |
| >>> print(record.annotations["region"]) | |
| 2 | |
| >>> print(record.annotations["time"]) | |
| [2008, 1, 9, 16, 16, 0] | |
| >>> print(record.annotations["coords"]) | |
| (2434, 1658) | |
| As a convenience method, you can read the file with SeqIO format name "sff-trim" | |
| instead of "sff" to get just the trimmed sequences (without any annotation | |
| except for the PHRED quality scores and anything encoded in the read names): | |
| >>> from Bio import SeqIO | |
| >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"): | |
| ... print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| ... | |
| E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC... | |
| E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC... | |
| E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG... | |
| E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT... | |
| E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA... | |
| E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC... | |
| E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC... | |
| E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC... | |
| E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG... | |
| E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT... | |
| Looking at the final record in more detail, note how this differs to the | |
| example above: | |
| >>> print("%s %i" % (record.id, len(record))) | |
| E3MFGYR02F7Z7G 130 | |
| >>> print("%s..." % record.seq[:10]) | |
| AATCATCCAC... | |
| >>> print("%r..." % record.letter_annotations["phred_quality"][:10]) | |
| [26, 15, 12, 21, 28, 21, 36, 28, 27, 27]... | |
| >>> len(record.annotations) | |
| 4 | |
| >>> print(record.annotations["region"]) | |
| 2 | |
| >>> print(record.annotations["coords"]) | |
| (2434, 1658) | |
| >>> print(record.annotations["time"]) | |
| [2008, 1, 9, 16, 16, 0] | |
| >>> print(record.annotations["molecule_type"]) | |
| DNA | |
| You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF | |
| reads into a FASTQ file (or a FASTA file and a QUAL file), e.g. | |
| >>> from Bio import SeqIO | |
| >>> from io import StringIO | |
| >>> out_handle = StringIO() | |
| >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff", | |
| ... out_handle, "fastq") | |
| ... | |
| >>> print("Converted %i records" % count) | |
| Converted 10 records | |
| The output FASTQ file would start like this: | |
| >>> print("%s..." % out_handle.getvalue()[:50]) | |
| @E3MFGYR02JWQ7T | |
| tcagGGTCTACATGTTGGTTAACCCGTACTGATT... | |
| Bio.SeqIO.index() provides memory efficient random access to the reads in an | |
| SFF file by name. SFF files can include an index within the file, which can | |
| be read in making this very fast. If the index is missing (or in a format not | |
| yet supported in Biopython) the file is indexed by scanning all the reads - | |
| which is a little slower. For example, | |
| >>> from Bio import SeqIO | |
| >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff") | |
| >>> record = reads["E3MFGYR02JHD4H"] | |
| >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC... | |
| >>> reads.close() | |
| Or, using the trimmed reads: | |
| >>> from Bio import SeqIO | |
| >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim") | |
| >>> record = reads["E3MFGYR02JHD4H"] | |
| >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG... | |
| >>> reads.close() | |
| You can also use the Bio.SeqIO.write() function with the "sff" format. Note | |
| that this requires all the flow information etc, and thus is probably only | |
| useful for SeqRecord objects originally from reading another SFF file (and | |
| not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim"). | |
| As an example, let's pretend this example SFF file represents some DNA which | |
| was pre-amplified with a PCR primer AAAGANNNNN. The following script would | |
| produce a sub-file containing all those reads whose post-quality clipping | |
| region (i.e. the sequence after trimming) starts with AAAGA exactly (the non- | |
| degenerate bit of this pretend primer): | |
| >>> from Bio import SeqIO | |
| >>> records = (record for record in | |
| ... SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff") | |
| ... if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA")) | |
| ... | |
| >>> count = SeqIO.write(records, "temp_filtered.sff", "sff") | |
| >>> print("Selected %i records" % count) | |
| Selected 2 records | |
| Of course, for an assembly you would probably want to remove these primers. | |
| If you want FASTA or FASTQ output, you could just slice the SeqRecord. However, | |
| if you want SFF output we have to preserve all the flow information - the trick | |
| is just to adjust the left clip position! | |
| >>> from Bio import SeqIO | |
| >>> def filter_and_trim(records, primer): | |
| ... for record in records: | |
| ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer): | |
| ... record.annotations["clip_qual_left"] += len(primer) | |
| ... yield record | |
| ... | |
| >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff") | |
| >>> count = SeqIO.write(filter_and_trim(records, "AAAGA"), | |
| ... "temp_filtered.sff", "sff") | |
| ... | |
| >>> print("Selected %i records" % count) | |
| Selected 2 records | |
| We can check the results, note the lower case clipped region now includes the "AAAGA" | |
| sequence: | |
| >>> for record in SeqIO.parse("temp_filtered.sff", "sff"): | |
| ... print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| ... | |
| E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC... | |
| E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA... | |
| >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"): | |
| ... print("%s %i %s..." % (record.id, len(record), record.seq[:20])) | |
| ... | |
| E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG... | |
| E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG... | |
| >>> import os | |
| >>> os.remove("temp_filtered.sff") | |
| For a description of the file format, please see the Roche manuals and: | |
| http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats | |
| """ | |
| import re | |
| import struct | |
| from Bio import StreamModeError | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import SequenceIterator | |
| from .Interfaces import SequenceWriter | |
# Single null byte, used when checking that zero-padding regions really
# contain only nulls:
_null = b"\0"
# Four-byte magic numbers found at the start of SFF blocks:
_sff = b".sff"  # main SFF file header
_hsh = b".hsh"  # hash-table style index block (not supported, see below)
_srt = b".srt"  # Roche sorted index block
_mft = b".mft"  # Roche manifest index block (XML manifest plus sorted index)
# Marker byte (0xFF) terminating each entry in Roche read index blocks:
_flag = b"\xff"
def _sff_file_header(handle):
    """Read in an SFF file header (PRIVATE).

    Assumes the handle is at the start of the file, will read forwards
    through the header and leave the handle pointing at the first record.

    Returns a tuple of values from the header (header_length, index_offset,
    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

    >>> with open("Roche/greek.sff", "rb") as handle:
    ...     values = _sff_file_header(handle)
    ...
    >>> print(values[0])
    840
    >>> print(values[1])
    65040
    >>> print(values[2])
    256
    >>> print(values[3])
    24
    >>> print(values[4])
    800
    >>> values[-1]
    'TCAG'
    """
    # file header (part one)
    # use big endian encoding >
    # magic_number               I
    # version                    4B
    # index_offset               Q
    # index_length               I
    # number_of_reads            I
    # header_length              H
    # key_length                 H
    # number_of_flows_per_read   H
    # flowgram_format_code       B
    # [rest of file header depends on the number of flows and how many keys]
    fmt = ">4s4BQIIHHHB"
    assert 31 == struct.calcsize(fmt)
    data = handle.read(31)
    if not data:
        raise ValueError("Empty file.")
    elif len(data) < 31:
        raise ValueError("File too small to hold a valid SFF header.")
    try:
        (
            magic_number,
            ver0,
            ver1,
            ver2,
            ver3,
            index_offset,
            index_length,
            number_of_reads,
            header_length,
            key_length,
            number_of_flows_per_read,
            flowgram_format,
        ) = struct.unpack(fmt, data)
    except TypeError:
        # struct.unpack got str rather than bytes, i.e. a text mode handle:
        raise StreamModeError("SFF files must be opened in binary mode.") from None
    if magic_number in [_hsh, _srt, _mft]:
        # Probably user error, calling Bio.SeqIO.parse() twice!
        raise ValueError("Handle seems to be at SFF index block, not start")
    if magic_number != _sff:  # 779314790
        raise ValueError(f"SFF file did not start '.sff', but {magic_number!r}")
    if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
        raise ValueError(
            "Unsupported SFF version in header, %i.%i.%i.%i" % (ver0, ver1, ver2, ver3)
        )
    if flowgram_format != 1:
        raise ValueError("Flowgram format code %i not supported" % flowgram_format)
    if (index_offset != 0) ^ (index_length != 0):
        # Index offset and length must be both zero (no index) or both set:
        raise ValueError(
            "Index offset %i but index length %i" % (index_offset, index_length)
        )
    flow_chars = handle.read(number_of_flows_per_read).decode("ASCII")
    key_sequence = handle.read(key_length).decode("ASCII")
    # According to the spec, the header_length field should be the total number
    # of bytes required by this set of header fields, and should be equal to
    # "31 + number_of_flows_per_read + key_length" rounded up to the next value
    # divisible by 8.
    assert header_length % 8 == 0
    padding = header_length - number_of_flows_per_read - key_length - 31
    assert 0 <= padding < 8, padding
    if handle.read(padding).count(_null) != padding:
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Your SFF file is invalid, post header %i byte "
            "null padding region contained data." % padding,
            BiopythonParserWarning,
        )
    return (
        header_length,
        index_offset,
        index_length,
        number_of_reads,
        number_of_flows_per_read,
        flow_chars,
        key_sequence,
    )
def _sff_do_slow_index(handle):
    """Generate an index by scanning through all the reads in an SFF file (PRIVATE).

    This is a slow but generic approach if we can't parse the provided index
    (if present).

    Will use the handle seek/tell functions.

    Yields (read_name, record_offset) tuples, one per read.
    """
    handle.seek(0)
    (
        header_length,
        index_offset,
        index_length,
        number_of_reads,
        number_of_flows_per_read,
        flow_chars,
        key_sequence,
    ) = _sff_file_header(handle)
    # Now on to the reads...
    # Fixed part of each read header: read_header_length (H), name_length (H),
    # seq_len (I), then the four clip values (4H):
    read_header_fmt = ">2HI4H"
    read_header_size = struct.calcsize(read_header_fmt)
    # NOTE - assuming flowgram_format==1, which means struct type H
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert read_header_size % 8 == 0  # Important for padding calc later!
    for read in range(number_of_reads):
        record_offset = handle.tell()
        if record_offset == index_offset:
            # Found index block within reads, ignore it:
            offset = index_offset + index_length
            if offset % 8:
                # Round up to the next multiple of eight:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            record_offset = offset
        # assert record_offset%8 == 0 # Worth checking, but slow
        # First the fixed header
        data = handle.read(read_header_size)
        (
            read_header_length,
            name_length,
            seq_len,
            clip_qual_left,
            clip_qual_right,
            clip_adapter_left,
            clip_adapter_right,
        ) = struct.unpack(read_header_fmt, data)
        if read_header_length < 10 or read_header_length % 8 != 0:
            raise ValueError(
                "Malformed read header, says length is %i:\n%r"
                % (read_header_length, data)
            )
        # now the name and any padding (remainder of header)
        name = handle.read(name_length).decode()
        padding = read_header_length - read_header_size - name_length
        if handle.read(padding).count(_null) != padding:
            import warnings

            from Bio import BiopythonParserWarning

            warnings.warn(
                "Your SFF file is invalid, post name %i byte "
                "padding region contained data" % padding,
                BiopythonParserWarning,
            )
        assert record_offset + read_header_length == handle.tell()
        # now the flowgram values, flowgram index, bases and qualities
        size = read_flow_size + 3 * seq_len
        handle.seek(size, 1)  # relative seek, just skip over them
        # now any padding...
        padding = size % 8
        if padding:
            padding = 8 - padding
            if handle.read(padding).count(_null) != padding:
                import warnings

                from Bio import BiopythonParserWarning

                warnings.warn(
                    "Your SFF file is invalid, post quality %i "
                    "byte padding region contained data" % padding,
                    BiopythonParserWarning,
                )
        yield name, record_offset
    if handle.tell() % 8 != 0:
        raise ValueError("After scanning reads, did not end on a multiple of 8")
def _sff_find_roche_index(handle):
    """Locate any existing Roche style XML meta data and read index (PRIVATE).

    Makes a number of hard coded assumptions based on reverse engineered SFF
    files from Roche 454 machines.

    Returns a tuple of read count, SFF "index" offset and size, XML offset
    and size, and the actual read index offset and size.

    Raises a ValueError for unsupported or non-Roche index blocks.
    """
    handle.seek(0)
    (
        header_length,
        index_offset,
        index_length,
        number_of_reads,
        number_of_flows_per_read,
        flow_chars,
        key_sequence,
    ) = _sff_file_header(handle)
    assert handle.tell() == header_length
    if not index_offset or not index_length:
        raise ValueError("No index present in this SFF file")
    # Now jump to the header...
    handle.seek(index_offset)
    fmt = ">4s4B"  # four byte magic number, then four version bytes
    fmt_size = struct.calcsize(fmt)
    data = handle.read(fmt_size)
    if not data:
        raise ValueError(
            "Premature end of file? Expected index of size %i at offset %i, found nothing"
            % (index_length, index_offset)
        )
    if len(data) < fmt_size:
        raise ValueError(
            "Premature end of file? Expected index of size %i at offset %i, found %r"
            % (index_length, index_offset, data)
        )
    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
    if magic_number == _mft:  # 778921588
        # Roche 454 manifest index
        # This is typical from raw Roche 454 SFF files (2009), and includes
        # both an XML manifest and the sorted index.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            # This is "1.00" as a string
            raise ValueError(
                "Unsupported version in .mft index header, %i.%i.%i.%i"
                % (ver0, ver1, ver2, ver3)
            )
        fmt2 = ">LL"  # size of the XML manifest, then size of the index data
        fmt2_size = struct.calcsize(fmt2)
        xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
        if index_length != fmt_size + fmt2_size + xml_size + data_size:
            raise ValueError(
                "Problem understanding .mft index header, %i != %i + %i + %i + %i"
                % (index_length, fmt_size, fmt2_size, xml_size, data_size)
            )
        return (
            number_of_reads,
            header_length,
            index_offset,
            index_length,
            index_offset + fmt_size + fmt2_size,  # XML manifest offset
            xml_size,
            index_offset + fmt_size + fmt2_size + xml_size,  # read index offset
            data_size,
        )
    elif magic_number == _srt:  # 779317876
        # Roche 454 sorted index
        # I've had this from Roche tool sfffile when the read identifiers
        # had nonstandard lengths and there was no XML manifest.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            # This is "1.00" as a string
            raise ValueError(
                "Unsupported version in .srt index header, %i.%i.%i.%i"
                % (ver0, ver1, ver2, ver3)
            )
        data = handle.read(4)
        if data != _null * 4:
            raise ValueError("Did not find expected null four bytes in .srt index")
        return (
            number_of_reads,
            header_length,
            index_offset,
            index_length,
            0,  # no XML manifest in a plain .srt index
            0,
            index_offset + fmt_size + 4,  # read index starts after the nulls
            index_length - fmt_size - 4,
        )
    elif magic_number == _hsh:
        raise ValueError(
            "Hash table style indexes (.hsh) in SFF files are not (yet) supported"
        )
    else:
        raise ValueError(
            f"Unknown magic number {magic_number!r} in SFF index header:\n{data!r}"
        )
def ReadRocheXmlManifest(handle):
    """Read any Roche style XML manifest data in the SFF "index".

    The SFF file format allows for multiple different index blocks, and Roche
    took advantage of this to define their own index block which also embeds
    an XML manifest string. This is not a publicly documented extension to
    the SFF file format, this was reverse engineered.

    The handle should be to an SFF file opened in binary mode. This function
    will use the handle seek/tell functions and leave the handle in an
    arbitrary location.

    Any XML manifest found is returned as a Python string, which you can then
    parse as appropriate, or reuse when writing out SFF files with the
    SffWriter class.

    Returns a string, or raises a ValueError if a Roche manifest could not be
    found.
    """
    # Only the XML offset/size entries of the index description are needed:
    index_info = _sff_find_roche_index(handle)
    xml_offset = index_info[4]
    xml_size = index_info[5]
    if not (xml_offset and xml_size):
        raise ValueError("No XML manifest found")
    handle.seek(xml_offset)
    return handle.read(xml_size).decode()
# This is a generator function!
def _sff_read_roche_index(handle):
    """Read any existing Roche style read index provided in the SFF file (PRIVATE).

    Will use the handle seek/tell functions.

    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.

    Roche SFF indices use base 255 not 256, meaning we see bytes in the
    range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
    can be used as a marker character to separate entries (required if the
    read name lengths vary).

    Note that since only four bytes are used for the read offset, this is
    limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
    tool to combine SFF files beyond this limit, they issue a warning and
    omit the index (and manifest).

    Yields (read_name, read_offset) tuples.
    """
    (
        number_of_reads,
        header_length,
        index_offset,
        index_length,
        xml_offset,
        xml_size,
        read_index_offset,
        read_index_size,
    ) = _sff_find_roche_index(handle)
    # Now parse the read index...
    handle.seek(read_index_offset)
    fmt = ">5B"  # the five single-byte base-255 offset digits of each entry
    for read in range(number_of_reads):
        # TODO - Be more aware of when the index should end?
        # Each entry ends with five offset bytes plus the 0xFF marker, so
        # read six bytes up front, then keep reading until the marker:
        data = handle.read(6)
        while True:
            more = handle.read(1)
            if not more:
                raise ValueError("Premature end of file!")
            data += more
            if more == _flag:
                break
        assert data[-1:] == _flag, data[-1:]
        name = data[:-6].decode()
        off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
        # Offset digits are base 255: 255**2 == 65025, 255**3 == 16581375
        offset = off0 + 255 * off1 + 65025 * off2 + 16581375 * off3
        if off4:
            # Could in theory be used as a fifth piece of offset information,
            # i.e. offset =+ 4228250625L*off4, but testing the Roche tools this
            # is not the case. They simply don't support such large indexes.
            raise ValueError("Expected a null terminator to the read name.")
        yield name, offset
    if handle.tell() != read_index_offset + read_index_size:
        raise ValueError(
            "Problem with index length? %i vs %i"
            % (handle.tell(), read_index_offset + read_index_size)
        )
# 454 "Universal" Accession Numbers are exactly 14 alphanumeric characters:
_valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
def _sff_read_seq_record(
    handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
):
    """Parse the next read in the file, return data as a SeqRecord (PRIVATE).

    Assumes the handle is positioned at the start of a read record, e.g.
    as left after _sff_file_header or after parsing the previous record.

    With trim=True, returns only the clipped region of the read (upper
    case) with no flowgram/clipping annotations; otherwise returns the
    whole read in Roche-style mixed case with the clip positions and flow
    information stored in the annotations dictionary.
    """
    # Now on to the reads...
    # the read header format (fixed part):
    # read_header_length     H
    # name_length            H
    # seq_len                I
    # clip_qual_left         H
    # clip_qual_right        H
    # clip_adapter_left      H
    # clip_adapter_right     H
    # [rest of read header depends on the name length etc]
    read_header_fmt = ">2HI4H"
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    (
        read_header_length,
        name_length,
        seq_len,
        clip_qual_left,
        clip_qual_right,
        clip_adapter_left,
        clip_adapter_right,
    ) = struct.unpack(read_header_fmt, handle.read(read_header_size))
    # SFF stores left clips as one-based positions; convert to zero-based
    # Python slicing indices (zero means "no clip" and is left alone):
    if clip_qual_left:
        clip_qual_left -= 1  # python counting
    if clip_adapter_left:
        clip_adapter_left -= 1  # python counting
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError(
            "Malformed read header, says length is %i" % read_header_length
        )
    # now the name and any padding (remainder of header)
    name = handle.read(name_length).decode()
    padding = read_header_length - read_header_size - name_length
    if handle.read(padding).count(_null) != padding:
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Your SFF file is invalid, post name %i "
            "byte padding region contained data" % padding,
            BiopythonParserWarning,
        )
    # now the flowgram values, flowgram index, bases and qualities
    # NOTE - assuming flowgram_format==1, which means struct type H
    flow_values = handle.read(read_flow_size)  # unpack later if needed
    temp_fmt = ">%iB" % seq_len  # used for flow index and quals
    flow_index = handle.read(seq_len)  # unpack later if needed
    seq = handle.read(seq_len)  # Leave as bytes for Seq object
    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
    # now any padding...
    padding = (read_flow_size + seq_len * 3) % 8
    if padding:
        padding = 8 - padding
        if handle.read(padding).count(_null) != padding:
            import warnings

            from Bio import BiopythonParserWarning

            warnings.warn(
                "Your SFF file is invalid, post quality %i "
                "byte padding region contained data" % padding,
                BiopythonParserWarning,
            )
    # Follow Roche and apply most aggressive of qual and adapter clipping.
    # Note Roche seems to ignore adapter clip fields when writing SFF,
    # and uses just the quality clipping values for any clipping.
    clip_left = max(clip_qual_left, clip_adapter_left)
    # Right clipping of zero means no clipping
    if clip_qual_right:
        if clip_adapter_right:
            clip_right = min(clip_qual_right, clip_adapter_right)
        else:
            # Typical case with Roche SFF files
            clip_right = clip_qual_right
    elif clip_adapter_right:
        clip_right = clip_adapter_right
    else:
        clip_right = seq_len
    # Now build a SeqRecord
    if trim:
        if clip_left >= clip_right:
            # Raise an error?
            import warnings

            from Bio import BiopythonParserWarning

            warnings.warn(
                "Overlapping clip values in SFF record, trimmed to nothing",
                BiopythonParserWarning,
            )
            seq = ""
            quals = []
        else:
            seq = seq[clip_left:clip_right].upper()
            quals = quals[clip_left:clip_right]
        # Don't record the clipping values, flow etc, they make no sense now:
        annotations = {}
    else:
        if clip_left >= clip_right:
            import warnings

            from Bio import BiopythonParserWarning

            warnings.warn(
                "Overlapping clip values in SFF record", BiopythonParserWarning
            )
            seq = seq.lower()
        else:
            # This use of mixed case mimics the Roche SFF tool's FASTA output
            seq = (
                seq[:clip_left].lower()
                + seq[clip_left:clip_right].upper()
                + seq[clip_right:].lower()
            )
        annotations = {
            "flow_values": struct.unpack(read_flow_fmt, flow_values),
            "flow_index": struct.unpack(temp_fmt, flow_index),
            "flow_chars": flow_chars,
            "flow_key": key_sequence,
            "clip_qual_left": clip_qual_left,
            "clip_qual_right": clip_qual_right,
            "clip_adapter_left": clip_adapter_left,
            "clip_adapter_right": clip_adapter_right,
        }
    if re.match(_valid_UAN_read_name, name):
        # Name looks like a 454 Universal Accession Number, so decode the
        # embedded timestamp, region and coordinates:
        annotations["time"] = _get_read_time(name)
        annotations["region"] = _get_read_region(name)
        annotations["coords"] = _get_read_xy(name)
    annotations["molecule_type"] = "DNA"
    record = SeqRecord(
        Seq(seq), id=name, name=name, description="", annotations=annotations
    )
    # Dirty trick to speed up this line:
    # record.letter_annotations["phred_quality"] = quals
    dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
    # Return the record and then continue...
    return record
# Only the last six characters of a string contribute (36**0 .. 36**5):
_powers_of_36 = [36**i for i in range(6)]


def _string_as_base_36(string):
    """Interpret a string as a base-36 number as per 454 manual (PRIVATE).

    Letters 'A'-'Z' and 'a'-'z' map to digit values 0-25, while '0'-'9'
    map to 26-35; any other character counts as zero.
    """
    total = 0
    for char, power in zip(reversed(string), _powers_of_36):
        code = ord(char)
        # For reference: ord('0') = 48, ord('9') = 57
        # For reference: ord('A') = 65, ord('Z') = 90
        # For reference: ord('a') = 97, ord('z') = 122
        if 65 <= code <= 90:
            digit = code - 65
        elif 97 <= code <= 122:
            digit = code - 97
        elif 48 <= code <= 57:
            digit = code - 22  # equivalent to: - ord('0') + 26
        else:
            # Invalid character
            digit = 0
        total += digit * power
    return total
def _get_read_xy(read_name):
    """Extract coordinates from last 5 characters of read name (PRIVATE)."""
    # Decode the tail of the name as base 36, then split into the X
    # coordinate (quotient) and Y coordinate (remainder modulo 4096):
    encoded = _string_as_base_36(read_name[9:])
    x_coord, y_coord = divmod(encoded, 4096)
    return (x_coord, y_coord)
# Mixed-radix place values used to unpack the encoded timestamp:
_time_denominators = [
    13 * 32 * 24 * 60 * 60,
    32 * 24 * 60 * 60,
    24 * 60 * 60,
    60 * 60,
    60,
]


def _get_read_time(read_name):
    """Extract time from first 6 characters of read name (PRIVATE).

    Returns a list of six integers, presumably [year, month, day, hour,
    minute, second] given the radices above — the first field is stored
    as an offset from the year 2000.
    """
    remainder = _string_as_base_36(read_name[:6])
    fields = []
    for denominator in _time_denominators:
        quotient, remainder = divmod(remainder, denominator)
        fields.append(quotient)
    fields.append(remainder)
    fields[0] += 2000
    return fields
def _get_read_region(read_name):
    """Extract region from read name (PRIVATE).

    The region number is the ninth character of the read name.
    """
    region_char = read_name[8]
    return int(region_char)
def _sff_read_raw_record(handle, number_of_flows_per_read):
    """Extract the next read in the file as a raw (bytes) string (PRIVATE).

    Returns the complete record (fixed header, clip values, name, padding,
    flowgram values, flow index, bases, qualities and trailing padding) as
    raw bytes, e.g. for copying a record verbatim to another SFF file.
    """
    # Only the first three header fields need decoding here; the four clip
    # values are passed through without being unpacked:
    read_header_fmt = ">2HI"
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    raw = handle.read(read_header_size)
    read_header_length, name_length, seq_len = struct.unpack(read_header_fmt, raw)
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError(
            "Malformed read header, says length is %i" % read_header_length
        )
    # now the four clip values (4H = 8 bytes), and read name
    raw += handle.read(8 + name_length)
    # and any padding (remainder of header)
    padding = read_header_length - read_header_size - 8 - name_length
    pad = handle.read(padding)
    if pad.count(_null) != padding:
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Your SFF file is invalid, post name %i "
            "byte padding region contained data" % padding,
            BiopythonParserWarning,
        )
    raw += pad
    # now the flowgram values, flowgram index, bases and qualities
    raw += handle.read(read_flow_size + seq_len * 3)
    padding = (read_flow_size + seq_len * 3) % 8
    # now any padding...
    if padding:
        padding = 8 - padding
        pad = handle.read(padding)
        if pad.count(_null) != padding:
            import warnings

            from Bio import BiopythonParserWarning

            warnings.warn(
                "Your SFF file is invalid, post quality %i "
                "byte padding region contained data" % padding,
                BiopythonParserWarning,
            )
        raw += pad
    # Return the raw bytes
    return raw
| class _AddTellHandle: | |
| """Wrapper for handles which do not support the tell method (PRIVATE). | |
| Intended for use with things like network handles where tell (and reverse | |
| seek) are not supported. The SFF file needs to track the current offset in | |
| order to deal with the index block. | |
| """ | |
| def __init__(self, handle): | |
| self._handle = handle | |
| self._offset = 0 | |
| def read(self, length): | |
| data = self._handle.read(length) | |
| self._offset += len(data) | |
| return data | |
| def tell(self): | |
| return self._offset | |
| def seek(self, offset): | |
| if offset < self._offset: | |
| raise RuntimeError("Can't seek backwards") | |
| self._handle.read(offset - self._offset) | |
| def close(self): | |
| return self._handle.close() | |
class SffIterator(SequenceIterator):
    """Parser for Standard Flowgram Format (SFF) files."""

    def __init__(self, source, alphabet=None, trim=False):
        """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

        - source - path to an SFF file, e.g. from Roche 454 sequencing,
          or a file-like object opened in binary mode.
        - alphabet - optional alphabet, unused. Leave as None.
        - trim - should the sequences be trimmed?

        The resulting SeqRecord objects should match those from a paired FASTA
        and QUAL file converted from the SFF file using the Roche 454 tool
        ssfinfo. i.e. The sequence will be mixed case, with the trim regions
        shown in lower case.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
        ...     print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 265
        E3MFGYR02JA6IL 271
        E3MFGYR02JHD4H 310
        E3MFGYR02GFKUC 299
        E3MFGYR02FTGED 281
        E3MFGYR02FR9G7 261
        E3MFGYR02GAZMS 278
        E3MFGYR02HHZ8O 221
        E3MFGYR02GPGB1 269
        E3MFGYR02F7Z7G 219

        You can also call it directly:

        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        ...     for record in SffIterator(handle):
        ...         print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 265
        E3MFGYR02JA6IL 271
        E3MFGYR02JHD4H 310
        E3MFGYR02GFKUC 299
        E3MFGYR02FTGED 281
        E3MFGYR02FR9G7 261
        E3MFGYR02GAZMS 278
        E3MFGYR02HHZ8O 221
        E3MFGYR02GPGB1 269
        E3MFGYR02F7Z7G 219

        Or, with the trim option:

        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        ...     for record in SffIterator(handle, trim=True):
        ...         print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 260
        E3MFGYR02JA6IL 265
        E3MFGYR02JHD4H 292
        E3MFGYR02GFKUC 295
        E3MFGYR02FTGED 277
        E3MFGYR02FR9G7 256
        E3MFGYR02GAZMS 271
        E3MFGYR02HHZ8O 150
        E3MFGYR02GPGB1 221
        E3MFGYR02F7Z7G 130

        """
        # The alphabet argument was removed from Biopython's SeqIO API;
        # reject it explicitly rather than silently ignoring it.
        if alphabet is not None:
            raise ValueError("The alphabet argument is no longer supported")
        super().__init__(source, mode="b", fmt="SFF")
        self.trim = trim

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        try:
            # The parser must start from offset zero - the file header is
            # read first and all index bookkeeping assumes absolute offsets.
            if 0 != handle.tell():
                raise ValueError("Not at start of file, offset %i" % handle.tell())
        except AttributeError:
            # Probably a network handle or something like that
            handle = _AddTellHandle(handle)
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects."""
        trim = self.trim
        ( 
            header_length,
            index_offset,
            index_length,
            number_of_reads,
            number_of_flows_per_read,
            flow_chars,
            key_sequence,
        ) = _sff_file_header(handle)
        # Now on to the reads...
        # the read header format (fixed part):
        # read_header_length     H
        # name_length            H
        # seq_len                I
        # clip_qual_left         H
        # clip_qual_right        H
        # clip_adapter_left      H
        # clip_adapter_right     H
        # [rest of read header depends on the name length etc]
        read_header_fmt = ">2HI4H"
        read_header_size = struct.calcsize(read_header_fmt)
        read_flow_fmt = ">%iH" % number_of_flows_per_read
        read_flow_size = struct.calcsize(read_flow_fmt)
        assert 1 == struct.calcsize(">B")
        assert 1 == struct.calcsize(">s")
        assert 1 == struct.calcsize(">c")
        assert read_header_size % 8 == 0  # Important for padding calc later!
        # The spec allows for the index block to be before or even in the middle
        # of the reads. We can check that if we keep track of our position
        # in the file...
        for read in range(number_of_reads):
            if index_offset and handle.tell() == index_offset:
                # We have hit the index block mid-file; skip past it
                # (including its trailing padding to an 8 byte boundary).
                offset = index_offset + index_length
                if offset % 8:
                    offset += 8 - (offset % 8)
                assert offset % 8 == 0
                handle.seek(offset)
                # Now that we've done this, we don't need to do it again. Clear
                # the index_offset so we can skip extra handle.tell() calls:
                index_offset = 0
            yield _sff_read_seq_record(
                handle, number_of_flows_per_read, flow_chars, key_sequence, trim
            )
        # Verify trailing padding / index block and that the file really ends:
        _check_eof(handle, index_offset, index_length)
| _check_eof(handle, index_offset, index_length) | |
def _check_eof(handle, index_offset, index_length):
    """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).

    Will attempt to spot apparent SFF file concatenation and give an error.

    Will not attempt to seek, only moves the handle forward.
    """
    offset = handle.tell()
    extra = b""
    padding = 0
    if index_offset and offset <= index_offset:
        # Index block then end of file...
        if offset < index_offset:
            raise ValueError(
                "Gap of %i bytes after final record end %i, "
                "before %i where index starts?"
                % (index_offset - offset, offset, index_offset)
            )
        # Doing read to jump the index rather than a seek
        # in case this is a network handle or similar
        handle.read(index_offset + index_length - offset)
        offset = index_offset + index_length
        # Sanity check that the read actually advanced us to the
        # expected position just past the index block:
        if offset != handle.tell():
            raise ValueError(
                "Wanted %i, got %i, index is %i to %i"
                % (offset, handle.tell(), index_offset, index_offset + index_length)
            )
    if offset % 8:
        # File should be padded with nulls to an 8 byte boundary:
        padding = 8 - (offset % 8)
        extra = handle.read(padding)
        if padding >= 4 and extra[-4:] == _sff:
            # Seen this in one user supplied file, should have been
            # four bytes of null padding but was actually .sff and
            # the start of a new concatenated SFF file!
            raise ValueError(
                "Your SFF file is invalid, post index %i byte "
                "null padding region ended '.sff' which could "
                "be the start of a concatenated SFF file? "
                "See offset %i" % (padding, offset)
            )
    if padding and not extra:
        # TODO - Is this error harmless enough to just ignore?
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Your SFF file is technically invalid as it is missing "
            "a terminal %i byte null padding region." % padding,
            BiopythonParserWarning,
        )
        # Missing padding but otherwise at end of file - accept it:
        return
    if extra.count(_null) != padding:
        # Padding present but contains non-null bytes; warn and carry on:
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Your SFF file is invalid, post index %i byte "
            "null padding region contained data: %r" % (padding, extra),
            BiopythonParserWarning,
        )
    offset = handle.tell()
    if offset % 8 != 0:
        raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
    # Should now be at the end of the file...
    extra = handle.read(4)
    if extra == _sff:
        # Found the ".sff" magic number again - looks like another file:
        raise ValueError(
            "Additional data at end of SFF file, "
            "perhaps multiple SFF files concatenated? "
            "See offset %i" % offset
        )
    elif extra:
        raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
class _SffTrimIterator(SffIterator):
    """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE).

    Thin convenience subclass used for the "sff-trim" format name; it is
    simply an SffIterator with the trim option switched on.
    """

    def __init__(self, source):
        SffIterator.__init__(self, source, trim=True)
class SffWriter(SequenceWriter):
    """SFF file writer."""

    def __init__(self, target, index=True, xml=None):
        """Initialize an SFF writer object.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.
         - index - Boolean argument, should we try and write an index?
         - xml - Optional string argument, xml manifest to be recorded
           in the index block (see function ReadRocheXmlManifest for
           reading this data).

        """
        super().__init__(target, "wb")
        self._xml = xml
        if index:
            self._index = []  # list of (name, offset) pairs, filled per record
        else:
            self._index = None

    def write_file(self, records):
        """Use this to write an entire file containing the given records.

        Returns the number of records written.  The header is written using
        flow information taken from the first record, so there must be at
        least one record and each must carry SFF flow annotations.
        """
        try:
            self._number_of_reads = len(records)
        except TypeError:
            self._number_of_reads = 0  # dummy value
            # Only when the record count is unknown up front do we need to
            # rewind later and patch the count into the header - so only
            # then is a seekable handle strictly required.  (Previously this
            # check was unconditional, which wrongly rejected non-seekable
            # handles even when len(records) was available, and made the
            # index fallback below unreachable.)
            if not hasattr(self.handle, "seek") or not hasattr(self.handle, "tell"):
                raise ValueError(
                    "A handle with a seek/tell methods is required in order "
                    "to record the total record count in the file header "
                    "(once it is known at the end)."
                ) from None
        if self._index is not None and not (
            hasattr(self.handle, "seek") and hasattr(self.handle, "tell")
        ):
            # Can't record an index without rewinding - warn and disable it.
            import warnings

            warnings.warn(
                "A handle with a seek/tell methods is required in "
                "order to record an SFF index."
            )
            self._index = None
        self._index_start = 0
        self._index_length = 0
        if not hasattr(records, "next"):
            records = iter(records)
        # Get the first record in order to find the flow information
        # we will need for the header.
        try:
            record = next(records)
        except StopIteration:
            record = None
        if record is None:
            # No records -> empty SFF file (or an error)?
            # We can't write a header without the flow information.
            # return 0
            raise ValueError("Must have at least one sequence")
        try:
            self._key_sequence = record.annotations["flow_key"].encode("ASCII")
            self._flow_chars = record.annotations["flow_chars"].encode("ASCII")
            self._number_of_flows_per_read = len(self._flow_chars)
        except KeyError:
            raise ValueError("Missing SFF flow information") from None
        self.write_header()
        self.write_record(record)
        count = 1
        for record in records:
            self.write_record(record)
            count += 1
        if self._number_of_reads == 0:
            # Must go back and record the record count...
            offset = self.handle.tell()
            self.handle.seek(0)
            self._number_of_reads = count
            self.write_header()
            self.handle.seek(offset)  # not essential?
        else:
            assert count == self._number_of_reads
        if self._index is not None:
            self._write_index()
        return count

    def _write_index(self):
        """Write the index block (including any XML manifest) (PRIVATE).

        Also rewinds to rewrite both the index header (once the index length
        is known) and the file header (to record the index location).
        """
        assert len(self._index) == self._number_of_reads
        handle = self.handle
        # Roche indexes are sorted by read name:
        self._index.sort()
        self._index_start = handle.tell()  # need for header
        # XML...
        if self._xml is not None:
            xml = self._xml.encode()
        else:
            from Bio import __version__

            xml = f"<!-- This file was output with Biopython {__version__} -->\n"
            xml += (
                "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
            )
            xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
            xml = xml.encode()
        xml_len = len(xml)
        # Write to the file...
        fmt = ">I4BLL"
        fmt_size = struct.calcsize(fmt)
        handle.write(_null * fmt_size + xml)  # fill this later
        fmt2 = ">6B"
        assert 6 == struct.calcsize(fmt2)
        index_len = 0  # don't know yet!
        for name, offset in self._index:
            # Roche files record the offsets using base 255 not 256.
            # See comments for parsing the index block. There may be a faster
            # way to code this, but we can't easily use shifts due to odd base
            off3 = offset
            off0 = off3 % 255
            off3 -= off0
            off1 = off3 % 65025
            off3 -= off1
            off2 = off3 % 16581375
            off3 -= off2
            if offset != off0 + off1 + off2 + off3:
                raise RuntimeError(
                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
                )
            off3, off2, off1, off0 = (
                off3 // 16581375,
                off2 // 65025,
                off1 // 255,
                off0,
            )
            if not (off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255):
                raise RuntimeError(
                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
                )
            # Each entry is the name, a leading null, four base-255 digits
            # (most significant first) and a 255 terminator byte:
            handle.write(name + struct.pack(fmt2, 0, off3, off2, off1, off0, 255))
            index_len += len(name) + 6
        # Note any padding is not included:
        self._index_length = fmt_size + xml_len + index_len  # need for header
        # Pad out to an 8 byte boundary (although I have noticed some
        # real Roche SFF files neglect to do this despite their manual
        # suggesting this padding should be there):
        if self._index_length % 8:
            padding = 8 - (self._index_length % 8)
            handle.write(_null * padding)
        else:
            padding = 0
        offset = handle.tell()
        if offset != self._index_start + self._index_length + padding:
            raise RuntimeError(
                "%i vs %i + %i + %i"
                % (offset, self._index_start, self._index_length, padding)
            )
        # Must now go back and update the index header with index size...
        handle.seek(self._index_start)
        handle.write(
            struct.pack(
                fmt,
                778921588,  # magic number
                49,
                46,
                48,
                48,  # Roche index version, "1.00"
                xml_len,
                index_len,
            )
            + xml
        )
        # Must now go back and update the header...
        handle.seek(0)
        self.write_header()
        handle.seek(offset)  # not essential?

    def write_header(self):
        """Write the SFF file header.

        Requires self._key_sequence, self._flow_chars, self._index_start,
        self._index_length, self._number_of_reads and
        self._number_of_flows_per_read to have been set (see write_file).
        """
        # Do header...
        key_length = len(self._key_sequence)
        # file header (part one)
        # use big endian encoding    >
        # magic_number               I
        # version                    4B
        # index_offset               Q
        # index_length               I
        # number_of_reads            I
        # header_length              H
        # key_length                 H
        # number_of_flows_per_read   H
        # flowgram_format_code       B
        # [rest of file header depends on the number of flows and how many keys]
        fmt = ">I4BQIIHHHB%is%is" % (self._number_of_flows_per_read, key_length)
        # According to the spec, the header_length field should be the total
        # number of bytes required by this set of header fields, and should be
        # equal to "31 + number_of_flows_per_read + key_length" rounded up to
        # the next value divisible by 8.
        if struct.calcsize(fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(fmt) % 8)
        header_length = struct.calcsize(fmt) + padding
        assert header_length % 8 == 0
        header = struct.pack(
            fmt,
            779314790,  # magic number 0x2E736666
            0,
            0,
            0,
            1,  # version
            self._index_start,
            self._index_length,
            self._number_of_reads,
            header_length,
            key_length,
            self._number_of_flows_per_read,
            1,  # the only flowgram format code we support
            self._flow_chars,
            self._key_sequence,
        )
        self.handle.write(header + _null * padding)

    def write_record(self, record):
        """Write a single additional record to the output file.

        This assumes the header has been done.  The record must carry the
        SFF annotations (flow values/index/key/chars, PHRED qualities and
        the four clip values) or a ValueError is raised.
        """
        # Basics
        name = record.id.encode()
        name_len = len(name)
        seq = bytes(record.seq).upper()
        seq_len = len(seq)
        # Qualities
        try:
            quals = record.letter_annotations["phred_quality"]
        except KeyError:
            raise ValueError(
                f"Missing PHRED qualities information for {record.id}"
            ) from None
        # Flow
        try:
            flow_values = record.annotations["flow_values"]
            flow_index = record.annotations["flow_index"]
            # All records in one SFF file must share the key/flow setup
            # recorded in the header:
            if (
                self._key_sequence != record.annotations["flow_key"].encode()
                or self._flow_chars != record.annotations["flow_chars"].encode()
            ):
                raise ValueError("Records have inconsistent SFF flow data")
        except KeyError:
            raise ValueError(f"Missing SFF flow information for {record.id}") from None
        except AttributeError:
            raise ValueError("Header not written yet?") from None
        # Clipping - stored 1-based in SFF, so non-zero left clips get +1:
        try:
            clip_qual_left = record.annotations["clip_qual_left"]
            if clip_qual_left < 0:
                raise ValueError(f"Negative SFF clip_qual_left value for {record.id}")
            if clip_qual_left:
                clip_qual_left += 1
            clip_qual_right = record.annotations["clip_qual_right"]
            if clip_qual_right < 0:
                raise ValueError(f"Negative SFF clip_qual_right value for {record.id}")
            clip_adapter_left = record.annotations["clip_adapter_left"]
            if clip_adapter_left < 0:
                raise ValueError(
                    f"Negative SFF clip_adapter_left value for {record.id}"
                )
            if clip_adapter_left:
                clip_adapter_left += 1
            clip_adapter_right = record.annotations["clip_adapter_right"]
            if clip_adapter_right < 0:
                raise ValueError(
                    f"Negative SFF clip_adapter_right value for {record.id}"
                )
        except KeyError:
            raise ValueError(
                f"Missing SFF clipping information for {record.id}"
            ) from None
        # Capture information for index
        if self._index is not None:
            offset = self.handle.tell()
            # Check the position of the final record (before sort by name)
            # Using a four-digit base 255 number, so the upper bound is
            # 254*(1)+254*(255)+254*(255**2)+254*(255**3) = 4228250624
            # or equivalently it overflows at 255**4 = 4228250625
            if offset > 4228250624:
                import warnings

                warnings.warn(
                    "Read %s has file offset %i, which is too large "
                    "to store in the Roche SFF index structure. No "
                    "index block will be recorded." % (name, offset)
                )
                # No point recording the offsets now
                self._index = None
            else:
                self._index.append((name, offset))
        # the read header format (fixed part):
        # read_header_length     H
        # name_length            H
        # seq_len                I
        # clip_qual_left         H
        # clip_qual_right        H
        # clip_adapter_left      H
        # clip_adapter_right     H
        # [rest of read header depends on the name length etc]
        # name
        # flow values
        # flow index
        # sequence
        # padding
        read_header_fmt = ">2HI4H%is" % name_len
        if struct.calcsize(read_header_fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(read_header_fmt) % 8)
        read_header_length = struct.calcsize(read_header_fmt) + padding
        assert read_header_length % 8 == 0
        data = (
            struct.pack(
                read_header_fmt,
                read_header_length,
                name_len,
                seq_len,
                clip_qual_left,
                clip_qual_right,
                clip_adapter_left,
                clip_adapter_right,
                name,
            )
            + _null * padding
        )
        assert len(data) == read_header_length
        # now the flowgram values, flowgram index, bases and qualities
        # NOTE - assuming flowgram_format==1, which means struct type H
        read_flow_fmt = ">%iH" % self._number_of_flows_per_read
        read_flow_size = struct.calcsize(read_flow_fmt)
        temp_fmt = ">%iB" % seq_len  # used for flow index and quals
        data += (
            struct.pack(read_flow_fmt, *flow_values)
            + struct.pack(temp_fmt, *flow_index)
            + seq
            + struct.pack(temp_fmt, *quals)
        )
        # now any final padding...
        padding = (read_flow_size + seq_len * 3) % 8
        if padding:
            padding = 8 - padding
        self.handle.write(data + _null * padding)
if __name__ == "__main__":
    # When run as a script, execute this module's doctests (these rely on
    # the example SFF files under the Biopython Tests/Roche directory).
    from Bio._utils import run_doctest

    run_doctest(verbose=0)