Spaces:
No application file
No application file
| # Copyright 2019 by Michiel de Hoon. All rights reserved. | |
| # Based on code contributed and copyright 2016 by Peter Cock. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the UCSC nib file format. | |
| Nib stands for nibble (4 bit) representation of nucleotide sequences. | |
| The two nibbles in a byte each store one nucleotide, represented numerically | |
| as follows: | |
| - ``0`` - T | |
| - ``1`` - C | |
| - ``2`` - A | |
| - ``3`` - G | |
| - ``4`` - N (unknown) | |
| As the first bit in a nibble is set if the nucleotide is soft-masked, we | |
| additionally have: | |
| - ``8`` - t | |
| - ``9`` - c | |
| - ``a`` - a | |
| - ``b`` - g | |
| - ``c`` - n (unknown) | |
| A nib file contains only one sequence record. | |
| You are expected to use this module via the Bio.SeqIO functions under | |
| the format name "nib": | |
| >>> from Bio import SeqIO | |
| >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") | |
| >>> print("%i %s..." % (len(record), record.seq[:20])) | |
| 50 nAGAAGagccgcNGgCActt... | |
| For detailed information on the file format, please see the UCSC | |
| description at https://genome.ucsc.edu/FAQ/FAQformat.html. | |
| """ | |
| import binascii | |
| import struct | |
| import sys | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import SequenceIterator | |
| from .Interfaces import SequenceWriter | |
| class NibIterator(SequenceIterator): | |
| """Parser for nib files.""" | |
| def __init__(self, source): | |
| """Iterate over a nib file and yield a SeqRecord. | |
| - source - a file-like object or a path to a file in the nib file | |
| format as defined by UCSC; the file must be opened in binary mode. | |
| Note that a nib file always contains only one sequence record. | |
| The sequence of the resulting SeqRecord object should match the sequence | |
| generated by Jim Kent's nibFrag utility run with the -masked option. | |
| This function is used internally via the Bio.SeqIO functions: | |
| >>> from Bio import SeqIO | |
| >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") | |
| >>> print("%s %i" % (record.seq, len(record))) | |
| nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 | |
| You can also call it directly: | |
| >>> with open("Nib/test_even_bigendian.nib", "rb") as handle: | |
| ... for record in NibIterator(handle): | |
| ... print("%s %i" % (record.seq, len(record))) | |
| ... | |
| nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 | |
| """ | |
| super().__init__(source, mode="b", fmt="Nib") | |
| def parse(self, handle): | |
| """Start parsing the file, and return a SeqRecord generator.""" | |
| word = handle.read(4) | |
| if not word: | |
| raise ValueError("Empty file.") | |
| signature = word.hex() | |
| if signature == "3a3de96b": | |
| byteorder = "little" # little-endian | |
| elif signature == "6be93d3a": | |
| byteorder = "big" # big-endian | |
| else: | |
| raise ValueError("unexpected signature in nib header") | |
| records = self.iterate(handle, byteorder) | |
| return records | |
| def iterate(self, handle, byteorder): | |
| """Iterate over the records in the nib file.""" | |
| number = handle.read(4) | |
| length = int.from_bytes(number, byteorder) | |
| data = handle.read() | |
| indices = binascii.hexlify(data) | |
| if length % 2 == 0: | |
| if len(indices) != length: | |
| raise ValueError("Unexpected file size") | |
| elif length % 2 == 1: | |
| if len(indices) != length + 1: | |
| raise ValueError("Unexpected file size") | |
| indices = indices[:length] | |
| if not set(indices).issubset(b"0123489abc"): | |
| raise ValueError("Unexpected sequence data found in file") | |
| table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn") | |
| nucleotides = indices.translate(table) | |
| sequence = Seq(nucleotides) | |
| record = SeqRecord(sequence) | |
| yield record | |
| class NibWriter(SequenceWriter): | |
| """Nib file writer.""" | |
| def __init__(self, target): | |
| """Initialize a Nib writer object. | |
| Arguments: | |
| - target - output stream opened in binary mode, or a path to a file | |
| """ | |
| super().__init__(target, mode="wb") | |
| def write_header(self): | |
| """Write the file header.""" | |
| super().write_header() | |
| handle = self.handle | |
| byteorder = sys.byteorder | |
| if byteorder == "little": # little-endian | |
| signature = "3a3de96b" | |
| elif byteorder == "big": # big-endian | |
| signature = "6be93d3a" | |
| else: | |
| raise RuntimeError(f"unexpected system byte order {byteorder}") | |
| handle.write(bytes.fromhex(signature)) | |
| def write_record(self, record): | |
| """Write a single record to the output file.""" | |
| handle = self.handle | |
| sequence = record.seq | |
| nucleotides = bytes(sequence) | |
| length = len(sequence) | |
| handle.write(struct.pack("i", length)) | |
| table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc") | |
| padding = length % 2 | |
| suffix = padding * b"T" | |
| nucleotides += suffix | |
| if not set(nucleotides).issubset(b"ACGTNacgtn"): | |
| raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only") | |
| indices = nucleotides.translate(table) | |
| handle.write(binascii.unhexlify(indices)) | |
| def write_file(self, records): | |
| """Write the complete file with the records, and return the number of records.""" | |
| count = super().write_file(records, mincount=1, maxcount=1) | |
| return count | |
| if __name__ == "__main__": | |
| from Bio._utils import run_doctest | |
| run_doctest(verbose=0) | |