Spaces:
No application file
No application file
| # Copyright 2022 by Michiel de Hoon. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.Align support for alignment files in the bigBed format. | |
| The bigBed format stores a series of pairwise alignments in a single indexed | |
| binary file. Typically they are used for transcript to genome alignments. As | |
| in the BED format, the alignment positions and alignment scores are stored, | |
| but the aligned sequences are not. | |
| See http://genome.ucsc.edu/goldenPath/help/bigBed.html for more information. | |
| You are expected to use this module via the Bio.Align functions. | |
| """ | |
| # This parser was written based on the description of the bigBed file format in | |
| # W. J. Kent, A. S. Zweig,* G. Barber, A. S. Hinrichs, and D. Karolchik: | |
| # "BigWig and BigBed: enabling browsing of large distributed datasets: | |
| # Bioinformatics 26(17): 2204–2207 (2010) | |
| # in particular the tables in the supplemental materials listing the contents | |
| # of a bigBed file byte-by-byte. | |
| import numpy | |
| import struct | |
| import zlib | |
| from collections import namedtuple | |
| from Bio.Align import Alignment | |
| from Bio.Align import interfaces | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| Field = namedtuple("Field", ("type", "name", "comment")) | |
| class AutoSQLTable: | |
| """AutoSQL table describing the columns of an (possibly extended) BED format.""" | |
| def __init__(self, text=None): | |
| """Create an AutoSQL table describing the columns of an (extended) BED format.""" | |
| if text is None: | |
| self.name = None | |
| self.comment = None | |
| self.fields = [] | |
| else: | |
| assert text.endswith(chr(0)) # NULL-terminated string | |
| word, text = text[:-1].split(None, 1) | |
| assert word == "table" | |
| name, text = text.split(None, 1) | |
| assert len(name.split()) == 1 | |
| self.name = name | |
| assert text.startswith('"') | |
| i = text.find('"', 1) | |
| self.comment = text[1:i] | |
| text = text[i + 1 :].strip() | |
| assert text.startswith("(") | |
| assert text.endswith(")") | |
| text = text[1:-1].strip() | |
| fields = [] | |
| while text: | |
| i = text.index('"') | |
| j = text.index('"', i + 1) | |
| field_comment = text[i + 1 : j] | |
| definition = text[:i].strip() | |
| assert definition.endswith(";") | |
| field_type, field_name = definition[:-1].rsplit(None, 1) | |
| if field_type.endswith("]"): | |
| i = field_type.index("[") | |
| data_type = field_type[:i] | |
| else: | |
| data_type = field_type | |
| assert data_type in ( | |
| "int", | |
| "uint", | |
| "short", | |
| "ushort", | |
| "byte", | |
| "ubyte", | |
| "float", | |
| "char", | |
| "string", | |
| "lstring", | |
| ) | |
| field = Field(field_type, field_name, field_comment) | |
| fields.append(field) | |
| text = text[j + 1 :].strip() | |
| self.fields = fields | |
| def __str__(self): | |
| type_width = max(len(str(field.type)) for field in self.fields) | |
| name_width = max(len(field.name) for field in self.fields) + 1 | |
| lines = [] | |
| lines.append("table %s\n" % self.name) | |
| lines.append('"%s"\n' % self.comment) | |
| lines.append("(\n") | |
| for field in self.fields: | |
| name = field.name + ";" | |
| lines.append( | |
| ' %s %s "%s"\n' | |
| % (field.type.ljust(type_width), name.ljust(name_width), field.comment) | |
| ) | |
| lines.append(")\n") | |
| return "".join(lines) | |
| class AlignmentIterator(interfaces.AlignmentIterator): | |
| """Alignment iterator for bigBed files. | |
| The pairwise alignments stored in the bigBed file are loaded and returned | |
| incrementally. Additional alignment information is stored as attributes | |
| of each alignment. | |
| """ | |
| fmt = "bigBed" | |
| mode = "b" | |
| def _read_header(self, stream): | |
| # Supplemental Table 5: Common header | |
| # magic 4 bytes | |
| # version 2 bytes | |
| # zoomLevels 2 bytes | |
| # chromosomeTreeOffset 8 bytes | |
| # fullDataOffset 8 bytes; points to dataCount | |
| # fullIndexOffset 8 bytes | |
| # fieldCount 2 bytes | |
| # definedFieldCount 2 bytes | |
| # autoSqlOffset 8 bytes | |
| # totalSummaryOffset 8 bytes | |
| # uncompressBufSize 4 bytes | |
| # reserved 8 bytes | |
| signature = 0x8789F2EB | |
| magic = stream.read(4) | |
| for byteorder in ("little", "big"): | |
| if int.from_bytes(magic, byteorder=byteorder) == signature: | |
| break | |
| else: | |
| raise ValueError("not a bigBed file") | |
| self.byteorder = byteorder | |
| if byteorder == "little": | |
| byteorder_char = "<" | |
| elif byteorder == "big": | |
| byteorder_char = ">" | |
| else: | |
| raise ValueError("Unexpected byteorder '%s'" % byteorder) | |
| ( | |
| version, | |
| zoomLevels, | |
| chromosomeTreeOffset, | |
| fullDataOffset, | |
| fullIndexOffset, | |
| fieldCount, | |
| definedFieldCount, | |
| autoSqlOffset, | |
| totalSummaryOffset, | |
| uncompressBufSize, | |
| ) = struct.unpack(byteorder_char + "hhqqqhhqqixxxxxxxx", stream.read(60)) | |
| autoSqlSize = totalSummaryOffset - autoSqlOffset | |
| self.declaration = self._read_autosql( | |
| stream, autoSqlOffset, autoSqlSize, fieldCount, definedFieldCount | |
| ) | |
| stream.seek(fullDataOffset) | |
| dataCount = int.from_bytes(stream.read(8), byteorder=byteorder) | |
| self._length = dataCount | |
| if uncompressBufSize > 0: | |
| self._compressed = True | |
| else: | |
| self._compressed = False | |
| self.targets = self._read_chromosomes(stream, chromosomeTreeOffset) | |
| self.tree = self._read_index(stream, fullIndexOffset) | |
| self._data = self._iterate_index(stream) | |
| def _read_autosql(self, stream, pos, size, fieldCount, definedFieldCount): | |
| if definedFieldCount < 3 or definedFieldCount > 12: | |
| raise ValueError( | |
| "expected between 3 and 12 columns, found %d" % definedFieldCount | |
| ) | |
| self.bedN = definedFieldCount | |
| stream.seek(pos) | |
| data = stream.read(size) | |
| declaration = AutoSQLTable(data.decode()) | |
| self._analyze_fields(declaration.fields, fieldCount, definedFieldCount) | |
| return declaration | |
| def _analyze_fields(self, fields, fieldCount, definedFieldCount): | |
| names = ( | |
| "chrom", | |
| "chromStart", | |
| "chromEnd", | |
| "name", | |
| "score", | |
| "strand", | |
| "thickStart", | |
| "thickEnd", | |
| "reserved", | |
| "blockCount", | |
| "blockSizes", | |
| "chromStarts", | |
| ) | |
| for i in range(self.bedN): | |
| name = fields[i].name | |
| if name != names[i]: | |
| raise ValueError( | |
| "Expected field name '%s'; found '%s'" % (names[i], name) | |
| ) | |
| if fieldCount > definedFieldCount: | |
| self._custom_fields = [] | |
| for i in range(definedFieldCount, fieldCount): | |
| field_name = fields[i].name | |
| field_type = fields[i].type | |
| if "[" in field_type and "]" in field_type: | |
| make_array = True | |
| field_type, _ = field_type.split("[") | |
| field_type = field_type.strip() | |
| else: | |
| make_array = False | |
| if field_type in ("int", "uint", "short", "ushort"): | |
| converter = int | |
| elif field_type in ("byte", "ubyte"): | |
| converter = bytes | |
| elif field_type == "float": | |
| converter = float | |
| elif field_type in ("float", "char", "string", "lstring"): | |
| converter = str | |
| else: | |
| raise Exception("Unknown field type %s" % field_type) | |
| if make_array: | |
| item_converter = converter | |
| def converter(data, item_converter=item_converter): | |
| values = data.rstrip(",").split(",") | |
| return [item_converter(value) for value in values] | |
| self._custom_fields.append([field_name, converter]) | |
| def _read_chromosomes(self, stream, pos): | |
| byteorder = self.byteorder | |
| if byteorder == "little": | |
| byteorder_char = "<" | |
| elif byteorder == "big": | |
| byteorder_char = ">" | |
| else: | |
| raise ValueError("Unexpected byteorder '%s'" % byteorder) | |
| # Supplemental Table 8: Chromosome B+ tree header | |
| # magic 4 bytes | |
| # blockSize 4 bytes | |
| # keySize 4 bytes | |
| # valSize 4 bytes | |
| # itemCount 8 bytes | |
| # reserved 8 bytes | |
| stream.seek(pos) | |
| signature = 0x78CA8C91 | |
| magic = int.from_bytes(stream.read(4), byteorder=byteorder) | |
| assert magic == signature | |
| blockSize, keySize, valSize, itemCount = struct.unpack( | |
| byteorder_char + "iiiqxxxxxxxx", stream.read(28) | |
| ) | |
| assert valSize == 8 | |
| Node = namedtuple("Node", ["parent", "children"]) | |
| targets = [] | |
| node = None | |
| while True: | |
| # Supplemental Table 9: Chromosome B+ tree node | |
| # isLeaf 1 byte | |
| # reserved 1 byte | |
| # count 2 bytes | |
| isLeaf, count = struct.unpack(byteorder_char + "?xh", stream.read(4)) | |
| if isLeaf: | |
| for i in range(count): | |
| # Supplemental Table 10: Chromosome B+ tree leaf item format | |
| # key keySize bytes | |
| # chromId 4 bytes | |
| # chromSize 4 bytes | |
| key = stream.read(keySize) | |
| name = key.rstrip(b"\x00").decode() | |
| chromId, chromSize = struct.unpack("<II", stream.read(valSize)) | |
| assert chromId == len(targets) | |
| sequence = Seq(None, length=chromSize) | |
| record = SeqRecord(sequence, id=name) | |
| targets.append(record) | |
| else: | |
| children = [] | |
| for i in range(count): | |
| # Supplemental Table 11: Chromosome B+ tree non-leaf item | |
| # key keySize bytes | |
| # childOffset 8 bytes | |
| key = stream.read(keySize) | |
| pos = int.from_bytes(stream.read(8), byteorder) | |
| children.append(pos) | |
| parent = node | |
| node = Node(parent, children) | |
| while True: | |
| if node is None: | |
| assert len(targets) == itemCount | |
| return targets | |
| children = node.children | |
| try: | |
| pos = children.pop(0) | |
| except IndexError: | |
| node = node.parent | |
| else: | |
| break | |
| stream.seek(pos) | |
| def _read_index(self, stream, pos): | |
| byteorder = self.byteorder | |
| if byteorder == "little": | |
| byteorder_char = "<" | |
| elif byteorder == "big": | |
| byteorder_char = ">" | |
| else: | |
| raise ValueError("Unexpected byteorder '%s'" % byteorder) | |
| Node = namedtuple( | |
| "Node", | |
| [ | |
| "parent", | |
| "children", | |
| "startChromIx", | |
| "startBase", | |
| "endChromIx", | |
| "endBase", | |
| ], | |
| ) | |
| Leaf = namedtuple( | |
| "Leaf", | |
| [ | |
| "parent", | |
| "startChromIx", | |
| "startBase", | |
| "endChromIx", | |
| "endBase", | |
| "dataOffset", | |
| "dataSize", | |
| ], | |
| ) | |
| # Supplemental Table 14: R tree index header | |
| # magic 4 bytes | |
| # blockSize 4 bytes | |
| # itemCount 8 bytes | |
| # startChromIx 4 bytes | |
| # startBase 4 bytes | |
| # endChromIx 4 bytes | |
| # endBase 4 bytes | |
| # endFileOffset 8 bytes | |
| # itemsPerSlot 4 bytes | |
| # reserved 4 bytes | |
| stream.seek(pos) | |
| signature = 0x2468ACE0 | |
| magic = int.from_bytes(stream.read(4), byteorder=byteorder) | |
| assert magic == signature | |
| ( | |
| blockSize, | |
| itemCount, | |
| startChromIx, | |
| startBase, | |
| endChromIx, | |
| endBase, | |
| endFileOffset, | |
| itemsPerSlot, | |
| ) = struct.unpack(byteorder_char + "iqiiiiqixxxx", stream.read(44)) | |
| root = Node(None, [], startChromIx, startBase, endChromIx, endBase) | |
| node = root | |
| itemsCounted = 0 | |
| dataOffsets = {} | |
| while True: | |
| # Supplemental Table 15: R tree node format | |
| # isLeaf 1 byte | |
| # reserved 1 byte | |
| # count 2 bytes | |
| isLeaf, count = struct.unpack(byteorder_char + "?xh", stream.read(4)) | |
| if isLeaf: | |
| children = node.children | |
| for i in range(count): | |
| # Supplemental Table 16: R tree leaf format | |
| # startChromIx 4 bytes | |
| # startBase 4 bytes | |
| # endChromIx 4 bytes | |
| # endBase 4 bytes | |
| # dataOffset 8 bytes | |
| # dataSize 8 bytes | |
| ( | |
| startChromIx, | |
| startBase, | |
| endChromIx, | |
| endBase, | |
| dataOffset, | |
| dataSize, | |
| ) = struct.unpack(byteorder_char + "iiiiqq", stream.read(32)) | |
| child = Leaf( | |
| node, | |
| startChromIx, | |
| startBase, | |
| endChromIx, | |
| endBase, | |
| dataOffset, | |
| dataSize, | |
| ) | |
| children.append(child) | |
| itemsCounted += count | |
| while True: | |
| parent = node.parent | |
| if parent is None: | |
| assert itemsCounted == itemCount | |
| assert not dataOffsets | |
| return node | |
| for index, child in enumerate(parent.children): | |
| if id(node) == id(child): | |
| break | |
| else: | |
| raise RuntimeError("Failed to find child node") | |
| try: | |
| node = parent.children[index + 1] | |
| except IndexError: | |
| node = parent | |
| else: | |
| break | |
| else: | |
| children = node.children | |
| for i in range(count): | |
| # Supplemental Table 17: R tree non-leaf format | |
| # startChromIx 4 bytes | |
| # startBase 4 bytes | |
| # endChromIx 4 bytes | |
| # endBase 4 bytes | |
| # dataOffset 8 bytes | |
| ( | |
| startChromIx, | |
| startBase, | |
| endChromIx, | |
| endBase, | |
| dataOffset, | |
| ) = struct.unpack(byteorder_char + "iiiiq", stream.read(24)) | |
| child = Node(node, [], startChromIx, startBase, endChromIx, endBase) | |
| dataOffsets[id(child)] = dataOffset | |
| children.append(child) | |
| parent = node | |
| node = children[0] | |
| pos = dataOffsets.pop(id(node)) | |
| stream.seek(pos) | |
| def _iterate_index(self, stream): | |
| byteorder = self.byteorder | |
| if byteorder == "little": | |
| byteorder_char = "<" | |
| elif byteorder == "big": | |
| byteorder_char = ">" | |
| else: | |
| raise ValueError("Unexpected byteorder '%s'" % byteorder) | |
| node = self.tree | |
| while True: | |
| try: | |
| children = node.children | |
| except AttributeError: | |
| stream.seek(node.dataOffset) | |
| data = stream.read(node.dataSize) | |
| if self._compressed > 0: | |
| data = zlib.decompress(data) | |
| while data: | |
| # Supplemental Table 12: Binary BED-data format | |
| # chromId 4 bytes | |
| # chromStart 4 bytes | |
| # chromEnd 4 bytes | |
| # rest zero-terminated string in tab-separated format | |
| chromId, chromStart, chromEnd = struct.unpack( | |
| byteorder_char + "III", data[:12] | |
| ) | |
| rest, data = data[12:].split(b"\00", 1) | |
| yield (chromId, chromStart, chromEnd, rest) | |
| while True: | |
| parent = node.parent | |
| if parent is None: | |
| return | |
| for index, child in enumerate(parent.children): | |
| if id(node) == id(child): | |
| break | |
| else: | |
| raise RuntimeError("Failed to find child node") | |
| try: | |
| node = parent.children[index + 1] | |
| except IndexError: | |
| node = parent | |
| else: | |
| break | |
| else: | |
| node = children[0] | |
| def _search_index(self, stream, chromIx, start, end): | |
| byteorder = self.byteorder | |
| if byteorder == "little": | |
| byteorder_char = "<" | |
| elif byteorder == "big": | |
| byteorder_char = ">" | |
| else: | |
| raise ValueError("Unexpected byteorder '%s'" % byteorder) | |
| padded_start = start - 1 | |
| padded_end = end + 1 | |
| node = self.tree | |
| while True: | |
| try: | |
| children = node.children | |
| except AttributeError: | |
| stream.seek(node.dataOffset) | |
| data = stream.read(node.dataSize) | |
| if self._compressed > 0: | |
| data = zlib.decompress(data) | |
| while data: | |
| # Supplemental Table 12: Binary BED-data format | |
| # chromId 4 bytes | |
| # chromStart 4 bytes | |
| # chromEnd 4 bytes | |
| # rest zero-terminated string in tab-separated format | |
| child_chromIx, child_chromStart, child_chromEnd = struct.unpack( | |
| byteorder_char + "III", data[:12] | |
| ) | |
| rest, data = data[12:].split(b"\00", 1) | |
| if child_chromIx != chromIx: | |
| continue | |
| if end <= child_chromStart or child_chromEnd <= start: | |
| if child_chromStart != child_chromEnd: | |
| continue | |
| if child_chromStart != end and child_chromEnd != start: | |
| continue | |
| yield (child_chromIx, child_chromStart, child_chromEnd, rest) | |
| else: | |
| visit_child = False | |
| for child in children: | |
| if (child.endChromIx, child.endBase) < (chromIx, padded_start): | |
| continue | |
| if (chromIx, padded_end) < (child.startChromIx, child.startBase): | |
| continue | |
| visit_child = True | |
| break | |
| if visit_child: | |
| node = child | |
| continue | |
| while True: | |
| parent = node.parent | |
| if parent is None: | |
| return | |
| for index, child in enumerate(parent.children): | |
| if id(node) == id(child): | |
| break | |
| else: | |
| raise RuntimeError("Failed to find child node") | |
| try: | |
| node = parent.children[index + 1] | |
| except IndexError: | |
| node = parent | |
| else: | |
| break | |
| def _read_next_alignment(self, stream): | |
| chunk = next(self._data) | |
| return self._create_alignment(chunk) | |
| def _create_alignment(self, chunk): | |
| chromId, chromStart, chromEnd, rest = chunk | |
| if rest: | |
| words = rest.decode().split("\t") | |
| else: | |
| words = [] | |
| target_record = self.targets[chromId] | |
| if self.bedN > 3: | |
| name = words[0] | |
| else: | |
| name = None | |
| if self.bedN > 5: | |
| strand = words[2] | |
| else: | |
| strand = "+" | |
| if self.bedN > 9: | |
| blockCount = int(words[6]) | |
| blockSizes = [ | |
| int(blockSize) for blockSize in words[7].rstrip(",").split(",") | |
| ] | |
| blockStarts = [ | |
| int(blockStart) for blockStart in words[8].rstrip(",").split(",") | |
| ] | |
| if len(blockSizes) != blockCount: | |
| raise ValueError( | |
| "Inconsistent number of block sizes (%d found, expected %d)" | |
| % (len(blockSizes), blockCount) | |
| ) | |
| if len(blockStarts) != blockCount: | |
| raise ValueError( | |
| "Inconsistent number of block start positions (%d found, expected %d)" | |
| % (len(blockStarts), blockCount) | |
| ) | |
| blockSizes = numpy.array(blockSizes) | |
| blockStarts = numpy.array(blockStarts) | |
| tPosition = 0 | |
| qPosition = 0 | |
| coordinates = [[tPosition, qPosition]] | |
| for blockSize, blockStart in zip(blockSizes, blockStarts): | |
| if blockStart != tPosition: | |
| coordinates.append([blockStart, qPosition]) | |
| tPosition = blockStart | |
| tPosition += blockSize | |
| qPosition += blockSize | |
| coordinates.append([tPosition, qPosition]) | |
| coordinates = numpy.array(coordinates).transpose() | |
| qSize = sum(blockSizes) | |
| else: | |
| blockSize = chromEnd - chromStart | |
| coordinates = numpy.array([[0, blockSize], [0, blockSize]]) | |
| qSize = blockSize | |
| coordinates[0, :] += chromStart | |
| query_sequence = Seq(None, length=qSize) | |
| query_record = SeqRecord(query_sequence, id=name) | |
| records = [target_record, query_record] | |
| if strand == "-": | |
| coordinates[1, :] = qSize - coordinates[1, :] | |
| if chromStart != coordinates[0, 0]: | |
| raise ValueError( | |
| "Inconsistent chromStart found (%d, expected %d)" | |
| % (chromStart, coordinates[0, 0]) | |
| ) | |
| if chromEnd != coordinates[0, -1]: | |
| raise ValueError( | |
| "Inconsistent chromEnd found (%d, expected %d)" | |
| % (chromEnd, coordinates[0, -1]) | |
| ) | |
| alignment = Alignment(records, coordinates) | |
| if len(words) > self.bedN - 3: | |
| alignment.annotations = {} | |
| for word, custom_field in zip(words[self.bedN - 3 :], self._custom_fields): | |
| name, converter = custom_field | |
| alignment.annotations[name] = converter(word) | |
| if self.bedN <= 4: | |
| return alignment | |
| score = words[1] | |
| try: | |
| score = float(score) | |
| except ValueError: | |
| pass | |
| else: | |
| if score.is_integer(): | |
| score = int(score) | |
| alignment.score = score | |
| if self.bedN <= 6: | |
| return alignment | |
| alignment.thickStart = int(words[3]) | |
| if self.bedN <= 7: | |
| return alignment | |
| alignment.thickEnd = int(words[4]) | |
| if self.bedN <= 8: | |
| return alignment | |
| alignment.itemRgb = words[5] | |
| return alignment | |
| def __len__(self): | |
| return self._length | |
| def search(self, chromosome=None, start=None, end=None): | |
| """Iterate over alignments overlapping the specified chromosome region.. | |
| This method searches the index to find alignments to the specified | |
| chromosome that fully or partially overlap the chromosome region | |
| between start and end. | |
| Arguments: | |
| - chromosome - chromosome name. If None (default value), include all | |
| alignments. | |
| - start - starting position on the chromosome. If None (default | |
| value), use 0 as the starting position. | |
| - end - end position on the chromosome. If None (default value), | |
| use the length of the chromosome as the end position. | |
| """ | |
| stream = self._stream | |
| if chromosome is None: | |
| if start is not None or end is not None: | |
| raise ValueError( | |
| "start and end must both be None if chromosome is None" | |
| ) | |
| else: | |
| for chromIx, target in enumerate(self.targets): | |
| if target.id == chromosome: | |
| break | |
| else: | |
| raise ValueError("Failed to find %s in alignments" % chromosome) | |
| if start is None: | |
| if end is None: | |
| start = 0 | |
| end = len(target) | |
| else: | |
| raise ValueError("end must be None if start is None") | |
| elif end is None: | |
| end = start + 1 | |
| data = self._search_index(stream, chromIx, start, end) | |
| for chunk in data: | |
| alignment = self._create_alignment(chunk) | |
| yield alignment | |