Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /Align /tabular.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 2 years ago

raw

history blame contribute delete

16.5 kB

	# Copyright 2021 by Michiel de Hoon. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.Align support for tabular output from BLAST or FASTA.

	This module contains a parser for tabular output from BLAST run with the
	'-outfmt 7' argument, as well as tabular output from William Pearson's
	FASTA alignment tools using the '-m 8CB' or '-m 8CC' arguments.
	"""
	import re
	import enum
	import numpy
	from Bio.Align import Alignment
	from Bio.Align import interfaces
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord


	class State(enum.Enum):
	    """Kind of alignment column most recently seen while parsing a BTOP string."""

	    # The numbers are the ones consecutive enum.auto() calls would assign.
	    MATCH = 1  # aligned columns: a run of identities or a pair of letters
	    QUERY_GAP = 2  # columns in which only the target advances
	    TARGET_GAP = 3  # columns in which only the query advances
	    NONE = 4  # nothing parsed yet


	class AlignmentIterator(interfaces.AlignmentIterator):
	"""Alignment iterator for tabular output from BLAST or FASTA.

	For reading (pairwise) alignments from tabular output generated by BLAST
	run with the '-outfmt 7' argument, as well as tabular output generated by
	William Pearson's FASTA alignment programs with the '-m 8CB' or '-m 8CC'
	output formats.
	"""

	fmt = "Tabular"

	def _read_header(self, stream):
	try:
	line = next(stream)
	except StopIteration:
	raise ValueError("Empty file.") from None
	if not line.startswith("# "):
	raise ValueError("Missing header.")
	line = line.rstrip()
	self._parse_header(stream, line)

	def _parse_header(self, stream, line):
	metadata = {}
	blast_programs = (
	"BLASTN",
	"BLASTP",
	"BLASTX",
	"TBLASTN",
	"TBLASTX",
	"DELTABLAST",
	"PSIBLAST",
	"RPSBLAST",
	"RPSTBLASTN",
	)
	try:
	program, version = line[2:].split(None, 1)
	if program not in blast_programs:
	raise ValueError("Not a BLAST program")
	except ValueError:
	# FASTA
	metadata["Command line"] = line[2:]
	line = next(stream)
	assert line.startswith("# ")
	metadata["Program"], metadata["Version"] = line[2:].rstrip().split(None, 1)
	self._final_prefix = "# FASTA processed "
	else:
	# BLAST
	metadata["Program"], metadata["Version"] = program, version
	self._final_prefix = "# BLAST processed "
	for line in stream:
	line = line.strip()
	assert line.startswith("# ")
	try:
	prefix, value = line[2:].split(": ")
	except ValueError:
	suffix = " hits found"
	assert line.endswith(suffix)
	hits = int(line[2 : -len(suffix)])
	break
	if prefix == "Query":
	if metadata["Program"] == "FASTA":
	query_line, query_size = value.rsplit(" - ", 1)
	query_size, unit = query_size.split()
	self._query_size = int(query_size)
	assert unit in ("nt", "aa")
	else:
	query_line = value
	self._query_size = None
	try:
	self._query_id, self._query_description = query_line.split(None, 1)
	except ValueError:
	self._query_id = query_line.strip()
	self._query_description = None
	elif prefix == "Database":
	metadata["Database"] = value
	elif prefix == "Fields":
	self._fields = value.split(", ")
	elif prefix == "RID":
	metadata["RID"] = value
	self.metadata = metadata

	def _read_next_alignment(self, stream):
	"""Parse the next data row of the table and return it as an Alignment.

	Header lines ("# ...") found before the row are passed to
	_parse_header. When the closing "processed ... queries" line is
	reached, the per-query parser attributes are deleted and None is
	returned.
	"""
	# Skip over (and parse) header lines until the first data row.
	for line in stream:
	line = line.rstrip()
	if line.startswith("# "):
	if line.startswith(self._final_prefix) and line.endswith(" queries"):
	# End of the file: drop the state set up by _parse_header.
	del self._fields
	del self._query_id
	del self._query_description
	del self._query_size
	del self._final_prefix
	return
	self._parse_header(stream, line)
	else:
	break
	# Values collected from the columns of this row; None means the
	# corresponding field has not been seen. btop and cigar are initialised
	# here but not referenced again in this method.
	alignment_length = None
	identical = None
	btop = None
	cigar = None
	score = None
	query_id = None
	target_id = None
	query_start = None
	query_end = None
	target_start = None
	target_end = None
	query_sequence = None
	target_sequence = None
	target_length = None
	coordinates = None
	query_size = self._query_size
	# Rows are tab-separated, with one column per name in self._fields.
	columns = line.split("\t")
	assert len(columns) == len(self._fields)
	annotations = {}
	query_annotations = {}
	target_annotations = {}
	# Dispatch on the field name: identifiers, positions and sequences go
	# into local variables, everything else into the annotation dictionaries
	# of the alignment, the query or the target.
	for column, field in zip(columns, self._fields):
	if field == "query id":
	query_id = column
	if self._query_id is not None:
	assert query_id == self._query_id
	elif field == "subject id":
	target_id = column
	elif field == "% identity":
	annotations[field] = float(column)
	elif field == "alignment length":
	alignment_length = int(column)
	elif field == "mismatches":
	annotations[field] = int(column)
	elif field == "gap opens":
	annotations[field] = int(column)
	elif field == "q. start":
	query_start = int(column)
	elif field == "q. end":
	query_end = int(column)
	elif field == "s. start":
	target_start = int(column)
	elif field == "s. end":
	target_end = int(column)
	elif field == "evalue":
	annotations["evalue"] = float(column)
	elif field == "bit score":
	annotations["bit score"] = float(column)
	# The alignment path comes from either a BTOP or an aln_code column.
	elif field == "BTOP":
	coordinates = self.parse_btop(column)
	elif field == "aln_code":
	coordinates = self.parse_cigar(column)
	elif field == "query gi":
	query_annotations["gi"] = column
	elif field == "query acc.":
	query_annotations["acc."] = column
	elif field == "query acc.ver":
	query_annotations["acc.ver"] = column
	# Used as the query id if no id has been seen so far.
	if query_id is None:
	query_id = column
	elif field == "query length":
	if query_size is None:
	query_size = int(column)
	else:
	assert query_size == int(column)
	elif field == "subject ids":
	target_annotations["ids"] = column
	elif field == "subject gi":
	target_annotations["gi"] = column
	elif field == "subject gis":
	target_annotations["gis"] = column
	elif field == "subject acc.":
	target_annotations["acc."] = column
	elif field == "subject accs.":
	target_annotations["accs."] = column
	elif field == "subject tax ids":
	target_annotations["tax ids"] = column
	elif field == "subject sci names":
	target_annotations["sci names"] = column
	elif field == "subject com names":
	target_annotations["com names"] = column
	elif field == "subject blast names":
	target_annotations["blast names"] = column
	elif field == "subject super kingdoms":
	target_annotations["super kingdoms"] = column
	elif field == "subject title":
	target_annotations["title"] = column
	elif field == "subject titles":
	target_annotations["titles"] = column
	elif field == "subject strand":
	target_annotations["strand"] = column
	elif field == "% subject coverage":
	target_annotations["% coverage"] = float(column)
	elif field == "subject acc.ver":
	target_annotations["acc.ver"] = column
	# Used as the target id if no id has been seen so far.
	if target_id is None:
	target_id = column
	elif field == "subject length":
	target_length = int(column)
	elif field == "query seq":
	query_sequence = column
	elif field == "subject seq":
	target_sequence = column
	elif field == "score":
	score = int(column)
	elif field == "identical":
	identical = int(column)
	annotations[field] = identical
	elif field == "positives":
	annotations[field] = int(column)
	elif field == "gaps":
	annotations[field] = int(column)
	elif field == "% positives":
	annotations[field] = float(column)
	elif field == "% hsp coverage":
	annotations[field] = float(column)
	elif field == "query/sbjct frames":
	annotations[field] = column
	elif field == "query frame":
	query_annotations["frame"] = column
	elif field == "sbjct frame":
	target_annotations["frame"] = column
	else:
	raise ValueError("Unexpected field '%s'" % field)
	program = self.metadata["Program"]
	if coordinates is None:
	if alignment_length is not None:
	annotations["alignment length"] = alignment_length
	# otherwise, get it from alignment.shape
	# The smaller of each start/end pair is decremented by one.
	# NOTE(review): presumably converts 1-based inclusive positions to
	# 0-based half-open ones - confirm against the BLAST/FASTA output spec.
	if query_start is not None and query_end is not None:
	if query_start < query_end:
	query_start -= 1
	else:
	query_end -= 1
	if target_start is not None and target_end is not None:
	if target_start < target_end:
	target_start -= 1
	else:
	target_end -= 1
	# Query positions: kept as annotations for BLASTX/TBLASTX or when there
	# is no alignment path; otherwise applied to row 1 (the query row) of
	# the coordinates.
	if coordinates is None or program in ("BLASTX", "TBLASTX"):
	if query_start is not None:
	query_annotations["start"] = query_start
	if query_end is not None:
	query_annotations["end"] = query_end
	elif coordinates is not None:
	if query_start < query_end:
	coordinates[1, :] += query_start
	else:
	# mapped to reverse strand
	coordinates[1, :] = query_start - coordinates[1, :]
	# Target positions: kept as annotations for TBLASTN/TBLASTX or when
	# there is no alignment path; otherwise row 0 (the target row) of the
	# coordinates is shifted by target_start.
	if coordinates is None or program in ("TBLASTN", "TBLASTX"):
	if target_start is not None:
	target_annotations["start"] = target_start
	if target_end is not None:
	target_annotations["end"] = target_end
	elif coordinates is not None:
	coordinates[0, :] += target_start
	# Build the query sequence. Without a "query seq" column only the
	# length (if known) is recorded; with one, the gap characters are
	# removed first. Only TBLASTN and TBLASTX are handled in that case.
	if query_sequence is None:
	if query_size is None:
	query_seq = None
	else:
	query_seq = Seq(None, length=query_size)
	else:
	query_sequence = query_sequence.replace("-", "")
	if program == "TBLASTN":
	assert len(query_sequence) == query_end - query_start
	query_seq = Seq({query_start: query_sequence}, length=query_size)
	elif program == "TBLASTX":
	query_annotations["start"] = query_start
	query_annotations["end"] = query_end
	query_seq = Seq(query_sequence)
	else:
	raise Exception("Unknown program %s" % program)
	query = SeqRecord(query_seq, id=query_id)
	if self._query_description is not None:
	query.description = self._query_description
	if query_annotations:
	query.annotations = query_annotations
	# Build the target sequence. For TBLASTN/TBLASTX the target length is
	# stored as an annotation and the sequence, if given, is used as is
	# (minus gaps); for other programs the sequence is placed at
	# target_start within a sequence of length target_end.
	if self.metadata["Program"] in ("TBLASTN", "TBLASTX"):
	target_annotations["length"] = target_length
	if target_sequence is None:
	target_seq = None
	else:
	target_sequence = target_sequence.replace("-", "")
	target_seq = Seq(target_sequence)
	else:
	if target_sequence is None:
	if target_end is None:
	target_seq = None
	else:
	target_seq = Seq(None, length=target_end)
	else:
	target_sequence = target_sequence.replace("-", "")
	if target_start is not None and target_end is not None:
	assert len(target_sequence) == target_end - target_start
	target_seq = Seq({target_start: target_sequence}, length=target_end)
	target = SeqRecord(target_seq, id=target_id)
	if target_annotations:
	target.annotations = target_annotations
	# Target first, query second: the same row order as the coordinates.
	records = [target, query]
	alignment = Alignment(records, coordinates)
	alignment.annotations = annotations
	if score is not None:
	alignment.score = score
	return alignment

	def parse_btop(self, btop):
	    """Parse a BTOP string and return alignment coordinates.

	    A BTOP (Blast trace-back operations) string is used by BLAST to
	    describe a sequence alignment. It is read as a series of tokens:

	    - an integer, advancing both sequences by that many positions;
	    - a pair of characters, describing a single alignment column. A pair
	      starting with a dash advances only the target, a pair ending with
	      a dash advances only the query, and any other pair advances both.

	    Returns a numpy array with two rows, the target coordinates followed
	    by the query coordinates, both starting at 0. Consecutive columns of
	    the same kind are merged into a single step.
	    """
	    target_coordinates = [0]
	    query_coordinates = [0]
	    state = State.NONE
	    # Raw string with a real alternation: a token is either a pair of
	    # characters (letters, "*" or "-") or a run of digits.
	    tokens = re.findall(r"([A-Z*-]{2}|\d+)", btop)
	    for token in tokens:
	        if token.startswith("-"):
	            # Gap in the query: only the target advances.
	            if state != State.QUERY_GAP:
	                target_coordinates.append(target_coordinates[-1])
	                query_coordinates.append(query_coordinates[-1])
	                state = State.QUERY_GAP
	            target_coordinates[-1] += 1
	        elif token.endswith("-"):
	            # Gap in the target: only the query advances.
	            if state != State.TARGET_GAP:
	                target_coordinates.append(target_coordinates[-1])
	                query_coordinates.append(query_coordinates[-1])
	                state = State.TARGET_GAP
	            query_coordinates[-1] += 1
	        else:
	            try:
	                length = int(token)
	            except ValueError:
	                # pair of mismatched letters
	                length = 1
	            if state == State.MATCH:
	                # Extend the current aligned block.
	                target_coordinates[-1] += length
	                query_coordinates[-1] += length
	            else:
	                target_coordinates.append(target_coordinates[-1] + length)
	                query_coordinates.append(query_coordinates[-1] + length)
	                state = State.MATCH
	    return numpy.array([target_coordinates, query_coordinates])

	def parse_cigar(self, cigar):
	    """Parse a CIGAR string and return alignment coordinates.

	    A CIGAR string, as defined by the SAM Sequence Alignment/Map format,
	    describes a sequence alignment as a series of lengths and operation
	    (alignment/insertion/deletion) codes.

	    As implemented here, "M" advances both sequences, "I" advances only
	    the target and "D" advances only the query.

	    Returns a numpy array with two rows, the target coordinates followed
	    by the query coordinates, both starting at 0, with one column added
	    per operation.
	    """
	    target_coordinate = 0
	    query_coordinate = 0
	    target_coordinates = [target_coordinate]
	    query_coordinates = [query_coordinate]
	    # Raw string with a real alternation: a token is either one operation
	    # letter or a run of digits, so tokens alternate length, operation.
	    tokens = re.findall(r"(M|D|I|\d+)", cigar)
	    for length, operation in zip(tokens[::2], tokens[1::2]):
	        length = int(length)
	        if operation == "M":
	            target_coordinate += length
	            query_coordinate += length
	        elif operation == "I":
	            target_coordinate += length
	        elif operation == "D":
	            query_coordinate += length
	        target_coordinates.append(target_coordinate)
	        query_coordinates.append(query_coordinate)
	    return numpy.array([target_coordinates, query_coordinates])