Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /BioSQL /BioSeq.py

aakash0017

Upload folder using huggingface_hub

b7731cd about 2 years ago

raw

history blame contribute delete

21.6 kB

	# Copyright 2002 by Andrew Dalke. All rights reserved.
	# Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
	# Revisions 2008-2009 copyright by Cymon J. Cox. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	#
	# Note that BioSQL (including the database schema and scripts) is
	# available and licensed separately. Please consult www.biosql.org
	"""Implementations of Biopython-like Seq objects on top of BioSQL.

	This allows retrieval of items stored in a BioSQL database using
	a biopython-like SeqRecord and Seq interface.

	Note: Currently we do not support recording per-letter-annotations
	(like quality scores) in BioSQL.
	"""

	from Bio.Seq import Seq, SequenceDataAbstractBaseClass
	from Bio.SeqRecord import SeqRecord, _RestrictedDict
	from Bio import SeqFeature


	class _BioSQLSequenceData(SequenceDataAbstractBaseClass):
	    """Retrieves sequence data from a BioSQL database (PRIVATE).

	    Only the bioentry key, the adaptor and a window (``start``, ``_length``)
	    are stored on the object; letters are fetched on demand through
	    ``adaptor.get_subseq_as_string``.
	    """

	    __slots__ = ("primary_id", "adaptor", "_length", "start")

	    def __init__(self, primary_id, adaptor, start=0, length=0):
	        """Create a new _BioSQLSequenceData object referring to a BioSQL entry.

	        You wouldn't normally create a _BioSQLSequenceData object yourself,
	        this is done for you when retrieving a DBSeqRecord object from the
	        database, which creates a Seq object using a _BioSQLSequenceData
	        instance as the data provider.

	        Arguments:
	         - primary_id - key passed on to the adaptor for every fetch
	         - adaptor - object providing ``get_subseq_as_string``
	         - start - offset of this window within the stored sequence
	         - length - number of letters in this window
	        """
	        self.primary_id = primary_id
	        self.adaptor = adaptor
	        self._length = length
	        self.start = start
	        super().__init__()

	    def __len__(self):
	        """Return the length of the sequence."""
	        return self._length

	    def __getitem__(self, key):
	        """Return a subsequence as a bytes or a _BioSQLSequenceData object.

	        - An integer index returns the letter as an int (like bytes do) and
	          raises IndexError when out of range.
	        - An empty slice returns ``b""``.
	        - A step-1 slice covering everything returns the sequence as bytes;
	          any other step-1 slice returns a new _BioSQLSequenceData window
	          without fetching any letters.
	        - Any other step fetches the covering range and strides over it,
	          returning bytes.
	        """
	        if not isinstance(key, slice):
	            # Return a single letter as an integer (consistent with bytes)
	            i = key
	            if i < 0:
	                i += self._length
	                if i < 0:
	                    raise IndexError(key)
	            elif i >= self._length:
	                raise IndexError(key)
	            c = self.adaptor.get_subseq_as_string(
	                self.primary_id, self.start + i, self.start + i + 1
	            )
	            return ord(c)

	        start, end, step = key.indices(self._length)
	        positions = range(start, end, step)
	        size = len(positions)
	        if size == 0:
	            return b""

	        if step == 1:
	            if start == 0 and size == self._length:
	                # Return the full sequence as bytes
	                sequence = self.adaptor.get_subseq_as_string(
	                    self.primary_id, self.start, self.start + self._length
	                )
	                return sequence.encode("ASCII")
	            # Return a _BioSQLSequenceData with the start and end adjusted
	            return _BioSQLSequenceData(
	                self.primary_id, self.adaptor, self.start + start, size
	            )

	        # Will have to extract the sequence because of the stride.
	        if step > 0:
	            low, high = start, end
	        else:
	            # For a negative step slice.indices() gives start > end (end may
	            # even be -1), so asking the adaptor for [start, end) would be a
	            # backwards range. Fetch the ascending window that covers every
	            # selected position instead; striding over it with the negative
	            # step then begins at its last letter, which is ``start``.
	            low, high = positions[-1], start + 1
	        full = self.adaptor.get_subseq_as_string(
	            self.primary_id, self.start + low, self.start + high
	        )
	        return full[::step].encode("ASCII")


	def _retrieve_seq_len(adaptor, primary_id):
	    """Return the length recorded in biosequence for a bioentry (PRIVATE).

	    Returns None when the query yields no row, and raises ValueError when
	    it yields more than one.
	    """
	    # The database schema ensures there will be only one matching row
	    rows = adaptor.execute_and_fetchall(
	        "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)
	    )
	    if rows:
	        if len(rows) == 1:
	            (recorded_length,) = rows[0]
	            return int(recorded_length)
	        raise ValueError(f"Expected 1 response, got {len(rows)}.")
	    return None


	def _retrieve_seq(adaptor, primary_id):
	    """Return the Seq for a bioentry, or None if it has no biosequence row (PRIVATE).

	    A stored sequence is wrapped lazily in a _BioSQLSequenceData; a row whose
	    seq is NULL yields an undefined Seq of the recorded length. ValueError is
	    raised when the row count or the recorded lengths are inconsistent.
	    """
	    # The database schema ensures there will be only one matching
	    # row in the table.

	    # If an undefined sequence was recorded, seq will be NULL,
	    # but length will be populated. This means length(seq)
	    # will return None.
	    rows = adaptor.execute_and_fetchall(
	        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
	        (primary_id,),
	    )
	    if not rows:
	        return None
	    if len(rows) != 1:
	        raise ValueError(f"Expected 1 response, got {len(rows)}.")
	    _moltype, declared, measured = rows[0]

	    try:
	        measured = int(measured)
	        declared = int(declared)
	    except TypeError:
	        # int(None) ends up here; only a NULL length(seq) is acceptable.
	        if measured is not None:
	            raise ValueError(f"Expected 'length' to be 'None', got {measured}.")
	        rows = adaptor.execute_and_fetchall(
	            "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
	            (primary_id,),
	        )
	        if len(rows) != 1:
	            raise ValueError(f"Expected 1 response, got {len(rows)}.")
	        _moltype, declared, stored = rows[0]
	        if stored:
	            raise ValueError(f"Expected 'seq' to have a falsy value, got {stored}.")
	        final_length = int(declared)
	        seq_is_defined = False
	    else:
	        if measured != declared:
	            raise ValueError(
	                f"'length' differs from sequence length, {declared}, {measured}"
	            )
	        final_length = measured
	        seq_is_defined = True

	    if not seq_is_defined:
	        return Seq(None, length=final_length)
	    provider = _BioSQLSequenceData(primary_id, adaptor, start=0, length=final_length)
	    return Seq(provider)


	def _retrieve_dbxrefs(adaptor, primary_id):
	    """Retrieve the database cross references for the sequence (PRIVATE).

	    Returns a list of "dbname:accession" strings in rank order, with
	    ".version" appended when the version is truthy and not "0".
	    """
	    rows = adaptor.execute_and_fetchall(
	        "SELECT dbname, accession, version"
	        " FROM bioentry_dbxref join dbxref using (dbxref_id)"
	        " WHERE bioentry_id = %s"
	        ' ORDER BY "rank"',
	        (primary_id,),
	    )
	    return [
	        f"{dbname}:{accession}.{version}"
	        if version and version != "0"
	        else f"{dbname}:{accession}"
	        for dbname, accession, version in rows
	    ]


	def _retrieve_features(adaptor, primary_id):
	    """Return the features of a bioentry as a list of SeqFeature objects (PRIVATE).

	    For every seqfeature row (in rank order) this runs further queries for
	    its qualifiers, its db_xrefs, its locations and any remote references
	    attached to those locations, then assembles a SeqFeature:

	    - qualifiers map each name to the list of its values; db_xrefs are
	      added under "db_xref" as "dbname:accession" strings;
	    - no location rows leaves the location unset, one row gives a
	      SimpleLocation, several give a CompoundLocation with operator "join".

	    Raises ValueError if a stored strand is not +1, -1 or 0/None, and emits
	    a BiopythonWarning when a location's end lies before its start.
	    """
	    sql = (
	        'SELECT seqfeature_id, type.name, "rank"'
	        " FROM seqfeature join term type on (type_term_id = type.term_id)"
	        " WHERE bioentry_id = %s"
	        ' ORDER BY "rank"'
	    )
	    feature_rows = adaptor.execute_and_fetchall(sql, (primary_id,))
	    seq_feature_list = []
	    for seqfeature_id, seqfeature_type, _seqfeature_rank in feature_rows:
	        # Get qualifiers [except for db_xref which is stored separately]
	        qvs = adaptor.execute_and_fetchall(
	            "SELECT name, value"
	            " FROM seqfeature_qualifier_value join term using (term_id)"
	            " WHERE seqfeature_id = %s"
	            ' ORDER BY "rank"',
	            (seqfeature_id,),
	        )
	        qualifiers = {}
	        for qv_name, qv_value in qvs:
	            qualifiers.setdefault(qv_name, []).append(qv_value)
	        # Get db_xrefs [special case of qualifiers]
	        qvs = adaptor.execute_and_fetchall(
	            "SELECT dbxref.dbname, dbxref.accession"
	            " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
	            " WHERE seqfeature_dbxref.seqfeature_id = %s"
	            ' ORDER BY "rank"',
	            (seqfeature_id,),
	        )
	        for qv_name, qv_value in qvs:
	            value = f"{qv_name}:{qv_value}"
	            qualifiers.setdefault("db_xref", []).append(value)
	        # Get locations. Kept under its own name so the list driving the
	        # enclosing loop is not rebound while that loop is running.
	        location_rows = adaptor.execute_and_fetchall(
	            "SELECT location_id, start_pos, end_pos, strand"
	            " FROM location"
	            " WHERE seqfeature_id = %s"
	            ' ORDER BY "rank"',
	            (seqfeature_id,),
	        )
	        locations = []
	        # convert to Python standard form
	        # Convert strand = 0 to strand = None
	        # re: comment in Loader.py:
	        # Biopython uses None when we don't know strand information but
	        # BioSQL requires something (non null) and sets this as zero
	        # So we'll use the strand or 0 if Biopython spits out None
	        for location_id, start, end, strand in location_rows:
	            if start:
	                start -= 1
	            if strand == 0:
	                strand = None
	            if strand not in (+1, -1, None):
	                raise ValueError(
	                    "Invalid strand %s found in database for "
	                    "seqfeature_id %s" % (strand, seqfeature_id)
	                )
	            if start is not None and end is not None and end < start:
	                # Imported here because only this path needs them.
	                import warnings
	                from Bio import BiopythonWarning

	                warnings.warn(
	                    "Inverted location start/end (%i and %i) for "
	                    "seqfeature_id %s" % (start, end, seqfeature_id),
	                    BiopythonWarning,
	                )

	            # For SwissProt unknown positions (?)
	            if start is None:
	                start = SeqFeature.UnknownPosition()
	            if end is None:
	                end = SeqFeature.UnknownPosition()

	            locations.append((location_id, start, end, strand))
	        # Get possible remote reference information
	        remote_results = adaptor.execute_and_fetchall(
	            "SELECT location_id, dbname, accession, version"
	            " FROM location join dbxref using (dbxref_id)"
	            " WHERE seqfeature_id = %s",
	            (seqfeature_id,),
	        )
	        lookup = {}
	        for location_id, dbname, accession, version in remote_results:
	            if version and version != "0":
	                v = f"{accession}.{version}"
	            else:
	                v = accession
	            # subfeature remote location db_ref are stored as a empty string
	            # when not present
	            if dbname == "":
	                dbname = None
	            lookup[location_id] = (dbname, v)

	        feature = SeqFeature.SeqFeature(type=seqfeature_type)
	        # Store the key as a private property
	        feature._seqfeature_id = seqfeature_id
	        feature.qualifiers = qualifiers
	        if len(locations) == 0:
	            pass
	        elif len(locations) == 1:
	            location_id, start, end, strand = locations[0]
	            # See Bug 2677, we currently don't record the location_operator
	            # For consistency with older versions Biopython, default to "".
	            feature.location_operator = _retrieve_location_qualifier_value(
	                adaptor, location_id
	            )
	            dbname, version = lookup.get(location_id, (None, None))
	            # Give strand and remote reference to the location itself, the
	            # same way the multi-location branch below does, instead of
	            # assigning them on the feature afterwards.
	            feature.location = SeqFeature.SimpleLocation(
	                start, end, strand=strand, ref=version, ref_db=dbname
	            )
	        else:
	            locs = []
	            for location in locations:
	                location_id, start, end, strand = location
	                dbname, version = lookup.get(location_id, (None, None))
	                locs.append(
	                    SeqFeature.SimpleLocation(
	                        start, end, strand=strand, ref=version, ref_db=dbname
	                    )
	                )
	            # Locations are typically in biological in order (see negative
	            # strands below), but because of remote locations for
	            # sub-features they are not necessarily in numerical order:
	            strands = {_.strand for _ in locs}
	            if len(strands) == 1 and -1 in strands:
	                # Evil hack time for backwards compatibility
	                # TODO - Check if BioPerl and (old) Biopython did the same,
	                # we may have an existing incompatibility lurking here...
	                locs = locs[::-1]
	            feature.location = SeqFeature.CompoundLocation(locs, "join")
	            # TODO - See Bug 2677 - we don't yet record location operator,
	            # so for consistency with older versions of Biopython default
	            # to assuming its a join.
	        seq_feature_list.append(feature)
	    return seq_feature_list


	def _retrieve_location_qualifier_value(adaptor, location_id):
	    """Return the first qualifier value stored for a location, or "" (PRIVATE)."""
	    matches = adaptor.execute_and_fetch_col0(
	        "SELECT value FROM location_qualifier_value WHERE location_id = %s",
	        (location_id,),
	    )
	    try:
	        first = matches[0]
	    except IndexError:
	        # Nothing recorded for this location.
	        return ""
	    return first


	def _retrieve_annotations(adaptor, primary_id, taxon_id):
	    """Collect the annotations dictionary for a bioentry (PRIVATE).

	    Merges, in this order, the results for alphabet, qualifier values,
	    references, taxon and comments; a later part overwrites an earlier
	    one on a key clash.
	    """
	    parts = (
	        _retrieve_alphabet(adaptor, primary_id),
	        _retrieve_qualifier_value(adaptor, primary_id),
	        _retrieve_reference(adaptor, primary_id),
	        _retrieve_taxon(adaptor, primary_id, taxon_id),
	        _retrieve_comment(adaptor, primary_id),
	    )
	    merged = {}
	    for part in parts:
	        merged.update(part)
	    return merged


	def _retrieve_alphabet(adaptor, primary_id):
	    """Return the molecule type annotation for a bioentry (PRIVATE).

	    Returns ``{"molecule_type": ...}`` with "DNA", "RNA" or "protein" when
	    the stored alphabet is "dna", "rna" or "protein", and an empty dict
	    for any other alphabet or when there is no biosequence row at all.

	    Raises ValueError if more than one row is returned, or if the row does
	    not hold exactly one column.
	    """
	    results = adaptor.execute_and_fetchall(
	        "SELECT alphabet FROM biosequence WHERE bioentry_id = %s", (primary_id,)
	    )
	    if not results:
	        # The sequence retrieval helpers in this module accept a bioentry
	        # without a biosequence row; do the same here rather than failing
	        # the whole annotations lookup.
	        return {}
	    if len(results) != 1:
	        raise ValueError(f"Expected 1 response, got {len(results)}.")
	    alphabets = results[0]
	    if len(alphabets) != 1:
	        raise ValueError(f"Expected 1 alphabet in response, got {len(alphabets)}.")
	    molecule_type = {"dna": "DNA", "rna": "RNA", "protein": "protein"}.get(
	        alphabets[0]
	    )
	    if molecule_type is None:
	        return {}
	    return {"molecule_type": molecule_type}


	def _retrieve_qualifier_value(adaptor, primary_id):
	    """Return the bioentry qualifier values grouped by name (PRIVATE).

	    Each name maps to the list of its values in rank order. Three stored
	    names are renamed on the way out: "keyword" to "keywords",
	    "date_changed" to "date" and "secondary_accession" to "accessions".
	    """
	    rows = adaptor.execute_and_fetchall(
	        "SELECT name, value"
	        " FROM bioentry_qualifier_value JOIN term USING (term_id)"
	        " WHERE bioentry_id = %s"
	        ' ORDER BY "rank"',
	        (primary_id,),
	    )
	    renamed = {
	        "keyword": "keywords",
	        # See handling of "date" in Loader.py
	        "date_changed": "date",
	        "secondary_accession": "accessions",
	    }
	    grouped = {}
	    for stored_name, value in rows:
	        key = renamed.get(stored_name, stored_name)
	        if key in grouped:
	            grouped[key].append(value)
	        else:
	            grouped[key] = [value]
	    return grouped


	def _retrieve_reference(adaptor, primary_id):
	    """Return the literature references of a bioentry (PRIVATE).

	    Gives ``{"references": [...]}`` holding one SeqFeature.Reference per
	    row in rank order, or an empty dict when there are no rows.
	    """
	    # XXX dbxref_qualifier_value

	    rows = adaptor.execute_and_fetchall(
	        "SELECT start_pos, end_pos, "
	        " location, title, authors,"
	        " dbname, accession"
	        " FROM bioentry_reference"
	        " JOIN reference USING (reference_id)"
	        " LEFT JOIN dbxref USING (dbxref_id)"
	        " WHERE bioentry_id = %s"
	        ' ORDER BY "rank"',
	        (primary_id,),
	    )
	    collected = []
	    for start, end, location, title, authors, dbname, accession in rows:
	        entry = SeqFeature.Reference()
	        # If the start/end are missing, reference.location is an empty list
	        if not (start is None and end is None):
	            # python counting
	            begin = start if start is None else start - 1
	            entry.location = [SeqFeature.SimpleLocation(begin, end)]
	        # Don't replace the default "" with None.
	        if authors:
	            entry.authors = authors
	        if title:
	            entry.title = title
	        entry.journal = location
	        if dbname == "PUBMED":
	            entry.pubmed_id = accession
	        elif dbname == "MEDLINE":
	            entry.medline_id = accession
	        collected.append(entry)
	    return {"references": collected} if collected else {}


	def _retrieve_taxon(adaptor, primary_id, taxon_id):
	    """Return the taxon-derived annotations for a bioentry (PRIVATE).

	    The result may hold "source" (first GenBank common name), "organism"
	    (first scientific name), "ncbi_taxid" and "taxonomy" (scientific names
	    from the most distant ancestor down to this taxon). Keys with nothing
	    to report are left out. ``primary_id`` is not used.
	    """
	    found = {}

	    common = adaptor.execute_and_fetch_col0(
	        "SELECT name FROM taxon_name WHERE taxon_id = %s"
	        " AND name_class = 'genbank common name'",
	        (taxon_id,),
	    )
	    if common:
	        found["source"] = common[0]

	    scientific = adaptor.execute_and_fetch_col0(
	        "SELECT name FROM taxon_name WHERE taxon_id = %s"
	        " AND name_class = 'scientific name'",
	        (taxon_id,),
	    )
	    if scientific:
	        found["organism"] = scientific[0]

	    taxids = adaptor.execute_and_fetch_col0(
	        "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)
	    )
	    if taxids and taxids[0] and taxids[0] != "0":
	        found["ncbi_taxid"] = taxids[0]

	    # Old code used the left/right values in the taxon table to get the
	    # taxonomy lineage in one SQL command. This was actually very slow,
	    # and would fail if the (optional) left/right values were missing.
	    #
	    # The following code is based on a contribution from Eric Gibert, and
	    # relies on the taxon table's parent_taxon_id field only (ignoring the
	    # optional left/right values). This means that it has to make a
	    # separate SQL query for each entry in the lineage, but it does still
	    # appear to be much faster. See Bug 2494.
	    child_first = []
	    current = taxon_id
	    while current:
	        name, _rank, parent = adaptor.execute_one(
	            "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
	            " FROM taxon, taxon_name"
	            " WHERE taxon.taxon_id=taxon_name.taxon_id"
	            " AND taxon_name.name_class='scientific name'"
	            " AND taxon.taxon_id = %s",
	            (current,),
	        )
	        if current == parent:
	            # If the taxon table has been populated by the BioSQL script
	            # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
	            # Personally, I would have used a NULL parent_taxon_id here.
	            break
	        child_first.append(name)
	        current = parent

	    if child_first:
	        # Names were gathered walking upwards; report them top-down.
	        child_first.reverse()
	        found["taxonomy"] = child_first
	    return found


	def _retrieve_comment(adaptor, primary_id):
	    """Return the comments of a bioentry as ``{"comment": [...]}`` (PRIVATE).

	    The list follows rank order; with no comments an empty dict is returned.
	    """
	    rows = adaptor.execute_and_fetchall(
	        'SELECT comment_text FROM comment WHERE bioentry_id=%s ORDER BY "rank"',
	        (primary_id,),
	    )
	    texts = []
	    for row in rows:
	        texts.append(row[0])
	    # Don't want to add an empty list...
	    return {"comment": texts} if texts else {}


	class DBSeqRecord(SeqRecord):
	    """BioSQL equivalent of the Biopython SeqRecord object.

	    The bioentry row is read when the object is created; ``seq``,
	    ``dbxrefs``, ``features`` and ``annotations`` are each fetched from
	    the database on first access and then cached on the instance.
	    """

	    def __init__(self, adaptor, primary_id):
	        """Create a DBSeqRecord object.

	        Arguments:
	         - adaptor - A BioSQL.BioSeqDatabase.Adaptor object
	         - primary_id - An internal integer ID used by BioSQL

	        You wouldn't normally create a DBSeqRecord object yourself,
	        this is done for you when using a BioSeqDatabase object
	        """
	        self._adaptor = adaptor
	        self._primary_id = primary_id

	        (
	            self._biodatabase_id,
	            self._taxon_id,
	            self.name,
	            accession,
	            version,
	            self._identifier,
	            self._division,
	            self.description,
	        ) = self._adaptor.execute_one(
	            "SELECT biodatabase_id, taxon_id, name, accession, version,"
	            " identifier, division, description"
	            " FROM bioentry"
	            " WHERE bioentry_id = %s",
	            (self._primary_id,),
	        )
	        if version and version != "0":
	            self.id = f"{accession}.{version}"
	        else:
	            self.id = accession
	        # We don't yet record any per-letter-annotations in the
	        # BioSQL database, but we should set this property up
	        # for completeness (and the __str__ method).
	        # We do NOT want to load the sequence from the DB here!
	        length = _retrieve_seq_len(adaptor, primary_id)
	        if length is None:
	            # _retrieve_seq_len returns None when the entry has no
	            # biosequence row. Use 0 instead of handing None on;
	            # _RestrictedDict presumably needs an integer length.
	            length = 0
	        self._per_letter_annotations = _RestrictedDict(length=length)

	    def __get_seq(self):
	        # Fetch once, on first access.
	        if not hasattr(self, "_seq"):
	            self._seq = _retrieve_seq(self._adaptor, self._primary_id)
	        return self._seq

	    def __set_seq(self, seq):
	        # TODO - Check consistent with self._per_letter_annotations
	        self._seq = seq

	    def __del_seq(self):
	        # Drops the cached value; the next read queries the database again.
	        del self._seq

	    seq = property(__get_seq, __set_seq, __del_seq, "Seq object")

	    def __get_dbxrefs(self):
	        # Fetch once, on first access.
	        if not hasattr(self, "_dbxrefs"):
	            self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
	        return self._dbxrefs

	    def __set_dbxrefs(self, dbxrefs):
	        self._dbxrefs = dbxrefs

	    def __del_dbxrefs(self):
	        del self._dbxrefs

	    dbxrefs = property(
	        __get_dbxrefs, __set_dbxrefs, __del_dbxrefs, "Database cross references"
	    )

	    def __get_features(self):
	        # Fetch once, on first access.
	        if not hasattr(self, "_features"):
	            self._features = _retrieve_features(self._adaptor, self._primary_id)
	        return self._features

	    def __set_features(self, features):
	        self._features = features

	    def __del_features(self):
	        del self._features

	    features = property(__get_features, __set_features, __del_features, "Features")

	    def __get_annotations(self):
	        # Fetch once, on first access, then add the two values that were
	        # already read from the bioentry row in __init__.
	        if not hasattr(self, "_annotations"):
	            self._annotations = _retrieve_annotations(
	                self._adaptor, self._primary_id, self._taxon_id
	            )
	            if self._identifier:
	                self._annotations["gi"] = self._identifier
	            if self._division:
	                self._annotations["data_file_division"] = self._division
	        return self._annotations

	    def __set_annotations(self, annotations):
	        self._annotations = annotations

	    def __del_annotations(self):
	        del self._annotations

	    annotations = property(
	        __get_annotations, __set_annotations, __del_annotations, "Annotations"
	    )