Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /UniGene /__init__.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 2 years ago

raw

history blame contribute delete

12.7 kB

	# Copyright 2006 by Sean Davis, National Cancer Institute, NIH.
	# All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Parse Unigene flat file format files such as the Hs.data file.

	Here is an overview of the flat file format that this parser deals with:

	Line types/qualifiers::

	ID UniGene cluster ID
	TITLE Title for the cluster
	GENE Gene symbol
	CYTOBAND Cytological band
	EXPRESS Tissues of origin for ESTs in cluster
	RESTR_EXPR Single tissue or development stage contributes
	more than half the total EST frequency for this gene.
	GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
	T if a non-templated polyA tail is found among
	a cluster's sequences; else
	I if templated As are found in genomic sequence or
	S if a canonical polyA signal is found on
	the genomic sequence
	GENE_ID Entrez gene identifier associated with at least one
	sequence in this cluster;
	to be used instead of LocusLink.
	LOCUSLINK LocusLink identifier associated with at least one
	sequence in this cluster;
	deprecated in favor of GENE_ID
	HOMOL Homology;
	CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
	on the arabidopsis genome.
	STS STS
	ACC= GenBank/EMBL/DDBJ accession number of STS
	[optional field]
	UNISTS= identifier in NCBI's UNISTS database
	TXMAP Transcript map interval
	MARKER= Marker found on at least one sequence in this
	cluster
	RHPANEL= Radiation Hybrid panel used to place marker
	PROTSIM Protein Similarity data for the sequence with
	highest-scoring protein similarity in this cluster
	ORG= Organism
	PROTGI= Sequence GI of protein
	PROTID= Sequence ID of protein
	PCT= Percent alignment
	ALN= length of aligned region (aa)
	SCOUNT Number of sequences in the cluster
	SEQUENCE Sequence
	ACC= GenBank/EMBL/DDBJ accession number of sequence
	NID= Unique nucleotide sequence identifier (gi)
	PID= Unique protein sequence identifier (used for
	non-ESTs)
	CLONE= Clone identifier (used for ESTs only)
	END= End (5'/3') of clone insert read (used for
	ESTs only)
	LID= Library ID; see Hs.lib.info for library name
	and tissue
	MGC= 5' CDS-completeness indicator; if present, the
	clone associated with this sequence is believed
	CDS-complete. A value greater than 511 is the gi
	of the CDS-complete mRNA matched by the EST,
	otherwise the value is an indicator of the
	reliability of the test indicating CDS
	completeness; higher values indicate more
	reliable CDS-completeness predictions.
	SEQTYPE= Description of the nucleotide sequence.
	Possible values are mRNA, EST and HTC.
	TRACE= The Trace ID of the EST sequence, as provided by
	NCBI Trace Archive

	"""


	class SequenceLine:
	"""Store the information for one SEQUENCE line from a Unigene file.

	Initialize with the text part of the SEQUENCE line, or nothing.

	Attributes and descriptions (access as LOWER CASE):
	- ACC= GenBank/EMBL/DDBJ accession number of sequence
	- NID= Unique nucleotide sequence identifier (gi)
	- PID= Unique protein sequence identifier (used for non-ESTs)
	- CLONE= Clone identifier (used for ESTs only)
	- END= End (5'/3') of clone insert read (used for ESTs only)
	- LID= Library ID; see Hs.lib.info for library name and tissue
	- MGC= 5' CDS-completeness indicator; if present,
	the clone associated with this sequence
	is believed CDS-complete. A value greater than 511
	is the gi of the CDS-complete mRNA matched by the EST,
	otherwise the value is an indicator of the reliability
	of the test indicating CDS completeness;
	higher values indicate more reliable CDS-completeness
	predictions.
	- SEQTYPE= Description of the nucleotide sequence. Possible values
	are mRNA, EST and HTC.
	- TRACE= The Trace ID of the EST sequence, as provided by NCBI
	Trace Archive

	"""

	def __init__(self, text=None):
	"""Initialize the class."""
	self.acc = ""
	self.nid = ""
	self.lid = ""
	self.pid = ""
	self.clone = ""
	self.image = ""
	self.is_image = False
	self.end = ""
	self.mgc = ""
	self.seqtype = ""
	self.trace = ""
	if text is not None:
	self.text = text
	self._init_from_text(text)

	def _init_from_text(self, text):
	parts = text.split("; ")
	for part in parts:
	key, val = part.split("=")
	if key == "CLONE":
	if val[:5] == "IMAGE":
	self.is_image = True
	self.image = val[6:]
	setattr(self, key.lower(), val)

	def __repr__(self):
	"""Return UniGene SequenceLine object as a string."""
	return self.text


	class ProtsimLine:
	"""Store the information for one PROTSIM line from a Unigene file.

	Initialize with the text part of the PROTSIM line, or nothing.

	Attributes and descriptions (access as LOWER CASE)
	ORG= Organism
	PROTGI= Sequence GI of protein
	PROTID= Sequence ID of protein
	PCT= Percent alignment
	ALN= length of aligned region (aa)
	"""

	def __init__(self, text=None):
	"""Initialize the class."""
	self.org = ""
	self.protgi = ""
	self.protid = ""
	self.pct = ""
	self.aln = ""
	if text is not None:
	self.text = text
	self._init_from_text(text)

	def _init_from_text(self, text):
	parts = text.split("; ")

	for part in parts:
	key, val = part.split("=")
	setattr(self, key.lower(), val)

	def __repr__(self):
	"""Return UniGene ProtsimLine object as a string."""
	return self.text


	class STSLine:
	"""Store the information for one STS line from a Unigene file.

	Initialize with the text part of the STS line, or nothing.

	Attributes and descriptions (access as LOWER CASE)

	ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
	UNISTS= identifier in NCBI's UNISTS database
	"""

	def __init__(self, text=None):
	"""Initialize the class."""
	self.acc = ""
	self.unists = ""
	if text is not None:
	self.text = text
	self._init_from_text(text)

	def _init_from_text(self, text):
	parts = text.split(" ")

	for part in parts:
	key, val = part.split("=")
	setattr(self, key.lower(), val)

	def __repr__(self):
	"""Return UniGene STSLine object as a string."""
	return self.text


	class Record:
	"""Store a Unigene record.

	Here is what is stored::

	self.ID = '' # ID line
	self.species = '' # Hs, Bt, etc.
	self.title = '' # TITLE line
	self.symbol = '' # GENE line
	self.cytoband = '' # CYTOBAND line
	self.express = [] # EXPRESS line, parsed on ';'
	# Will be an array of strings
	self.restr_expr = '' # RESTR_EXPR line
	self.gnm_terminus = '' # GNM_TERMINUS line
	self.gene_id = '' # GENE_ID line
	self.locuslink = '' # LOCUSLINK line
	self.homol = '' # HOMOL line
	self.chromosome = '' # CHROMOSOME line
	self.protsim = [] # PROTSIM entries, array of Protsims
	# Type ProtsimLine
	self.sequence = [] # SEQUENCE entries, array of Sequence entries
	# Type SequenceLine
	self.sts = [] # STS entries, array of STS entries
	# Type STSLine
	self.txmap = [] # TXMAP entries, array of TXMap entries

	"""

	def __init__(self):
	"""Initialize the class."""
	self.ID = "" # ID line
	self.species = "" # Hs, Bt, etc.
	self.title = "" # TITLE line
	self.symbol = "" # GENE line
	self.cytoband = "" # CYTOBAND line
	self.express = [] # EXPRESS line, parsed on ';'
	self.restr_expr = "" # RESTR_EXPR line
	self.gnm_terminus = "" # GNM_TERMINUS line
	self.gene_id = "" # GENE_ID line
	self.locuslink = "" # LOCUSLINK line
	self.homol = "" # HOMOL line
	self.chromosome = "" # CHROMOSOME line
	self.protsim = [] # PROTSIM entries, array of Protsims
	self.sequence = [] # SEQUENCE entries, array of Sequence entries
	self.sts = [] # STS entries, array of STS entries
	self.txmap = [] # TXMAP entries, array of TXMap entries

	def __repr__(self):
	"""Represent the UniGene Record object as a string for debugging."""
	return f"<{self.__class__.__name__}> {self.ID} {self.symbol} {self.title}"


	def parse(handle):
	"""Read and load a UniGene records, for files containing multiple records."""
	while True:
	record = _read(handle)
	if not record:
	return
	yield record


	def read(handle):
	"""Read and load a UniGene record, one record per file."""
	record = _read(handle)
	if not record:
	raise ValueError("No SwissProt record found")
	# We should have reached the end of the record by now
	remainder = handle.read()
	if remainder:
	raise ValueError("More than one SwissProt record found")
	return record


	# Everything below is private


	def _read(handle):
	UG_INDENT = 12
	record = None
	for line in handle:
	tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
	line = line.rstrip()
	if tag == "ID":
	record = Record()
	record.ID = value
	record.species = record.ID.split(".")[0]
	elif tag == "TITLE":
	record.title = value
	elif tag == "GENE":
	record.symbol = value
	elif tag == "GENE_ID":
	record.gene_id = value
	elif tag == "LOCUSLINK":
	record.locuslink = value
	elif tag == "HOMOL":
	if value == "YES":
	record.homol = True
	elif value == "NO":
	record.homol = True
	else:
	raise ValueError(f"Cannot parse HOMOL line {line}")
	elif tag == "EXPRESS":
	record.express = [word.strip() for word in value.split("\|")]
	elif tag == "RESTR_EXPR":
	record.restr_expr = [word.strip() for word in value.split("\|")]
	elif tag == "CHROMOSOME":
	record.chromosome = value
	elif tag == "CYTOBAND":
	record.cytoband = value
	elif tag == "PROTSIM":
	protsim = ProtsimLine(value)
	record.protsim.append(protsim)
	elif tag == "SCOUNT":
	scount = int(value)
	elif tag == "SEQUENCE":
	sequence = SequenceLine(value)
	record.sequence.append(sequence)
	elif tag == "STS":
	sts = STSLine(value)
	record.sts.append(sts)
	elif tag == "//":
	if len(record.sequence) != scount:
	raise ValueError(
	"The number of sequences specified in the record "
	"(%d) does not agree with the number of sequences found (%d)"
	% (scount, len(record.sequence))
	)
	return record
	else:
	raise ValueError(f"Unknown tag {tag}")
	if record:
	raise ValueError("Unexpected end of stream.")