Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /ExPASy /Prosite.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 2 years ago

raw

history blame contribute delete

11.5 kB

	# Copyright 1999 by Jeffrey Chang. All rights reserved.
	# Copyright 2000 by Jeffrey Chang. All rights reserved.
	# Revisions Copyright 2007 by Peter Cock. All rights reserved.
	# Revisions Copyright 2009 by Michiel de Hoon. All rights reserved.
	# This code is part of the Biopython distribution and governed by its
	# license. Please see the LICENSE file that should have been included
	# as part of this package.
	"""Parser for the prosite dat file from Prosite at ExPASy.

	See https://www.expasy.org/prosite/

	Tested with:
	- Release 20.43, 10-Feb-2009
	- Release 2017_03 of 15-Mar-2017.

	Functions:
	- read Reads a Prosite file containing one Prosite record
	- parse Iterates over records in a Prosite file.

	Classes:
	- Record Holds Prosite data.

	"""


	def parse(handle):
	    """Iterate over the records of a Prosite file.

	    Use this function for Prosite files that hold several records.
	    Records are read from the handle one at a time and yielded in file
	    order; iteration stops as soon as the reader returns no record.

	    Arguments:
	     - handle - handle to the file.

	    """
	    current = __read(handle)
	    while current:
	        yield current
	        current = __read(handle)


	def read(handle):
	    """Read the single Prosite record stored in a file.

	    Use this function for Prosite files that hold exactly one record.
	    After the first record has been parsed the rest of the handle is
	    consumed; a ValueError is raised if anything is left over.

	    Arguments:
	     - handle - handle to the file.

	    """
	    record = __read(handle)
	    # A single-record file must be exhausted once its record is parsed.
	    if handle.read():
	        raise ValueError("More than one Prosite record found")
	    return record


	class Record:
	    """Holds information from a Prosite record.

	    Main attributes:
	     - name           ID of the record.  e.g. ADH_ZINC
	     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
	     - accession      e.g. PS00387
	     - created        Date the entry was created.  (MMM-YYYY for releases
	       before January 2017, DD-MMM-YYYY since January 2017)
	     - data_update    Date the 'primary' data was last updated.
	     - info_update    Date data other than 'primary' data was last updated.
	     - pdoc           ID of the PROSITE DOCumentation.
	     - description    Free-format description.
	     - pattern        The PROSITE pattern.  See docs.
	     - matrix         List of strings that describes a matrix entry.
	     - rules          List of rule definitions (from RU lines).  (strings)
	     - prorules       List of prorules (from PR lines).  (strings)
	     - postprocessing List of entries taken from PP lines.  (strings)

	    NUMERICAL RESULTS:
	     - nr_sp_release  SwissProt release.
	     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
	     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
	     - nr_positive    True positives.  tuple of (hits, seqs)
	     - nr_unknown     Could be positives.  tuple of (hits, seqs)
	     - nr_false_pos   False positives.  tuple of (hits, seqs)
	     - nr_false_neg   False negatives.  (int)
	     - nr_partial     False negatives, because they are fragments.  (int)

	    COMMENTS:
	     - cc_taxo_range  Taxonomic range.  See docs for format
	     - cc_max_repeat  Maximum number of repetitions in a protein
	     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
	     - cc_skip_flag   Can this entry be ignored?
	     - cc_matrix_type
	     - cc_scaling_db
	     - cc_author
	     - cc_ft_key
	     - cc_ft_desc
	     - cc_version     version number (introduced in release 19.0)

	    All ``cc_*`` attributes other than cc_site are strings and default to
	    the empty string when the record has no matching qualifier.

	    DATA BANK REFERENCES:

	    The dr_* attributes are all lists of tuples
	    (swiss-prot accession, swiss-prot name).

	     - dr_positive
	     - dr_false_neg
	     - dr_false_pos
	     - dr_potential   Potential hits, but fingerprint region not yet available.
	     - dr_unknown     Could possibly belong
	     - pdb_structs    List of PDB entries.  (strings)

	    """

	    def __init__(self):
	        """Initialize the class with empty values for every attribute."""
	        self.name = ""
	        self.type = ""
	        self.accession = ""
	        self.created = ""
	        self.data_update = ""
	        self.info_update = ""
	        self.pdoc = ""

	        self.description = ""
	        self.pattern = ""
	        self.matrix = []
	        self.rules = []
	        self.prorules = []
	        self.postprocessing = []

	        self.nr_sp_release = ""
	        self.nr_sp_seqs = ""
	        self.nr_total = (None, None)
	        self.nr_positive = (None, None)
	        self.nr_unknown = (None, None)
	        self.nr_false_pos = (None, None)
	        self.nr_false_neg = None
	        self.nr_partial = None

	        self.cc_taxo_range = ""
	        self.cc_max_repeat = ""
	        self.cc_site = []
	        self.cc_skip_flag = ""
	        # These are documented attributes too; give them defaults so that
	        # reading them on a record without the matching CC qualifier does
	        # not raise AttributeError.
	        self.cc_matrix_type = ""
	        self.cc_scaling_db = ""
	        self.cc_author = ""
	        self.cc_ft_key = ""
	        self.cc_ft_desc = ""
	        self.cc_version = ""

	        self.dr_positive = []
	        self.dr_false_neg = []
	        self.dr_false_pos = []
	        self.dr_potential = []
	        self.dr_unknown = []

	        self.pdb_structs = []


	# Everything below this line is private to the module.


	def _parse_dt_line(record, value, line):
	    """Store the three dates of a DT line on the record."""
	    # e.g. from January 2017,
	    # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
	    # Older files had brackets round the date descriptions and used MMM-YYYY
	    # Each entry: (attribute, accepted label suffixes, words in the label).
	    layout = (
	        ("created", (" (CREATED)", " CREATED"), 1),
	        ("data_update", (" (DATA UPDATE)", " DATA UPDATE"), 2),
	        ("info_update", (" (INFO UPDATE)", " INFO UPDATE"), 2),
	    )
	    dates = value.rstrip(".").split("; ")
	    if len(dates) < len(layout):
	        # Report a short line the same way as a mislabelled one instead of
	        # letting an IndexError escape.
	        raise ValueError(f"I don't understand date line\n{line}")
	    for text, (attribute, suffixes, label_words) in zip(dates, layout):
	        if not text.endswith(suffixes):
	            raise ValueError(f"I don't understand date line\n{line}")
	        # Drop the trailing label words, keeping only the date itself.
	        setattr(record, attribute, text.rsplit(" ", label_words)[0])


	def _parse_nr_line(record, value, line):
	    """Store the numerical results of an NR line on the record."""
	    import re

	    # Qualifiers whose data looks like "hits(seqs)".
	    pair_attributes = {
	        "/TOTAL": "nr_total",
	        "/POSITIVE": "nr_positive",
	        "/UNKNOWN": "nr_unknown",
	        "/FALSE_POS": "nr_false_pos",
	    }
	    for col in value.split(";"):
	        if not col:
	            continue
	        qual, data = (word.lstrip() for word in col.split("="))
	        if qual == "/RELEASE":
	            release, seqs = data.split(",")
	            record.nr_sp_release = release
	            record.nr_sp_seqs = int(seqs)
	        elif qual == "/FALSE_NEG":
	            record.nr_false_neg = int(data)
	        elif qual == "/PARTIAL":
	            record.nr_partial = int(data)
	        elif qual in pair_attributes:
	            match = re.match(r"(\d+)\((\d+)\)", data)
	            if not match:
	                # ValueError, like every other parse failure in this module.
	                raise ValueError(f"Broken data {data} in comment line\n{line!r}")
	            setattr(record, pair_attributes[qual], tuple(map(int, match.groups())))
	        else:
	            raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")


	def _parse_cc_line(record, value, line):
	    """Store the qualifiers of a CC (comment) line on the record."""
	    # Expect CC lines like this:
	    # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
	    # Can (normally) split on ";" and then on "="
	    text_attributes = {
	        "/TAXO-RANGE": "cc_taxo_range",
	        "/MAX-REPEAT": "cc_max_repeat",
	        "/SKIP-FLAG": "cc_skip_flag",
	        "/MATRIX_TYPE": "cc_matrix_type",
	        "/SCALING_DB": "cc_scaling_db",
	        "/AUTHOR": "cc_author",
	        "/FT_KEY": "cc_ft_key",
	        "/FT_DESC": "cc_ft_desc",
	        "/VERSION": "cc_version",
	    }
	    for col in value.split(";"):
	        if not col or col.startswith("Automatic scaling"):
	            # DNAJ_2 in Release 15 has a non-standard comment line:
	            # CC   Automatic scaling using reversed database
	            # Throw it away.
	            continue
	        if "=" not in col:
	            # Missing qualifier, e.g. from Bug 2403, in PS50293 have:
	            # CC   /AUTHOR=K_Hofmann; N_Hulo
	            # Skip the fragment rather than fail.
	            continue
	        # Split on the first "=" only, so a value that itself contains "="
	        # is kept whole instead of failing to unpack.
	        qual, _, data = col.partition("=")
	        qual, data = qual.lstrip(), data.lstrip()
	        if qual == "/SITE":
	            pos, desc = data.split(",")
	            record.cc_site.append((int(pos), desc))
	        elif qual in text_attributes:
	            setattr(record, text_attributes[qual], data)
	        else:
	            raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")


	def _parse_dr_line(record, value):
	    """Append the (accession, name) pairs of a DR line to the record."""
	    targets = {
	        "T": record.dr_positive,
	        "F": record.dr_false_pos,
	        "N": record.dr_false_neg,
	        "P": record.dr_potential,
	        "?": record.dr_unknown,
	    }
	    for ref in value.split(";"):
	        if not ref:
	            continue
	        acc, name, flag = (word.strip() for word in ref.split(","))
	        if flag not in targets:
	            raise ValueError(f"I don't understand type flag {flag}")
	        targets[flag].append((acc, name))


	def __read(handle):
	    """Read the next Prosite record from the handle.

	    Lines are consumed up to and including the "//" line that ends the
	    record.  A "//" line seen before any ID line, and CC lines seen before
	    any ID line, are treated as the file header and skipped.

	    Arguments:
	     - handle - iterable of text lines (e.g. an open file).

	    Returns a Record, or None if the handle is exhausted before any
	    record starts.

	    Raises ValueError if a line cannot be understood, if a data line
	    appears before any ID line, or if the handle ends in the middle of a
	    record (no terminating "//" line).

	    """
	    # Line types that carry data for the record currently being built.
	    data_keywords = frozenset(
	        ("AC", "DT", "DE", "PA", "MA", "PP", "RU", "NR", "CC", "DR", "3D", "PR", "DO")
	    )
	    record = None
	    for line in handle:
	        keyword, value = line[:2], line[5:].rstrip()
	        if keyword == "ID":
	            record = Record()
	            cols = value.split("; ")
	            if len(cols) != 2:
	                raise ValueError(f"I don't understand identification line\n{line}")
	            record.name = cols[0]
	            record.type = cols[1].rstrip(".")  # don't want '.'
	        elif keyword == "//":
	            if record is None:
	                # Then this was the copyright statement
	                continue
	            break
	        elif record is None and keyword in data_keywords:
	            if keyword == "CC":
	                # Comment lines of the file header; nothing to store.
	                continue
	            # Fail with a parse error instead of an AttributeError on None.
	            raise ValueError(
	                f"Unexpected {keyword} line before the first ID line\n{line}"
	            )
	        elif keyword == "AC":
	            record.accession = value.rstrip(";")
	        elif keyword == "DT":
	            _parse_dt_line(record, value, line)
	        elif keyword == "DE":
	            record.description = value
	        elif keyword == "PA":
	            record.pattern += value
	        elif keyword == "MA":
	            record.matrix.append(value)
	        elif keyword == "PP":
	            record.postprocessing.extend(value.split(";"))
	        elif keyword == "RU":
	            record.rules.append(value)
	        elif keyword == "NR":
	            _parse_nr_line(record, value, line)
	        elif keyword == "CC":
	            _parse_cc_line(record, value, line)
	        elif keyword == "DR":
	            _parse_dr_line(record, value)
	        elif keyword == "3D":
	            record.pdb_structs.extend(
	                pdb_id.rstrip(";") for pdb_id in value.split()
	            )
	        elif keyword == "PR":
	            record.prorules.extend(value.split(";"))
	        elif keyword == "DO":
	            record.pdoc = value.rstrip(";")
	        else:
	            raise ValueError(f"Unknown keyword {keyword} found")
	    else:
	        # The handle ran out without a terminating "//" line.
	        if record is not None:
	            # A record was started but never finished: report it rather
	            # than silently discarding the partial record.
	            raise ValueError("Unexpected end of stream.")
	        return None
	    return record