Spaces:
No application file
No application file
| # Copyright 1999 by Jeffrey Chang. All rights reserved. | |
| # Copyright 2000 by Jeffrey Chang. All rights reserved. | |
| # Revisions Copyright 2007 by Peter Cock. All rights reserved. | |
| # Revisions Copyright 2009 by Michiel de Hoon. All rights reserved. | |
| # This code is part of the Biopython distribution and governed by its | |
| # license. Please see the LICENSE file that should have been included | |
| # as part of this package. | |
| """Parser for the prosite dat file from Prosite at ExPASy. | |
| See https://www.expasy.org/prosite/ | |
| Tested with: | |
| - Release 20.43, 10-Feb-2009 | |
| - Release 2017_03 of 15-Mar-2017. | |
| Functions: | |
| - read Reads a Prosite file containing one Prosite record | |
| - parse Iterates over records in a Prosite file. | |
| Classes: | |
| - Record Holds Prosite data. | |
| """ | |
def parse(handle):
    """Iterate over the records of a Prosite file.

    Use this function for Prosite files that may hold any number of
    records; each record is yielded as soon as it has been read.

    Arguments:
     - handle - handle to the file.

    """
    # Prime the loop with the first record, then keep pulling records until
    # the low-level reader reports that nothing is left.
    current = __read(handle)
    while current:
        yield current
        current = __read(handle)
def read(handle):
    """Return the single Prosite record stored in a file.

    Use this function for Prosite files that hold exactly one record.
    A ValueError is raised if any data is left in the handle after the
    first record has been read.

    Arguments:
     - handle - handle to the file.

    """
    record = __read(handle)
    # Anything still unread after the first record means the file holds
    # more than the single record this function promises to return.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
class Record:
    """Holds information from a Prosite record.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines). (strings)
     - postprocessing List of entries from PP lines, split on ';'. (strings)

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    All cc_* attributes other than cc_site are strings and default to the
    empty string when the record carries no such qualifier.

    The following are all lists of tuples (swiss-prot accession, swiss-prot name).

    DATA BANK REFERENCES:
     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Initialize the class."""
        self.name = ""
        self.type = ""
        self.accession = ""
        self.created = ""
        self.data_update = ""
        self.info_update = ""
        self.pdoc = ""

        self.description = ""
        self.pattern = ""
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ""
        self.nr_sp_seqs = ""
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ""
        self.cc_max_repeat = ""
        self.cc_site = []
        self.cc_skip_flag = ""
        # These qualifiers are optional in a record; give them defaults so
        # that every documented attribute can be read on every instance.
        self.cc_matrix_type = ""
        self.cc_scaling_db = ""
        self.cc_author = ""
        self.cc_ft_key = ""
        self.cc_ft_desc = ""
        self.cc_version = ""

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
| # Everything below are private functions | |
# Date descriptions expected on a DT line, in order, paired with the Record
# attribute that receives the corresponding date.
_DT_FIELDS = (
    ("created", "CREATED"),
    ("data_update", "DATA UPDATE"),
    ("info_update", "INFO UPDATE"),
)

# NR qualifiers whose value looks like "hits(seqs)" -> Record attribute.
_NR_HIT_ATTRIBUTES = {
    "/TOTAL": "nr_total",
    "/POSITIVE": "nr_positive",
    "/UNKNOWN": "nr_unknown",
    "/FALSE_POS": "nr_false_pos",
}

# CC qualifiers whose value is stored verbatim -> Record attribute.
_CC_TEXT_ATTRIBUTES = {
    "/TAXO-RANGE": "cc_taxo_range",
    "/MAX-REPEAT": "cc_max_repeat",
    "/SKIP-FLAG": "cc_skip_flag",
    "/MATRIX_TYPE": "cc_matrix_type",
    "/SCALING_DB": "cc_scaling_db",
    "/AUTHOR": "cc_author",
    "/FT_KEY": "cc_ft_key",
    "/FT_DESC": "cc_ft_desc",
    "/VERSION": "cc_version",
}

# DR type flag -> Record list that collects references carrying that flag.
_DR_FLAG_ATTRIBUTES = {
    "T": "dr_positive",
    "F": "dr_false_pos",
    "N": "dr_false_neg",
    "P": "dr_potential",
    "?": "dr_unknown",
}


def __read(handle):
    """Read the next Prosite record from the handle.

    Lines are consumed up to and including the "//" line that terminates
    the record.  "CC" lines found before the first "ID" line, and the "//"
    line that follows them, are treated as the file header and skipped.

    Arguments:
     - handle - iterable yielding the lines of the file.

    Returns the parsed Record, or None if the handle is exhausted before
    any "ID" line is found.

    Raises ValueError if a line cannot be understood, if a data line appears
    before the "ID" line, or if the handle ends in the middle of a record.

    """
    import re

    # Matches NR values such as "12(10)", i.e. hits followed by seqs.
    hits_pattern = re.compile(r"(\d+)\((\d+)\)")

    record = None
    for line in handle:
        # The two-letter line code sits in columns 1-2, the data starts in
        # column 6.
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == "ID":
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError(f"I don't understand identification line\n{line}")
            record = Record()
            record.name = cols[0]
            record.type = cols[1].rstrip(".")  # don't want '.'
        elif keyword == "//":
            if record is None:
                # Then this was the copyright statement
                continue
            return record
        elif record is None:
            # Only the free-text CC header may precede the first ID line;
            # anything else has no record to be stored in.
            if keyword != "CC":
                raise ValueError(f"Found {keyword} line before the ID line\n{line}")
        elif keyword == "AC":
            record.accession = value.rstrip(";")
        elif keyword == "DT":
            # e.g. from January 2017,
            # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
            # Older files had brackets round the date descriptions and used MMM-YYYY
            dates = value.rstrip(".").split("; ")
            if len(dates) < len(_DT_FIELDS):
                raise ValueError(f"I don't understand date line\n{line}")
            for (attribute, label), field in zip(_DT_FIELDS, dates):
                if not field.endswith((f" ({label})", f" {label}")):
                    raise ValueError(f"I don't understand date line\n{line}")
                # Strip the description, which is as many words long as
                # the label, leaving only the date.
                setattr(record, attribute, field.rsplit(" ", len(label.split()))[0])
        elif keyword == "DE":
            record.description = value
        elif keyword == "PA":
            # A pattern may be continued over several PA lines.
            record.pattern += value
        elif keyword == "MA":
            record.matrix.append(value)
        elif keyword == "PP":
            record.postprocessing.extend(value.split(";"))
        elif keyword == "RU":
            record.rules.append(value)
        elif keyword == "NR":
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = (word.lstrip() for word in col.split("="))
                if qual == "/RELEASE":
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == "/FALSE_NEG":
                    record.nr_false_neg = int(data)
                elif qual == "/PARTIAL":
                    record.nr_partial = int(data)
                elif qual in _NR_HIT_ATTRIBUTES:
                    m = hits_pattern.match(data)
                    if not m:
                        raise ValueError(f"Broken data {data} in comment line\n{line!r}")
                    hits = tuple(map(int, m.groups()))
                    setattr(record, _NR_HIT_ATTRIBUTES[qual], hits)
                else:
                    raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")
        elif keyword == "CC":
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col.startswith("Automatic scaling"):
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if "=" not in col:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = (word.lstrip() for word in col.split("="))
                if qual == "/SITE":
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in _CC_TEXT_ATTRIBUTES:
                    setattr(record, _CC_TEXT_ATTRIBUTES[qual], data)
                else:
                    raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")
        elif keyword == "DR":
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = (word.strip() for word in ref.split(","))
                if flag not in _DR_FLAG_ATTRIBUTES:
                    raise ValueError(f"I don't understand type flag {flag}")
                getattr(record, _DR_FLAG_ATTRIBUTES[flag]).append((acc, name))
        elif keyword == "3D":
            record.pdb_structs.extend(pdb_id.rstrip(";") for pdb_id in value.split())
        elif keyword == "PR":
            record.prorules.extend(value.split(";"))
        elif keyword == "DO":
            record.pdoc = value.rstrip(";")
        else:
            raise ValueError(f"Unknown keyword {keyword} found")

    # The handle ran dry without a terminating "//".  That is fine if no
    # record had been started (end of file), but a started record is
    # truncated and must not be dropped silently.
    if record is not None:
        raise ValueError("Unexpected end of stream.")
    return None