Spaces:
No application file
No application file
| # Copyright 2010 by Andrea Pierleoni | |
| # Revisions copyright 2010, 2016 by Peter Cock | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.SeqIO support for the "uniprot-xml" file format. | |
| See Also: | |
| http://www.uniprot.org | |
| The UniProt XML format essentially replaces the old plain text file format | |
| originally introduced by SwissProt ("swiss" format in Bio.SeqIO). | |
| """ | |
| from xml.etree import ElementTree | |
| from xml.parsers.expat import errors | |
| from Bio import SeqFeature | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| NS = "{http://uniprot.org/uniprot}" | |
| REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" | |
| def UniprotIterator(source, alphabet=None, return_raw_comments=False): | |
| """Iterate over UniProt XML as SeqRecord objects. | |
| parses an XML entry at a time from any UniProt XML file | |
| returns a SeqRecord for each iteration | |
| This generator can be used in Bio.SeqIO | |
| Argument source is a file-like object or a path to a file. | |
| Optional argument alphabet should not be used anymore. | |
| return_raw_comments = True --> comment fields are returned as complete XML to allow further processing | |
| skip_parsing_errors = True --> if parsing errors are found, skip to next entry | |
| """ | |
| if alphabet is not None: | |
| raise ValueError("The alphabet argument is no longer supported") | |
| try: | |
| for event, elem in ElementTree.iterparse(source, events=("start", "end")): | |
| if event == "end" and elem.tag == NS + "entry": | |
| yield Parser(elem, return_raw_comments=return_raw_comments).parse() | |
| elem.clear() | |
| except ElementTree.ParseError as exception: | |
| if errors.messages[exception.code] == errors.XML_ERROR_NO_ELEMENTS: | |
| assert exception.position == (1, 0) # line 1, column 0 | |
| raise ValueError("Empty file.") from None | |
| else: | |
| raise | |
class Parser:
    """Parse a UniProt XML entry to a SeqRecord.

    Optional argument alphabet is no longer used.

    return_raw_comments=True to get back the complete comment field in XML format
    """

    def __init__(self, elem, alphabet=None, return_raw_comments=False):
        """Initialize the class.

        Arguments:
         - elem - the ``entry`` XML element to convert.
         - alphabet - no longer supported; must be None.
         - return_raw_comments - if True, the raw XML of every comment is
           stored in the annotations as well.

        Raises ValueError if ``alphabet`` is not None.
        """
        if alphabet is not None:
            raise ValueError("The alphabet argument is no longer supported")
        self.entry = elem
        self.return_raw_comments = return_raw_comments

    def parse(self):
        """Parse the input.

        Walks the children of the entry element in document order, filling in
        ``self.ParsedSeqRecord`` (also returned). The record id is set to the
        first accession found.
        """
        assert self.entry.tag == NS + "entry"

        def append_to_annotations(key, value):
            """Add value to the list annotation ``key``, skipping duplicates."""
            values = self.ParsedSeqRecord.annotations.setdefault(key, [])
            if value not in values:
                values.append(value)

        def _parse_name(element):
            self.ParsedSeqRecord.name = element.text
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)

        def _parse_accession(element):
            # stored as a list to cope with SwissProt plain text parser
            append_to_annotations("accessions", element.text)
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)

        def _parse_protein(element):
            """Parse protein names (PRIVATE)."""
            name_tags = (
                NS + "recommendedName",
                NS + "submittedName",
                NS + "alternativeName",
            )
            descr_set = False
            for protein_element in element:
                if protein_element.tag not in name_tags:
                    # other children (e.g. component, domain) are not parsed
                    continue
                group = protein_element.tag.replace(NS, "")
                # use protein fields for name and description
                for rec_name in protein_element:
                    ann_key = f"{group}_{rec_name.tag.replace(NS, '')}"
                    append_to_annotations(ann_key, rec_name.text)
                    # the first fullName encountered becomes the description
                    if rec_name.tag == NS + "fullName" and not descr_set:
                        self.ParsedSeqRecord.description = rec_name.text
                        descr_set = True

        def _parse_gene(element):
            for genename_element in element:
                if "type" not in genename_element.attrib:
                    continue
                name_type = genename_element.attrib["type"]
                ann_key = f"gene_{genename_element.tag.replace(NS, '')}_{name_type}"
                if name_type == "primary":
                    # the primary name is stored as a plain string, not a list
                    self.ParsedSeqRecord.annotations[ann_key] = genename_element.text
                else:
                    append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            append_to_annotations("geneLocation", element.attrib["type"])

        def _parse_organism(element):
            organism_name = com_name = sci_name = ""
            for organism_element in element:
                if organism_element.tag == NS + "name":
                    if organism_element.text:
                        name_type = organism_element.attrib["type"]
                        if name_type == "scientific":
                            sci_name = organism_element.text
                        elif name_type == "common":
                            com_name = organism_element.text
                        else:
                            # e.g. synonym
                            append_to_annotations(
                                "organism_name", organism_element.text
                            )
                elif organism_element.tag == NS + "dbReference":
                    self.ParsedSeqRecord.dbxrefs.append(
                        organism_element.attrib["type"]
                        + ":"
                        + organism_element.attrib["id"]
                    )
                elif organism_element.tag == NS + "lineage":
                    for taxon_element in organism_element:
                        if taxon_element.tag == NS + "taxon":
                            append_to_annotations("taxonomy", taxon_element.text)
            if sci_name and com_name:
                organism_name = f"{sci_name} ({com_name})"
            elif sci_name:
                organism_name = sci_name
            elif com_name:
                organism_name = com_name
            self.ParsedSeqRecord.annotations["organism"] = organism_name

        def _parse_organismHost(element):
            for organism_element in element:
                if organism_element.tag == NS + "name":
                    append_to_annotations("organism_host", organism_element.text)

        def _parse_keyword(element):
            append_to_annotations("keywords", element.text)

        def _parse_comment(element):
            """Parse comments (PRIVATE).

            Comment fields are very heterogeneous. Each type has its own
            (frequently mutated) schema. To store all the contained data, more
            complex data structures are needed, such as annotated dictionaries.
            This is left to the end user, by optionally setting:

            return_raw_comments=True

            The original XML is returned in the annotation fields.

            Available comment types at december 2009:
             - "allergen"
             - "alternative products"
             - "biotechnology"
             - "biophysicochemical properties"
             - "catalytic activity"
             - "caution"
             - "cofactor"
             - "developmental stage"
             - "disease"
             - "domain"
             - "disruption phenotype"
             - "enzyme regulation"
             - "function"
             - "induction"
             - "miscellaneous"
             - "pathway"
             - "pharmaceutical"
             - "polymorphism"
             - "PTM"
             - "RNA editing"
             - "similarity"
             - "subcellular location"
             - "sequence caution"
             - "subunit"
             - "tissue specificity"
             - "toxic dose"
             - "online information"
             - "mass spectrometry"
             - "interaction"

            """
            # comment types for which only the contained <text> is stored
            simple_comments = [
                "allergen",
                "biotechnology",
                "biophysicochemical properties",
                "catalytic activity",
                "caution",
                "cofactor",
                "developmental stage",
                "disease",
                "domain",
                "disruption phenotype",
                "enzyme regulation",
                "function",
                "induction",
                "miscellaneous",
                "pathway",
                "pharmaceutical",
                "polymorphism",
                "PTM",
                "RNA editing",  # positions not parsed
                "similarity",
                "subunit",
                "tissue specificity",
                "toxic dose",
            ]
            comment_type = element.attrib["type"]
            # annotation keys use the comment type with spaces removed
            type_key = comment_type.replace(" ", "")

            if comment_type in simple_comments:
                ann_key = f"comment_{type_key}"
                for text_element in element.iter(NS + "text"):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif comment_type == "subcellular location":
                for subloc_element in element.iter(NS + "subcellularLocation"):
                    for el in subloc_element:
                        if el.text:
                            ann_key = f"comment_{type_key}_{el.tag.replace(NS, '')}"
                            append_to_annotations(ann_key, el.text)
            elif comment_type == "interaction":
                ann_key = f"comment_{comment_type}_intactId"
                for interact_element in element.iter(NS + "interactant"):
                    append_to_annotations(ann_key, interact_element.attrib["intactId"])
            elif comment_type == "alternative products":
                ann_key = f"comment_{type_key}_isoform"
                for alt_element in element.iter(NS + "isoform"):
                    for id_element in alt_element.iter(NS + "id"):
                        append_to_annotations(ann_key, id_element.text)
            elif comment_type == "mass spectrometry":
                ann_key = f"comment_{type_key}"
                start = end = 0
                for el in element.iter(NS + "location"):
                    pos_els = list(el.iter(NS + "position"))
                    # this try should be avoided, maybe it is safer to skip
                    # position parsing for mass spectrometry
                    try:
                        if pos_els:
                            end = int(pos_els[0].attrib["position"])
                            start = end - 1
                        else:
                            start = int(next(el.iter(NS + "begin")).attrib["position"])
                            start -= 1
                            end = int(next(el.iter(NS + "end")).attrib["position"])
                    except (ValueError, KeyError, StopIteration):
                        # undefined positions or erroneously mapped.
                        # StopIteration (missing begin/end element) is caught
                        # too: left alone it would surface as a RuntimeError
                        # once it crosses the calling generator.
                        pass
                mass = element.attrib["mass"]
                method = element.attrib["method"]
                if start == end == 0:
                    append_to_annotations(ann_key, f"undefined:{mass}|{method}")
                else:
                    append_to_annotations(ann_key, f"{start}..{end}:{mass}|{method}")
            elif comment_type == "sequence caution":
                pass  # not parsed: few information, complex structure
            elif comment_type == "online information":
                ann_key = f"comment_{type_key}"
                for link_element in element.iter(NS + "link"):
                    append_to_annotations(
                        ann_key,
                        f"{element.attrib['name']}@{link_element.attrib['uri']}",
                    )

            # return raw XML comments if needed
            if self.return_raw_comments:
                ann_key = f"comment_{type_key}_xml"
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            db_type = element.attrib["type"]
            db_id = element.attrib["id"]
            self.ParsedSeqRecord.dbxrefs.append(db_type + ":" + db_id)
            # Other <property> children cannot be fitted in a seqrecord object
            # with a simple list. however at least ensembl and EMBL parsing can
            # be improved to add entries in dbxrefs
            if db_type != "PDB":
                return
            # e.g.
            # <dbReference type="PDB" key="11" id="2GEZ">
            #   <property value="X-ray" type="method"/>
            #   <property value="2.60 A" type="resolution"/>
            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
            # </dbReference>
            method = ""
            resolution = ""
            for ref_element in element:
                if ref_element.tag != NS + "property":
                    continue
                dat_type = ref_element.attrib["type"]
                if dat_type == "method":
                    method = ref_element.attrib["value"]
                if dat_type == "resolution":
                    resolution = ref_element.attrib["value"]
                if dat_type == "chains":
                    for chain_spec in ref_element.attrib["value"].split(","):
                        pair = chain_spec.strip().split("=")
                        if pair[1] != "-":
                            # TODO - How best to store these, do SeqFeatures make sense?
                            # NOTE: the feature is built but not attached to
                            # the record (the append below is disabled).
                            feature = SeqFeature.SeqFeature()
                            feature.type = db_type
                            feature.qualifiers["name"] = db_id
                            feature.qualifiers["method"] = method
                            feature.qualifiers["resolution"] = resolution
                            feature.qualifiers["chains"] = pair[0].split("/")
                            # convert the 1-based inclusive range to 0-based
                            start = int(pair[1].split("-")[0]) - 1
                            end = int(pair[1].split("-")[1])
                            feature.location = SeqFeature.SimpleLocation(start, end)
                            # self.ParsedSeqRecord.features.append(feature)

        def _parse_reference(element):
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ""
            pub_type = ""
            pub_date = ""
            # initialised here so they are always bound, even without a citation
            j_volume = j_first = j_last = ""
            for ref_element in element:
                if ref_element.tag == NS + "citation":
                    pub_type = ref_element.attrib["type"]
                    if pub_type == "submission":
                        pub_type += " to the " + ref_element.attrib["db"]
                    if "name" in ref_element.attrib:
                        journal_name = ref_element.attrib["name"]
                    pub_date = ref_element.attrib.get("date", "")
                    j_volume = ref_element.attrib.get("volume", "")
                    j_first = ref_element.attrib.get("first", "")
                    j_last = ref_element.attrib.get("last", "")
                    for cit_element in ref_element:
                        if cit_element.tag == NS + "title":
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + "authorList":
                            for person_element in cit_element:
                                authors.append(person_element.attrib["name"])
                        elif cit_element.tag == NS + "dbReference":
                            xref_type = cit_element.attrib["type"]
                            xref_id = cit_element.attrib["id"]
                            self.ParsedSeqRecord.dbxrefs.append(
                                xref_type + ":" + xref_id
                            )
                            # The database is named by the dbReference itself,
                            # not by the enclosing citation (whose type is the
                            # publication type).
                            if xref_type == "PubMed":
                                reference.pubmed_id = xref_id
                            elif xref_type == "MEDLINE":
                                reference.medline_id = xref_id
                elif ref_element.tag == NS + "scope":
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + "source":
                    for source_element in ref_element:
                        if source_element.tag == NS + "tissue":
                            tissues.append(source_element.text)
            scopes_str = "Scope: " + ", ".join(scopes) if scopes else ""
            tissues_str = "Tissue: " + ", ".join(tissues) if tissues else ""

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ", ".join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % {
                        "name": journal_name,
                        "volume": j_volume,
                        "first": j_first,
                        "last": j_last,
                        "pub_date": pub_date,
                    }
                else:
                    reference.journal = journal_name
            reference.comment = " | ".join(
                (pub_type, pub_date, scopes_str, tissues_str)
            )
            append_to_annotations("references", reference)

        def _parse_position(element, offset=0):
            """Build a position object from a begin/end/position element.

            ``offset`` is added to the integer ``position`` attribute (callers
            pass -1 for start positions). The ``status`` attribute selects the
            position class; an unrecognised status raises NotImplementedError.
            """
            try:
                position = int(element.attrib["position"]) + offset
            except KeyError:
                position = None
            status = element.attrib.get("status", "")
            if status == "unknown":
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == "greater than":
                return SeqFeature.AfterPosition(position)
            elif status == "less than":
                return SeqFeature.BeforePosition(position)
            elif status == "uncertain":
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError(f"Position status {status!r}")

        def _parse_feature(element):
            feature = SeqFeature.SeqFeature()
            # every XML attribute is kept as a qualifier
            for k, v in element.attrib.items():
                feature.qualifiers[k] = v
            feature.type = element.attrib.get("type", "")
            if "id" in element.attrib:
                feature.id = element.attrib["id"]
            for feature_element in element:
                if feature_element.tag == NS + "location":
                    position_elements = feature_element.findall(NS + "position")
                    if position_elements:
                        # single residue: one element gives both ends
                        single = position_elements[0]
                        start_position = _parse_position(single, -1)
                        end_position = _parse_position(single)
                    else:
                        begin = feature_element.findall(NS + "begin")[0]
                        start_position = _parse_position(begin, -1)
                        finish = feature_element.findall(NS + "end")[0]
                        end_position = _parse_position(finish)
                    feature.location = SeqFeature.SimpleLocation(
                        start_position, end_position
                    )
                else:
                    try:
                        feature.qualifiers[
                            feature_element.tag.replace(NS, "")
                        ] = feature_element.text
                    except (AttributeError, TypeError):
                        # skip unparsable tag (e.g. a tag that is not a str)
                        pass
            self.ParsedSeqRecord.features.append(feature)

        def _parse_proteinExistence(element):
            append_to_annotations("proteinExistence", element.attrib["type"])

        def _parse_evidence(element):
            for k, v in element.attrib.items():
                append_to_annotations(k, v)

        def _parse_sequence(element):
            for k, v in element.attrib.items():
                if k in ("length", "mass", "version"):
                    self.ParsedSeqRecord.annotations[f"sequence_{k}"] = int(v)
                else:
                    self.ParsedSeqRecord.annotations[f"sequence_{k}"] = v
            # strip all whitespace (line breaks) from the sequence text
            self.ParsedSeqRecord.seq = Seq("".join(element.text.split()))
            self.ParsedSeqRecord.annotations["molecule_type"] = "protein"

        # ============================================#
        # Initialize SeqRecord
        self.ParsedSeqRecord = SeqRecord("", id="")

        # Entry attribs parsing
        # Unknown dataset should not happen!
        self.dbname = self.entry.attrib.get("dataset", "UnknownDataset")
        # add attribs to annotations
        # To cope with swissProt plain text parser the attribute name is used
        # directly as annotation key; this can cause errors if the attrib has
        # the same name of an other annotation
        for k, v in self.entry.attrib.items():
            if k == "version":
                self.ParsedSeqRecord.annotations[k] = int(v)
            else:
                self.ParsedSeqRecord.annotations[k] = v

        # Top-to-bottom entry children parsing
        handlers = {
            NS + "name": _parse_name,
            NS + "accession": _parse_accession,
            NS + "protein": _parse_protein,
            NS + "gene": _parse_gene,
            NS + "geneLocation": _parse_geneLocation,
            NS + "organism": _parse_organism,
            NS + "organismHost": _parse_organismHost,
            NS + "keyword": _parse_keyword,
            NS + "comment": _parse_comment,
            NS + "dbReference": _parse_dbReference,
            NS + "reference": _parse_reference,
            NS + "feature": _parse_feature,
            NS + "proteinExistence": _parse_proteinExistence,
            NS + "evidence": _parse_evidence,
            NS + "sequence": _parse_sequence,
        }
        for element in self.entry:
            handler = handlers.get(element.tag)
            # children with an unrecognised tag are ignored
            if handler is not None:
                handler(element)

        # remove duplicate dbxrefs
        self.ParsedSeqRecord.dbxrefs = sorted(set(self.ParsedSeqRecord.dbxrefs))

        # use first accession as id
        if not self.ParsedSeqRecord.id:
            self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations["accessions"][0]

        return self.ParsedSeqRecord