Spaces:
No application file
No application file
| # Copyright 2006 by Sean Davis, National Cancer Institute, NIH. | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Parse Unigene flat file format files such as the Hs.data file. | |
| Here is an overview of the flat file format that this parser deals with: | |
| Line types/qualifiers:: | |
| ID UniGene cluster ID | |
| TITLE Title for the cluster | |
| GENE Gene symbol | |
| CYTOBAND Cytological band | |
| EXPRESS Tissues of origin for ESTs in cluster | |
| RESTR_EXPR Single tissue or development stage contributes | |
| more than half the total EST frequency for this gene. | |
| GNM_TERMINUS genomic confirmation of presence of a 3' terminus; | |
| T if a non-templated polyA tail is found among | |
| a cluster's sequences; else | |
| I if templated As are found in genomic sequence or | |
| S if a canonical polyA signal is found on | |
| the genomic sequence | |
| GENE_ID Entrez gene identifier associated with at least one | |
| sequence in this cluster; | |
| to be used instead of LocusLink. | |
| LOCUSLINK LocusLink identifier associated with at least one | |
| sequence in this cluster; | |
| deprecated in favor of GENE_ID | |
| HOMOL Homology; | |
| CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping | |
| on the arabidopsis genome. | |
| STS STS | |
| ACC= GenBank/EMBL/DDBJ accession number of STS | |
| [optional field] | |
| UNISTS= identifier in NCBI's UNISTS database | |
| TXMAP Transcript map interval | |
| MARKER= Marker found on at least one sequence in this | |
| cluster | |
| RHPANEL= Radiation Hybrid panel used to place marker | |
| PROTSIM Protein Similarity data for the sequence with | |
| highest-scoring protein similarity in this cluster | |
| ORG= Organism | |
| PROTGI= Sequence GI of protein | |
| PROTID= Sequence ID of protein | |
| PCT= Percent alignment | |
| ALN= length of aligned region (aa) | |
| SCOUNT Number of sequences in the cluster | |
| SEQUENCE Sequence | |
| ACC= GenBank/EMBL/DDBJ accession number of sequence | |
| NID= Unique nucleotide sequence identifier (gi) | |
| PID= Unique protein sequence identifier (used for | |
| non-ESTs) | |
| CLONE= Clone identifier (used for ESTs only) | |
| END= End (5'/3') of clone insert read (used for | |
| ESTs only) | |
| LID= Library ID; see Hs.lib.info for library name | |
| and tissue | |
| MGC= 5' CDS-completeness indicator; if present, the | |
| clone associated with this sequence is believed | |
| CDS-complete. A value greater than 511 is the gi | |
| of the CDS-complete mRNA matched by the EST, | |
| otherwise the value is an indicator of the | |
| reliability of the test indicating CDS | |
| completeness; higher values indicate more | |
| reliable CDS-completeness predictions. | |
| SEQTYPE= Description of the nucleotide sequence. | |
| Possible values are mRNA, EST and HTC. | |
| TRACE= The Trace ID of the EST sequence, as provided by | |
| NCBI Trace Archive | |
| """ | |
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):
     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
                    the clone associated with this sequence
                    is believed CDS-complete. A value greater than 511
                    is the gi of the CDS-complete mRNA matched by the EST,
                    otherwise the value is an indicator of the reliability
                    of the test indicating CDS completeness;
                    higher values indicate more reliable CDS-completeness
                    predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
                    are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                    Trace Archive

    Additional attributes set by this class:
     - text         the raw text the object was built from ("" if none)
     - is_image     True when the CLONE value starts with "IMAGE"
     - image        the CLONE value with its first six characters removed
                    (only set when ``is_image`` is True)
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the text part of a SEQUENCE line (``KEY=value`` pairs
           separated by "; "), or None for an empty object.

        Raises ValueError if a part of ``text`` contains no "=".
        """
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ""
        self.acc = ""
        self.nid = ""
        self.lid = ""
        self.pid = ""
        self.clone = ""
        self.image = ""
        self.is_image = False
        self.end = ""
        self.mgc = ""
        self.seqtype = ""
        self.trace = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` pair in text."""
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            if key == "CLONE" and val.startswith("IMAGE"):
                self.is_image = True
                # Drop the "IMAGE" prefix plus the one character after it.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene SequenceLine object as a string."""
        return self.text
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The raw text the object was built from is kept in ``text`` ("" if none).
    All values are stored as strings, exactly as they appear in the line.
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the text part of a PROTSIM line (``KEY=value`` pairs
           separated by "; "), or None for an empty object.

        Raises ValueError if a part of ``text`` contains no "=".
        """
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ""
        self.org = ""
        self.protgi = ""
        self.protid = ""
        self.pct = ""
        self.aln = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` pair in text."""
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene ProtsimLine object as a string."""
        return self.text
class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The raw text the object was built from is kept in ``text`` ("" if none).
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the text part of an STS line (whitespace-separated
           ``KEY=value`` pairs), or None for an empty object.

        Raises ValueError if a part of ``text`` contains no "=".
        """
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ""
        self.acc = ""
        self.unists = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` pair in text."""
        # split() with no argument tolerates runs of whitespace between the
        # pairs, which split(" ") would turn into empty, unparsable parts.
        for part in text.split():
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene STSLine object as a string."""
        return self.text
class Record:
    """Hold the contents of a single UniGene cluster record.

    A freshly created record is empty; the following attributes exist::

        ID            ID line                      (string)
        species       species prefix, e.g. Hs, Bt  (string)
        title         TITLE line                   (string)
        symbol        GENE line                    (string)
        cytoband      CYTOBAND line                (string)
        express       EXPRESS line                 (list)
        restr_expr    RESTR_EXPR line              (string)
        gnm_terminus  GNM_TERMINUS line            (string)
        gene_id       GENE_ID line                 (string)
        locuslink     LOCUSLINK line               (string)
        homol         HOMOL line                   (string)
        chromosome    CHROMOSOME line              (string)
        protsim       PROTSIM entries              (list)
        sequence      SEQUENCE entries             (list)
        sts           STS entries                  (list)
        txmap         TXMAP entries                (list)

    The types shown are those of the initial, empty values.
    """

    def __init__(self):
        """Create an empty record with every field set to its blank default."""
        # Single-valued fields start as "", multi-valued fields as a new list.
        # Chained assignment binds left to right, so attribute order follows
        # the order in which the names are written here.
        self.ID = self.species = self.title = self.symbol = self.cytoband = ""
        self.express = []
        self.restr_expr = self.gnm_terminus = self.gene_id = ""
        self.locuslink = self.homol = self.chromosome = ""
        # Each list attribute gets its own independent list object.
        self.protsim, self.sequence, self.sts, self.txmap = [], [], [], []

    def __repr__(self):
        """Return a short debugging string: class name, ID, symbol and title."""
        return "<{}> {} {} {}".format(
            self.__class__.__name__, self.ID, self.symbol, self.title
        )
def parse(handle):
    """Iterate over the UniGene records in a file holding multiple records.

    Arguments:
     - handle - the open handle that is passed on to ``_read``.

    This is a generator: it yields one record at a time and stops as soon
    as ``_read`` returns a false value.
    """
    record = _read(handle)
    while record:
        yield record
        record = _read(handle)
def read(handle):
    """Read and load a UniGene record, one record per file.

    Arguments:
     - handle - an open handle; it is passed to ``_read`` and then read to
       its end to check that nothing follows the record.

    Returns the single record found.

    Raises ValueError if no record is found, or if any data remains on the
    handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
| # Everything below is private | |
def _read(handle):
    """Read the next UniGene record from handle (PRIVATE).

    Each line is split into a tag (the first 12 columns) and a value (the
    rest of the line); trailing whitespace is stripped from both. An ID line
    starts a new record and a ``//`` line ends it.

    Arguments:
     - handle - an iterable yielding the lines of a UniGene flat file.

    Returns the parsed Record, or None if the handle is exhausted before
    any ID line is seen.

    Raises ValueError if:
     - a line appears before the first ID line,
     - a tag is not recognised,
     - a HOMOL value is neither YES nor NO,
     - the record has no SCOUNT line, or SCOUNT disagrees with the number
       of SEQUENCE lines,
     - the handle ends in the middle of a record.
    """
    UG_INDENT = 12  # width of the tag column
    record = None
    scount = None  # value of the SCOUNT line, once seen
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split(".")[0]
            scount = None
        elif record is None:
            # Every other tag needs a record to store its value in.
            raise ValueError(f"Unexpected line before the first ID line: {line}")
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError(f"Cannot parse HOMOL line {line}")
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            protsim = ProtsimLine(value)
            record.protsim.append(protsim)
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            sequence = SequenceLine(value)
            record.sequence.append(sequence)
        elif tag == "STS":
            sts = STSLine(value)
            record.sts.append(sts)
        elif tag == "TXMAP":
            # Stored as the raw text of the line; no further parsing is done.
            record.txmap.append(value)
        elif tag == "//":
            if scount is None:
                raise ValueError(f"Record {record.ID} has no SCOUNT line")
            if len(record.sequence) != scount:
                raise ValueError(
                    "The number of sequences specified in the record "
                    "(%d) does not agree with the number of sequences found (%d)"
                    % (scount, len(record.sequence))
                )
            return record
        else:
            raise ValueError(f"Unknown tag {tag}")
    if record is not None:
        # An ID line was seen but the closing "//" never arrived.
        raise ValueError("Unexpected end of stream.")
    return None