Spaces:
No application file
No application file
| # Copyright 2008 by Bartek Wilczynski. | |
| # Revisions copyright 2019 by Victor Lin. | |
| # Adapted from Bio.MEME.Parser by Jason A. Hackney. All rights reserved. | |
| # This code is part of the Biopython distribution and governed by its | |
| # license. Please see the LICENSE file that should have been included | |
| # as part of this package. | |
| """Module for the support of MEME motif format.""" | |
| import xml.etree.ElementTree as ET | |
| from Bio import Seq | |
| from Bio import motifs | |
| def read(handle): | |
| """Parse the text output of the MEME program into a meme.Record object. | |
| Examples | |
| -------- | |
| >>> from Bio.motifs import meme | |
| >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f: | |
| ... record = meme.read(f) | |
| >>> for motif in record: | |
| ... for instance in motif.instances: | |
| ... print(instance.motif_name, instance.sequence_name, instance.sequence_id, instance.strand, instance.pvalue) | |
| GSKGCATGTGAAA INO1 sequence_5 + 1.21e-08 | |
| GSKGCATGTGAAA FAS1 sequence_2 - 1.87e-08 | |
| GSKGCATGTGAAA ACC1 sequence_4 - 6.62e-08 | |
| GSKGCATGTGAAA CHO2 sequence_1 - 1.05e-07 | |
| GSKGCATGTGAAA CHO1 sequence_0 - 1.69e-07 | |
| GSKGCATGTGAAA FAS2 sequence_3 - 5.62e-07 | |
| GSKGCATGTGAAA OPI3 sequence_6 + 1.08e-06 | |
| TTGACWCYTGCYCWG CHO2 sequence_1 + 7.2e-10 | |
| TTGACWCYTGCYCWG OPI3 sequence_6 - 2.56e-08 | |
| TTGACWCYTGCYCWG ACC1 sequence_4 - 1.59e-07 | |
| TTGACWCYTGCYCWG CHO1 sequence_0 + 2.05e-07 | |
| TTGACWCYTGCYCWG FAS1 sequence_2 + 3.85e-07 | |
| TTGACWCYTGCYCWG FAS2 sequence_3 - 5.11e-07 | |
| TTGACWCYTGCYCWG INO1 sequence_5 + 8.01e-07 | |
| """ | |
| record = Record() | |
| try: | |
| xml_tree = ET.parse(handle) | |
| except ET.ParseError: | |
| raise ValueError( | |
| "Improper MEME XML input file. XML root tag should start with <MEME version= ..." | |
| ) | |
| __read_metadata(record, xml_tree) | |
| __read_alphabet(record, xml_tree) | |
| sequence_id_name_map = __get_sequence_id_name_map(xml_tree) | |
| record.sequences = list(sequence_id_name_map.keys()) | |
| __read_motifs(record, xml_tree, sequence_id_name_map) | |
| return record | |
| class Motif(motifs.Motif): | |
| """A subclass of Motif used in parsing MEME (and MAST) output. | |
| This subclass defines functions and data specific to MEME motifs. | |
| This includes the motif name, the evalue for a motif, and its number | |
| of occurrences. | |
| """ | |
| def __init__(self, alphabet=None, instances=None): | |
| """Initialize the class.""" | |
| motifs.Motif.__init__(self, alphabet, instances) | |
| self.evalue = 0.0 | |
| self.num_occurrences = 0 | |
| self.name = None | |
| self.id = None | |
| self.alt_id = None | |
| class Instance(Seq.Seq): | |
| """A class describing the instances of a MEME motif, and the data thereof.""" | |
| def __init__(self, *args, **kwds): | |
| """Initialize the class.""" | |
| Seq.Seq.__init__(self, *args, **kwds) | |
| self.sequence_name = "" | |
| self.sequence_id = "" | |
| self.start = 0 | |
| self.pvalue = 1.0 | |
| self.strand = 0 | |
| self.length = 0 | |
| self.motif_name = "" | |
| class Record(list): | |
| """A class for holding the results of a MEME run. | |
| A meme.Record is an object that holds the results from running | |
| MEME. It implements no methods of its own. | |
| The meme.Record class inherits from list, so you can access individual | |
| motifs in the record by their index. Alternatively, you can find a motif | |
| by its name: | |
| >>> from Bio import motifs | |
| >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f: | |
| ... record = motifs.parse(f, 'MEME') | |
| >>> motif = record[0] | |
| >>> print(motif.name) | |
| GSKGCATGTGAAA | |
| >>> motif = record['GSKGCATGTGAAA'] | |
| >>> print(motif.name) | |
| GSKGCATGTGAAA | |
| """ | |
| def __init__(self): | |
| """Initialize the class.""" | |
| self.version = "" | |
| self.datafile = "" | |
| self.command = "" | |
| self.alphabet = "" | |
| self.sequences = [] | |
| def __getitem__(self, key): | |
| """Return the motif of index key.""" | |
| if isinstance(key, str): | |
| for motif in self: | |
| if motif.name == key: | |
| return motif | |
| else: | |
| return list.__getitem__(self, key) | |
| # Everything below is private | |
| def __read_metadata(record, xml_tree): | |
| record.version = xml_tree.getroot().get("version") | |
| record.datafile = xml_tree.find("training_set").get("primary_sequences") | |
| record.command = xml_tree.find("model").find("command_line").text | |
| # TODO - background_frequencies, other metadata under model | |
| def __read_alphabet(record, xml_tree): | |
| alphabet_tree = ( | |
| xml_tree.find("training_set").find("letter_frequencies").find("alphabet_array") | |
| ) | |
| for value in alphabet_tree.findall("value"): | |
| record.alphabet += value.get("letter_id") | |
| def __get_sequence_id_name_map(xml_tree): | |
| return { | |
| sequence_tree.get("id"): sequence_tree.get("name") | |
| for sequence_tree in xml_tree.find("training_set").findall("sequence") | |
| } | |
| def __read_motifs(record, xml_tree, sequence_id_name_map): | |
| for motif_tree in xml_tree.find("motifs").findall("motif"): | |
| instances = [] | |
| for site_tree in motif_tree.find("contributing_sites").findall( | |
| "contributing_site" | |
| ): | |
| letters = [ | |
| letter_ref.get("letter_id") | |
| for letter_ref in site_tree.find("site").findall("letter_ref") | |
| ] | |
| sequence = "".join(letters) | |
| instance = Instance(sequence) | |
| instance.motif_name = motif_tree.get("name") | |
| instance.sequence_id = site_tree.get("sequence_id") | |
| instance.sequence_name = sequence_id_name_map[instance.sequence_id] | |
| # TODO - left flank, right flank | |
| instance.start = int(site_tree.get("position")) + 1 | |
| instance.pvalue = float(site_tree.get("pvalue")) | |
| instance.strand = __convert_strand(site_tree.get("strand")) | |
| instance.length = len(sequence) | |
| instances.append(instance) | |
| instances = motifs.Instances(instances, record.alphabet) | |
| motif = Motif(record.alphabet, instances) | |
| motif.id = motif_tree.get("id") | |
| motif.name = motif_tree.get("name") | |
| motif.alt_id = motif_tree.get("alt") | |
| motif.length = int(motif_tree.get("width")) | |
| motif.num_occurrences = int(motif_tree.get("sites")) | |
| motif.evalue = float(motif_tree.get("e_value")) | |
| # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time | |
| record.append(motif) | |
| def __convert_strand(strand): | |
| """Convert strand (+/-) from XML if present. | |
| Default: + | |
| """ | |
| if strand == "minus": | |
| return "-" | |
| if strand == "plus" or strand == "none": | |
| return "+" | |
| if __name__ == "__main__": | |
| from Bio._utils import run_doctest | |
| run_doctest() | |