Spaces:
No application file
No application file
| # Copyright 2003-2009 by Bartek Wilczynski. All rights reserved. | |
| # Copyright 2012-2013 by Michiel JL de Hoon. All rights reserved. | |
| # Revisions copyright 2019 by Victor Lin. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
"""Tools for sequence motif analysis.

Bio.motifs contains the core Motif class containing various I/O methods
as well as methods for motif comparisons and motif searching in sequences.
It also includes functionality for parsing output from the AlignACE, MEME,
and MAST programs, as well as files in the TRANSFAC format.
"""
| from urllib.parse import urlencode | |
| from urllib.request import urlopen, Request | |
def create(instances, alphabet="ACGT"):
    """Build a Motif from a collection of sequences.

    The sequences are first wrapped in an Instances container (which
    requires them all to have the same length) using the given alphabet;
    the Motif is then constructed from that container.
    """
    wrapped = Instances(instances, alphabet)
    return Motif(alphabet=alphabet, instances=wrapped)
def parse(handle, fmt, strict=True):
    """Parse the motifs stored in a motif-finder output file.

    The format name is matched case-insensitively. The formats that are
    currently understood are:

     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four
       columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four
       row. (scertf, yetfasco, hdpi, idmmpmm, flyfactor survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    Files in the pfm and sites formats hold just one motif, so for those
    Bio.motifs.read() is more convenient than Bio.motifs.parse().

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG

    The strict flag (True by default) is handed to the TRANSFAC parser;
    when set, a ValueError is raised if the file contents do not strictly
    comply with the specified file format.

    A ValueError is also raised if the format name is not recognized.
    """
    # Every parser module is imported lazily, only when its format is asked for.
    fmt = fmt.lower()

    # Parsers that need nothing but the handle.
    if fmt == "alignace":
        from Bio.motifs import alignace

        return alignace.read(handle)
    if fmt == "meme":
        from Bio.motifs import meme

        return meme.read(handle)
    if fmt == "minimal":
        from Bio.motifs import minimal

        return minimal.read(handle)
    if fmt == "clusterbuster":
        from Bio.motifs import clusterbuster

        return clusterbuster.read(handle)
    if fmt == "xms":
        from Bio.motifs import xms

        return xms.read(handle)
    if fmt == "mast":
        from Bio.motifs import mast

        return mast.read(handle)

    # Parsers that serve several formats and are told which one to expect.
    if fmt in ("pfm-four-columns", "pfm-four-rows"):
        from Bio.motifs import pfm

        return pfm.read(handle, fmt)
    if fmt in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar

        return jaspar.read(handle, fmt)

    # The only parser that receives the strict flag.
    if fmt == "transfac":
        from Bio.motifs import transfac

        return transfac.read(handle, strict)

    raise ValueError("Unknown format %s" % fmt)
def read(handle, fmt, strict=True):
    """Return the single motif stored in a handle.

    The same formats as for Bio.motifs.parse() are accepted, but the
    handle must hold exactly one motif. For example, reading a
    JASPAR-style pfm file:

    >>> from Bio import motifs
    >>> with open("motifs/SRF.pfm") as handle:
    ...     m = motifs.read(handle, "pfm")
    >>> m.consensus
    Seq('GCCCATATATGG')

    Or a single-motif MEME file,

    >>> from Bio import motifs
    >>> with open("motifs/meme.psp_test.classic.zoops.xml") as handle:
    ...     m = motifs.read(handle, "meme")
    >>> m.consensus
    Seq('GCTTATGTAA')

    A ValueError is raised when the handle holds no motif at all, or more
    than one motif:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     motif = motifs.read(handle, "AlignAce")
    Traceback (most recent call last):
        ...
    ValueError: More than one motif found in handle

    To obtain just the first motif of a multi-motif file (for which this
    function raises, as shown above), use instead:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     record = motifs.parse(handle, "alignace")
    >>> motif = record[0]
    >>> motif.consensus
    Seq('TCTACGATTGAG')

    Use the Bio.motifs.parse(handle, fmt) function if you want
    to read multiple records from the handle.

    The strict flag (True by default) is forwarded to parse(); see there
    for its meaning.
    """
    records = parse(handle, fmt.lower(), strict)
    found = len(records)
    if found == 0:
        raise ValueError("No motifs found in handle")
    if found > 1:
        raise ValueError("More than one motif found in handle")
    return records[0]
class Instances(list):
    """Class containing a list of sequences that made the motifs.

    Attributes:
     - length   - the common length of the stored sequences, or None if
       the container was created without any sequences.
     - alphabet - the letters for which count() keeps a tally.
    """

    def __init__(self, instances=None, alphabet="ACGT"):
        """Initialize the class.

        Arguments:
         - instances - an iterable of sequences (Seq objects, strings, ...)
           that all have the same length, or None for an empty container.
           Anything that is not a Seq is stored as Seq(str(item)).
         - alphabet  - the letters that may occur in the sequences.

        Raises TypeError if a single sequence is passed instead of an
        iterable of sequences, and ValueError if the sequences differ in
        length.
        """
        from Bio.Seq import Seq, MutableSeq

        if isinstance(instances, (Seq, MutableSeq, str)):
            raise TypeError(
                "instances should be iterator of Seq objects or strings. "
                "If a single sequence is given, will treat each character "
                "as a separate sequence."
            )
        length = None
        if instances is not None:
            sequences = []
            for instance in instances:
                if length is None:
                    # The first sequence fixes the expected length.
                    length = len(instance)
                elif length != len(instance):
                    message = (
                        "All instances should have the same length (%d found, %d expected)"
                        % (len(instance), length)
                    )
                    raise ValueError(message)
                if not isinstance(instance, Seq):
                    instance = Seq(str(instance))
                sequences.append(instance)
            # no errors were raised; store the instances:
            self.extend(sequences)
        self.length = length
        self.alphabet = alphabet

    def __str__(self):
        """Return a string containing the sequences of the motif.

        Each sequence is followed by a newline character.
        """
        return "".join(str(instance) + "\n" for instance in self)

    def count(self):
        """Count nucleotides in a position.

        Returns a dictionary mapping each letter of the alphabet to a list
        holding, for every position, the number of stored sequences that
        have that letter there. For a container without a length (no
        sequences were given) the lists are empty.
        """
        # length is None when the container was created empty; use 0 then
        # instead of failing on "[0] * None".
        length = 0 if self.length is None else self.length
        counts = {letter: [0] * length for letter in self.alphabet}
        for instance in self:
            for position, letter in enumerate(instance):
                counts[letter][position] += 1
        return counts

    def search(self, sequence):
        """Find positions of motifs in a given sequence.

        This is a generator function, returning found positions of motif
        instances in a given sequence. Each hit is a (position, instance)
        tuple; at most one hit is reported per position. An empty
        container yields no hits.
        """
        if not self:
            # Nothing to look for (and self.length may well be None).
            return
        for pos in range(0, len(sequence) - self.length + 1):
            window = sequence[pos : pos + self.length]
            for instance in self:
                if instance == window:
                    yield (pos, instance)
                    break  # no other instance will fit (we don't want to return multiple hits)

    def reverse_complement(self):
        """Compute reverse complement of sequences.

        Returns a new Instances object with the same alphabet and length,
        holding the reverse complement of every stored sequence. Raises
        RuntimeError for an element that is not a Seq, MutableSeq,
        SeqRecord or str.
        """
        from Bio.Seq import Seq, MutableSeq
        from Bio.SeqRecord import SeqRecord

        instances = Instances(alphabet=self.alphabet)
        instances.length = self.length
        for instance in self:
            # TODO: remove inplace=False
            if isinstance(instance, (Seq, MutableSeq)):
                instance = instance.reverse_complement(inplace=False)
            elif isinstance(instance, SeqRecord):
                instance = instance.reverse_complement()
            elif isinstance(instance, str):
                # A plain str has no reverse_complement() method; go through
                # Seq and hand back a str again.
                instance = str(Seq(instance).reverse_complement())
            else:
                raise RuntimeError("instance has unexpected type %s" % type(instance))
            instances.append(instance)
        return instances
class Motif:
    """A class representing sequence motifs.

    A motif is built either from a set of instances (aligned sequences of
    equal length) or from a table of counts, never from both.

    Attributes:
     - name         - free-form name of the motif (empty string by default).
     - instances    - the Instances object the motif was built from, or None.
     - counts       - the frequency position matrix, or None for an empty
       motif.
     - length       - the length of the motif, or None for an empty motif.
     - alphabet     - the letters of the motif.
     - mask         - tuple of 0/1 flags, one per position (property).
     - pseudocounts - dictionary letter -> pseudocount (property).
     - background   - dictionary letter -> background probability, always
       normalized to sum to one (property).
    """

    def __init__(self, alphabet="ACGT", instances=None, counts=None):
        """Initialize the class.

        Arguments:
         - alphabet  - the letters of the motif; when instances are given,
           the alphabet of the instances is used instead.
         - instances - an Instances object, or None.
         - counts    - a dictionary letter -> list of counts, or None.

        Raises ValueError if both instances and counts are specified.
        """
        from . import matrix

        self.name = ""
        if counts is not None and instances is not None:
            raise ValueError("Specify either instances or counts, don't specify both")
        elif counts is not None:
            self.instances = None
            self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
            self.length = self.counts.length
        elif instances is not None:
            self.instances = instances
            alphabet = self.instances.alphabet
            counts = self.instances.count()
            self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
            self.length = self.counts.length
        else:
            self.counts = None
            self.instances = None
            self.length = None
        self.alphabet = alphabet
        # These three go through the property setters below, which turn
        # None into the respective default value.
        self.pseudocounts = None
        self.background = None
        self.mask = None

    @property
    def mask(self):
        """Tuple of 0/1 flags marking the positions of the motif.

        The mask can be set to None (all positions flagged), to a string
        consisting of '*' (1) and ' ' (0) characters, or to any sequence
        whose items are interpreted as booleans. For a motif without a
        length the mask is always the empty tuple. Setting a mask whose
        length differs from the motif length, or a string with any other
        character, raises ValueError.
        """
        return self.__mask

    @mask.setter
    def mask(self, mask):
        if self.length is None:
            self.__mask = ()
        elif mask is None:
            self.__mask = (1,) * self.length
        elif len(mask) != self.length:
            raise ValueError(
                "The length (%d) of the mask is inconsistent with the length (%d) of the motif"
                % (len(mask), self.length)
            )
        elif isinstance(mask, str):
            flags = []
            for char in mask:
                if char == "*":
                    flags.append(1)
                elif char == " ":
                    flags.append(0)
                else:
                    raise ValueError(
                        "Mask should contain only '*' or ' ' and not a '%s'" % char
                    )
            self.__mask = tuple(flags)
        else:
            self.__mask = tuple(int(bool(c)) for c in mask)

    @property
    def pseudocounts(self):
        """Dictionary with the pseudocount of each letter of the alphabet.

        Can be set to a dictionary (which must have an entry for every
        letter of the alphabet), to a single value that is then used for
        all letters, or to None (equivalent to 0.0).
        """
        return self._pseudocounts

    @pseudocounts.setter
    def pseudocounts(self, value):
        if isinstance(value, dict):
            self._pseudocounts = {letter: value[letter] for letter in self.alphabet}
        else:
            if value is None:
                value = 0.0
            self._pseudocounts = dict.fromkeys(self.alphabet, value)

    @property
    def background(self):
        """Dictionary with the background probability of each letter.

        Can be set to a dictionary (which must have an entry for every
        letter of the alphabet), to None (uniform background), or, for DNA
        motifs only, to a single number that is interpreted as the GC
        content. The stored values are normalized so that they sum to one.
        Setting a single number for a non-DNA alphabet raises ValueError.
        """
        return self._background

    @background.setter
    def background(self, value):
        if isinstance(value, dict):
            background = {letter: value[letter] for letter in self.alphabet}
        elif value is None:
            background = dict.fromkeys(self.alphabet, 1.0)
        else:
            if sorted(self.alphabet) != ["A", "C", "G", "T"]:
                raise ValueError(
                    "Setting the background to a single value only works for DNA motifs"
                    " (in which case the value is interpreted as the GC content)"
                )
            # Build a fresh dictionary rather than modifying the previous
            # one, which may not exist yet or may be held by a caller.
            background = {
                "A": (1.0 - value) / 2.0,
                "C": value / 2.0,
                "G": value / 2.0,
                "T": (1.0 - value) / 2.0,
            }
        total = sum(background.values())
        self._background = {
            letter: background[letter] / total for letter in self.alphabet
        }

    @property
    def pwm(self):
        """Compute position weight matrices.

        The counts are normalized using the current pseudocounts.
        """
        return self.counts.normalize(self._pseudocounts)

    @property
    def pssm(self):
        """Compute position specific scoring matrices.

        Calculated as the log-odds of the position weight matrix against
        the current background.
        """
        return self.pwm.log_odds(self._background)

    def __str__(self, masked=False):
        """Return string representation of a motif.

        The string consists of the instances (if any), one per line. If
        masked is True, a line showing the mask ('*' for 1, ' ' for 0) is
        appended.
        """
        text = ""
        if self.instances is not None:
            text += str(self.instances)
        if masked:
            text += "".join(
                "*" if self.__mask[i] else " " for i in range(self.length)
            )
            text += "\n"
        return text

    def __len__(self):
        """Return the length of a motif.

        Please use this method (i.e. invoke len(m)) instead of referring to m.length directly.
        """
        return 0 if self.length is None else self.length

    def reverse_complement(self):
        """Return the reverse complement of the motif as a new motif.

        The mask is reversed, and the background and pseudocounts are
        swapped between A and T and between C and G.
        """
        alphabet = self.alphabet
        if self.instances is not None:
            instances = self.instances.reverse_complement()
            res = Motif(alphabet=alphabet, instances=instances)
        else:  # has counts
            counts = {
                "A": self.counts["T"][::-1],
                "C": self.counts["G"][::-1],
                "G": self.counts["C"][::-1],
                "T": self.counts["A"][::-1],
            }
            res = Motif(alphabet=alphabet, counts=counts)
        # Assign the private attribute directly; the reversed mask is
        # already a valid tuple of the right length.
        res.__mask = self.__mask[::-1]
        res.background = {
            "A": self.background["T"],
            "C": self.background["G"],
            "G": self.background["C"],
            "T": self.background["A"],
        }
        res.pseudocounts = {
            "A": self.pseudocounts["T"],
            "C": self.pseudocounts["G"],
            "G": self.pseudocounts["C"],
            "T": self.pseudocounts["A"],
        }
        return res

    @property
    def consensus(self):
        """Return the consensus sequence."""
        return self.counts.consensus

    @property
    def anticonsensus(self):
        """Return the least probable pattern to be generated from this motif."""
        return self.counts.anticonsensus

    @property
    def degenerate_consensus(self):
        """Return the degenerate consensus sequence.

        Following the rules adapted from
        D. R. Cavener: "Comparison of the consensus sequence flanking
        translational start sites in Drosophila and vertebrates."
        Nucleic Acids Research 15(4): 1353-1361. (1987).

        The same rules are used by TRANSFAC.
        """
        return self.counts.degenerate_consensus

    def weblogo(self, fname, fmt="PNG", version="2.8.2", **kwds):
        """Download and save a weblogo using the Berkeley weblogo service.

        Requires an internet connection.

        The motif is sent to the server in TRANSFAC format and the image
        returned by the server is written to the file fname. The fmt
        argument is passed on in lower case as the requested image format.
        The version argument is accepted but not used by this method.

        The parameters from ``**kwds`` are passed directly to the weblogo server.

        Currently, this method uses WebLogo version 3.3.
        These are the arguments and their default values passed to
        WebLogo 3.3; see their website at http://weblogo.threeplusone.com
        for more information::

            'stack_width' : 'medium',
            'stacks_per_line' : '40',
            'alphabet' : 'alphabet_dna',
            'ignore_lower_case' : True,
            'unit_name' : "bits",
            'first_index' : '1',
            'logo_start' : '1',
            'logo_end': str(self.length),
            'composition' : "comp_auto",
            'percentCG' : '',
            'scale_width' : True,
            'show_errorbars' : True,
            'logo_title' : '',
            'logo_label' : '',
            'show_xaxis': True,
            'xaxis_label': '',
            'show_yaxis': True,
            'yaxis_label': '',
            'yaxis_scale': 'auto',
            'yaxis_tic_interval' : '1.0',
            'show_ends' : True,
            'show_fineprint' : True,
            'color_scheme': 'color_auto',
            'symbols0': '',
            'symbols1': '',
            'symbols2': '',
            'symbols3': '',
            'symbols4': '',
            'color0': '',
            'color1': '',
            'color2': '',
            'color3': '',
            'color4': '',

        """
        # Pick the server-side alphabet name from the letters of the motif.
        if set(self.alphabet) == set("ACDEFGHIKLMNPQRSTVWY"):
            alpha = "alphabet_protein"
        elif set(self.alphabet) == set("ACGU"):
            alpha = "alphabet_rna"
        elif set(self.alphabet) == set("ACGT"):
            alpha = "alphabet_dna"
        else:
            alpha = "auto"

        frequencies = format(self, "transfac")
        url = "https://weblogo.threeplusone.com/create.cgi"
        values = {
            "sequences": frequencies,
            "format": fmt.lower(),
            "stack_width": "medium",
            "stacks_per_line": "40",
            "alphabet": alpha,
            "ignore_lower_case": True,
            "unit_name": "bits",
            "first_index": "1",
            "logo_start": "1",
            "logo_end": str(self.length),
            "composition": "comp_auto",
            "percentCG": "",
            "scale_width": True,
            "show_errorbars": True,
            "logo_title": "",
            "logo_label": "",
            "show_xaxis": True,
            "xaxis_label": "",
            "show_yaxis": True,
            "yaxis_label": "",
            "yaxis_scale": "auto",
            "yaxis_tic_interval": "1.0",
            "show_ends": True,
            "show_fineprint": True,
            "color_scheme": "color_auto",
            "symbols0": "",
            "symbols1": "",
            "symbols2": "",
            "symbols3": "",
            "symbols4": "",
            "color0": "",
            "color1": "",
            "color2": "",
            "color3": "",
            "color4": "",
        }

        # User-supplied options override the defaults; False is sent as an
        # empty string, anything else as its string representation.
        values.update({k: "" if v is False else str(v) for k, v in kwds.items()})
        data = urlencode(values).encode("utf-8")
        req = Request(url, data)
        # Close the HTTP response once the image has been read.
        with urlopen(req) as response:
            im = response.read()
        with open(fname, "wb") as f:
            f.write(im)

    def __format__(self, format_spec):
        """Return a string representation of the Motif in the given format.

        Currently supported formats:
         - clusterbuster: Cluster Buster position frequency matrix format
         - pfm : JASPAR single Position Frequency Matrix
         - jaspar : JASPAR multiple Position Frequency Matrix
         - transfac : TRANSFAC like files

        This makes format(motif, "transfac") equivalent to
        motif.format("transfac").
        """
        return self.format(format_spec)

    def format(self, format_spec):
        """Return a string representation of the Motif in the given format.

        Currently supported formats:
         - clusterbuster: Cluster Buster position frequency matrix format
         - pfm : JASPAR single Position Frequency Matrix
         - jaspar : JASPAR multiple Position Frequency Matrix
         - transfac : TRANSFAC like files

        Raises ValueError for any other format name (the name is compared
        as given, without case conversion).
        """
        if format_spec in ("pfm", "jaspar"):
            from Bio.motifs import jaspar

            motifs = [self]
            return jaspar.write(motifs, format_spec)
        elif format_spec == "transfac":
            from Bio.motifs import transfac

            motifs = [self]
            return transfac.write(motifs)
        elif format_spec == "clusterbuster":
            from Bio.motifs import clusterbuster

            motifs = [self]
            return clusterbuster.write(motifs)
        else:
            raise ValueError("Unknown format type %s" % format_spec)
def write(motifs, fmt):
    """Render a collection of motifs as a string in the requested format.

    The format name is matched case-insensitively. The formats that can
    currently be written are:

     - clusterbuster: Cluster Buster position frequency matrix format
     - pfm : JASPAR simple single Position Frequency Matrix
     - jaspar : JASPAR multiple PFM format
     - transfac : TRANSFAC like files

    A ValueError is raised for any other format name.
    """
    # Each writer module is imported lazily, only when its format is asked for.
    fmt = fmt.lower()
    if fmt == "clusterbuster":
        from Bio.motifs import clusterbuster

        return clusterbuster.write(motifs)
    if fmt == "transfac":
        from Bio.motifs import transfac

        return transfac.write(motifs)
    if fmt in ("pfm", "jaspar"):
        from Bio.motifs import jaspar

        # The JASPAR writer serves both formats and is told which to emit.
        return jaspar.write(motifs, fmt)
    raise ValueError("Unknown format type %s" % fmt)
# When executed as a script, run the doctests embedded in the docstrings.
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest(verbose=0)