Spaces:
No application file
No application file
| # Copyright 2013 by Michiel de Hoon. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Support for various forms of sequence motif matrices. | |
| Implementation of frequency (count) matrices, position-weight matrices, | |
| and position-specific scoring matrices. | |
| """ | |
| import math | |
| import numbers | |
| try: | |
| import numpy as np | |
| except ImportError: | |
| from Bio import MissingPythonDependencyError | |
| raise MissingPythonDependencyError( | |
| "Install NumPy if you want to use Bio.motifs.matrix." | |
| ) | |
| from Bio.Seq import Seq | |
| from . import _pwm | |
| class GenericPositionMatrix(dict): | |
| """Base class for the support of position matrix operations.""" | |
| def __init__(self, alphabet, values): | |
| """Initialize the class.""" | |
| self.length = None | |
| for letter in alphabet: | |
| if self.length is None: | |
| self.length = len(values[letter]) | |
| elif self.length != len(values[letter]): | |
| raise Exception("data has inconsistent lengths") | |
| self[letter] = list(values[letter]) | |
| self.alphabet = alphabet | |
| def __str__(self): | |
| """Return a string containing nucleotides and counts of the alphabet in the Matrix.""" | |
| words = ["%6d" % i for i in range(self.length)] | |
| line = " " + " ".join(words) | |
| lines = [line] | |
| for letter in self.alphabet: | |
| words = ["%6.2f" % value for value in self[letter]] | |
| line = "%c: " % letter + " ".join(words) | |
| lines.append(line) | |
| text = "\n".join(lines) + "\n" | |
| return text | |
| def __getitem__(self, key): | |
| """Return the position matrix of index key.""" | |
| if isinstance(key, tuple): | |
| if len(key) == 2: | |
| key1, key2 = key | |
| if isinstance(key1, slice): | |
| start1, stop1, stride1 = key1.indices(len(self.alphabet)) | |
| indices1 = range(start1, stop1, stride1) | |
| letters1 = [self.alphabet[i] for i in indices1] | |
| dim1 = 2 | |
| elif isinstance(key1, numbers.Integral): | |
| letter1 = self.alphabet[key1] | |
| dim1 = 1 | |
| elif isinstance(key1, tuple): | |
| letters1 = [self.alphabet[i] for i in key1] | |
| dim1 = 2 | |
| elif isinstance(key1, str): | |
| if len(key1) == 1: | |
| letter1 = key1 | |
| dim1 = 1 | |
| else: | |
| raise KeyError(key1) | |
| else: | |
| raise KeyError("Cannot understand key %s" % key1) | |
| if isinstance(key2, slice): | |
| start2, stop2, stride2 = key2.indices(self.length) | |
| indices2 = range(start2, stop2, stride2) | |
| dim2 = 2 | |
| elif isinstance(key2, numbers.Integral): | |
| index2 = key2 | |
| dim2 = 1 | |
| else: | |
| raise KeyError("Cannot understand key %s" % key2) | |
| if dim1 == 1 and dim2 == 1: | |
| return dict.__getitem__(self, letter1)[index2] | |
| elif dim1 == 1 and dim2 == 2: | |
| values = dict.__getitem__(self, letter1) | |
| return tuple(values[index2] for index2 in indices2) | |
| elif dim1 == 2 and dim2 == 1: | |
| d = {} | |
| for letter1 in letters1: | |
| d[letter1] = dict.__getitem__(self, letter1)[index2] | |
| return d | |
| else: | |
| d = {} | |
| for letter1 in letters1: | |
| values = dict.__getitem__(self, letter1) | |
| d[letter1] = [values[_] for _ in indices2] | |
| if sorted(letters1) == self.alphabet: | |
| return self.__class__(self.alphabet, d) | |
| else: | |
| return d | |
| elif len(key) == 1: | |
| key = key[0] | |
| else: | |
| raise KeyError("keys should be 1- or 2-dimensional") | |
| if isinstance(key, slice): | |
| start, stop, stride = key.indices(len(self.alphabet)) | |
| indices = range(start, stop, stride) | |
| letters = [self.alphabet[i] for i in indices] | |
| dim = 2 | |
| elif isinstance(key, numbers.Integral): | |
| letter = self.alphabet[key] | |
| dim = 1 | |
| elif isinstance(key, tuple): | |
| letters = [self.alphabet[i] for i in key] | |
| dim = 2 | |
| elif isinstance(key, str): | |
| if len(key) == 1: | |
| letter = key | |
| dim = 1 | |
| else: | |
| raise KeyError(key) | |
| else: | |
| raise KeyError("Cannot understand key %s" % key) | |
| if dim == 1: | |
| return dict.__getitem__(self, letter) | |
| elif dim == 2: | |
| d = {} | |
| for letter in letters: | |
| d[letter] = dict.__getitem__(self, letter) | |
| return d | |
| else: | |
| raise RuntimeError("Should not get here") | |
| def consensus(self): | |
| """Return the consensus sequence.""" | |
| sequence = "" | |
| for i in range(self.length): | |
| maximum = -math.inf | |
| for letter in self.alphabet: | |
| count = self[letter][i] | |
| if count > maximum: | |
| maximum = count | |
| sequence_letter = letter | |
| sequence += sequence_letter | |
| return Seq(sequence) | |
| def anticonsensus(self): | |
| """Return the anticonsensus sequence.""" | |
| sequence = "" | |
| for i in range(self.length): | |
| minimum = math.inf | |
| for letter in self.alphabet: | |
| count = self[letter][i] | |
| if count < minimum: | |
| minimum = count | |
| sequence_letter = letter | |
| sequence += sequence_letter | |
| return Seq(sequence) | |
| def degenerate_consensus(self): | |
| """Return the degenerate consensus sequence.""" | |
| # Following the rules adapted from | |
| # D. R. Cavener: "Comparison of the consensus sequence flanking | |
| # translational start sites in Drosophila and vertebrates." | |
| # Nucleic Acids Research 15(4): 1353-1361. (1987). | |
| # The same rules are used by TRANSFAC. | |
| degenerate_nucleotide = { | |
| "A": "A", | |
| "C": "C", | |
| "G": "G", | |
| "T": "T", | |
| "AC": "M", | |
| "AG": "R", | |
| "AT": "W", | |
| "CG": "S", | |
| "CT": "Y", | |
| "GT": "K", | |
| "ACG": "V", | |
| "ACT": "H", | |
| "AGT": "D", | |
| "CGT": "B", | |
| "ACGT": "N", | |
| } | |
| sequence = "" | |
| for i in range(self.length): | |
| def get(nucleotide): | |
| return self[nucleotide][i] # noqa: B023 | |
| nucleotides = sorted(self, key=get, reverse=True) | |
| counts = [self[c][i] for c in nucleotides] | |
| # Follow the Cavener rules: | |
| if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]: | |
| key = nucleotides[0] | |
| elif 4 * sum(counts[:2]) > 3 * sum(counts): | |
| key = "".join(sorted(nucleotides[:2])) | |
| elif counts[3] == 0: | |
| key = "".join(sorted(nucleotides[:3])) | |
| else: | |
| key = "ACGT" | |
| nucleotide = degenerate_nucleotide.get(key, key) | |
| sequence += nucleotide | |
| return Seq(sequence) | |
| def gc_content(self): | |
| """Compute the fraction GC content.""" | |
| alphabet = self.alphabet | |
| gc_total = 0.0 | |
| total = 0.0 | |
| for i in range(self.length): | |
| for letter in alphabet: | |
| if letter in "CG": | |
| gc_total += self[letter][i] | |
| total += self[letter][i] | |
| return gc_total / total | |
| def reverse_complement(self): | |
| """Compute reverse complement.""" | |
| values = {} | |
| if self.alphabet == "ACGU": | |
| values["A"] = self["U"][::-1] | |
| values["U"] = self["A"][::-1] | |
| else: | |
| values["A"] = self["T"][::-1] | |
| values["T"] = self["A"][::-1] | |
| values["G"] = self["C"][::-1] | |
| values["C"] = self["G"][::-1] | |
| alphabet = self.alphabet | |
| return self.__class__(alphabet, values) | |
| class FrequencyPositionMatrix(GenericPositionMatrix): | |
| """Class for the support of frequency calculations on the Position Matrix.""" | |
| def normalize(self, pseudocounts=None): | |
| """Create and return a position-weight matrix by normalizing the counts matrix. | |
| If pseudocounts is None (default), no pseudocounts are added | |
| to the counts. | |
| If pseudocounts is a number, it is added to the counts before | |
| calculating the position-weight matrix. | |
| Alternatively, the pseudocounts can be a dictionary with a key | |
| for each letter in the alphabet associated with the motif. | |
| """ | |
| counts = {} | |
| if pseudocounts is None: | |
| for letter in self.alphabet: | |
| counts[letter] = [0.0] * self.length | |
| elif isinstance(pseudocounts, dict): | |
| for letter in self.alphabet: | |
| counts[letter] = [float(pseudocounts[letter])] * self.length | |
| else: | |
| for letter in self.alphabet: | |
| counts[letter] = [float(pseudocounts)] * self.length | |
| for i in range(self.length): | |
| for letter in self.alphabet: | |
| counts[letter][i] += self[letter][i] | |
| # Actual normalization is done in the PositionWeightMatrix initializer | |
| return PositionWeightMatrix(self.alphabet, counts) | |
| class PositionWeightMatrix(GenericPositionMatrix): | |
| """Class for the support of weight calculations on the Position Matrix.""" | |
| def __init__(self, alphabet, counts): | |
| """Initialize the class.""" | |
| GenericPositionMatrix.__init__(self, alphabet, counts) | |
| for i in range(self.length): | |
| total = sum(self[letter][i] for letter in alphabet) | |
| for letter in alphabet: | |
| self[letter][i] /= total | |
| for letter in alphabet: | |
| self[letter] = tuple(self[letter]) | |
| def log_odds(self, background=None): | |
| """Return the Position-Specific Scoring Matrix. | |
| The Position-Specific Scoring Matrix (PSSM) contains the log-odds | |
| scores computed from the probability matrix and the background | |
| probabilities. If the background is None, a uniform background | |
| distribution is assumed. | |
| """ | |
| values = {} | |
| alphabet = self.alphabet | |
| if background is None: | |
| background = dict.fromkeys(self.alphabet, 1.0) | |
| else: | |
| background = dict(background) | |
| total = sum(background.values()) | |
| for letter in alphabet: | |
| background[letter] /= total | |
| values[letter] = [] | |
| for i in range(self.length): | |
| for letter in alphabet: | |
| b = background[letter] | |
| if b > 0: | |
| p = self[letter][i] | |
| if p > 0: | |
| logodds = math.log(p / b, 2) | |
| else: | |
| logodds = -math.inf | |
| else: | |
| p = self[letter][i] | |
| if p > 0: | |
| logodds = math.inf | |
| else: | |
| logodds = math.nan | |
| values[letter].append(logodds) | |
| pssm = PositionSpecificScoringMatrix(alphabet, values) | |
| return pssm | |
| class PositionSpecificScoringMatrix(GenericPositionMatrix): | |
| """Class for the support of Position Specific Scoring Matrix calculations.""" | |
| def calculate(self, sequence): | |
| """Return the PWM score for a given sequence for all positions. | |
| Notes: | |
| - the sequence can only be a DNA sequence | |
| - the search is performed only on one strand | |
| - if the sequence and the motif have the same length, a single | |
| number is returned | |
| - otherwise, the result is a one-dimensional numpy array | |
| """ | |
| # TODO - Code itself tolerates ambiguous bases (as NaN). | |
| if sorted(self.alphabet) != ["A", "C", "G", "T"]: | |
| raise ValueError( | |
| "PSSM has wrong alphabet: %s - Use only with DNA motifs" % self.alphabet | |
| ) | |
| # NOTE: The C code handles mixed case input as this could be large | |
| # (e.g. contig or chromosome), so requiring it be all upper or lower | |
| # case would impose an overhead to allocate the extra memory. | |
| try: | |
| sequence = bytes(sequence) | |
| except TypeError: # str | |
| try: | |
| sequence = bytes(sequence, "ASCII") | |
| except TypeError: | |
| raise ValueError( | |
| "sequence should be a Seq, MutableSeq, string, or bytes-like object" | |
| ) from None | |
| except UnicodeEncodeError: | |
| raise ValueError( | |
| "sequence should contain ASCII characters only" | |
| ) from None | |
| except Exception: | |
| raise ValueError( | |
| "sequence should be a Seq, MutableSeq, string, or bytes-like object" | |
| ) from None | |
| n = len(sequence) | |
| m = self.length | |
| # Create the numpy arrays here; the C module then does not rely on numpy | |
| # Use a float32 for the scores array to save space | |
| scores = np.empty(n - m + 1, np.float32) | |
| logodds = np.array( | |
| [[self[letter][i] for letter in "ACGT"] for i in range(m)], float | |
| ) | |
| _pwm.calculate(sequence, logodds, scores) | |
| if len(scores) == 1: | |
| return scores[0] | |
| else: | |
| return scores | |
| def search(self, sequence, threshold=0.0, both=True, chunksize=10**6): | |
| """Find hits with PWM score above given threshold. | |
| A generator function, returning found hits in the given sequence | |
| with the pwm score higher than the threshold. | |
| """ | |
| sequence = sequence.upper() | |
| seq_len = len(sequence) | |
| motif_l = self.length | |
| chunk_starts = np.arange(0, seq_len, chunksize) | |
| if both: | |
| rc = self.reverse_complement() | |
| for chunk_start in chunk_starts: | |
| subseq = sequence[chunk_start : chunk_start + chunksize + motif_l - 1] | |
| pos_scores = self.calculate(subseq) | |
| pos_ind = pos_scores >= threshold | |
| pos_positions = np.where(pos_ind)[0] + chunk_start | |
| pos_scores = pos_scores[pos_ind] | |
| if both: | |
| neg_scores = rc.calculate(subseq) | |
| neg_ind = neg_scores >= threshold | |
| neg_positions = np.where(neg_ind)[0] + chunk_start | |
| neg_scores = neg_scores[neg_ind] | |
| else: | |
| neg_positions = np.empty((0), dtype=int) | |
| neg_scores = np.empty((0), dtype=int) | |
| chunk_positions = np.append(pos_positions, neg_positions - seq_len) | |
| chunk_scores = np.append(pos_scores, neg_scores) | |
| order = np.argsort(np.append(pos_positions, neg_positions)) | |
| chunk_positions = chunk_positions[order] | |
| chunk_scores = chunk_scores[order] | |
| yield from zip(chunk_positions, chunk_scores) | |
| def max(self): | |
| """Maximal possible score for this motif. | |
| returns the score computed for the consensus sequence. | |
| """ | |
| score = 0.0 | |
| letters = self.alphabet | |
| for position in range(0, self.length): | |
| score += max(self[letter][position] for letter in letters) | |
| return score | |
| def min(self): | |
| """Minimal possible score for this motif. | |
| returns the score computed for the anticonsensus sequence. | |
| """ | |
| score = 0.0 | |
| letters = self.alphabet | |
| for position in range(0, self.length): | |
| score += min(self[letter][position] for letter in letters) | |
| return score | |
| def gc_content(self): | |
| """Compute the GC-ratio.""" | |
| raise Exception("Cannot compute the %GC composition of a PSSM") | |
| def mean(self, background=None): | |
| """Return expected value of the score of a motif.""" | |
| if background is None: | |
| background = dict.fromkeys(self.alphabet, 1.0) | |
| else: | |
| background = dict(background) | |
| total = sum(background.values()) | |
| for letter in self.alphabet: | |
| background[letter] /= total | |
| sx = 0.0 | |
| for i in range(self.length): | |
| for letter in self.alphabet: | |
| logodds = self[letter, i] | |
| if math.isnan(logodds): | |
| continue | |
| if math.isinf(logodds) and logodds < 0: | |
| continue | |
| b = background[letter] | |
| p = b * math.pow(2, logodds) | |
| sx += p * logodds | |
| return sx | |
| def std(self, background=None): | |
| """Return standard deviation of the score of a motif.""" | |
| if background is None: | |
| background = dict.fromkeys(self.alphabet, 1.0) | |
| else: | |
| background = dict(background) | |
| total = sum(background.values()) | |
| for letter in self.alphabet: | |
| background[letter] /= total | |
| variance = 0.0 | |
| for i in range(self.length): | |
| sx = 0.0 | |
| sxx = 0.0 | |
| for letter in self.alphabet: | |
| logodds = self[letter, i] | |
| if math.isnan(logodds): | |
| continue | |
| if math.isinf(logodds) and logodds < 0: | |
| continue | |
| b = background[letter] | |
| p = b * math.pow(2, logodds) | |
| sx += p * logodds | |
| sxx += p * logodds * logodds | |
| sxx -= sx * sx | |
| variance += sxx | |
| variance = max(variance, 0) # to avoid roundoff problems | |
| return math.sqrt(variance) | |
| def dist_pearson(self, other): | |
| """Return the similarity score based on pearson correlation for the given motif against self. | |
| We use the Pearson's correlation of the respective probabilities. | |
| """ | |
| if self.alphabet != other.alphabet: | |
| raise ValueError("Cannot compare motifs with different alphabets") | |
| max_p = -2 | |
| for offset in range(-self.length + 1, other.length): | |
| if offset < 0: | |
| p = self.dist_pearson_at(other, -offset) | |
| else: # offset>=0 | |
| p = other.dist_pearson_at(self, offset) | |
| if max_p < p: | |
| max_p = p | |
| max_o = -offset | |
| return 1 - max_p, max_o | |
| def dist_pearson_at(self, other, offset): | |
| """Return the similarity score based on pearson correlation at the given offset.""" | |
| letters = self.alphabet | |
| sx = 0.0 # \sum x | |
| sy = 0.0 # \sum y | |
| sxx = 0.0 # \sum x^2 | |
| sxy = 0.0 # \sum x \cdot y | |
| syy = 0.0 # \sum y^2 | |
| norm = max(self.length, offset + other.length) * len(letters) | |
| for pos in range(min(self.length - offset, other.length)): | |
| xi = [self[letter, pos + offset] for letter in letters] | |
| yi = [other[letter, pos] for letter in letters] | |
| sx += sum(xi) | |
| sy += sum(yi) | |
| sxx += sum(x * x for x in xi) | |
| sxy += sum(x * y for x, y in zip(xi, yi)) | |
| syy += sum(y * y for y in yi) | |
| sx /= norm | |
| sy /= norm | |
| sxx /= norm | |
| sxy /= norm | |
| syy /= norm | |
| numerator = sxy - sx * sy | |
| denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy)) | |
| return numerator / denominator | |
| def distribution(self, background=None, precision=10**3): | |
| """Calculate the distribution of the scores at the given precision.""" | |
| from .thresholds import ScoreDistribution | |
| if background is None: | |
| background = dict.fromkeys(self.alphabet, 1.0) | |
| else: | |
| background = dict(background) | |
| total = sum(background.values()) | |
| for letter in self.alphabet: | |
| background[letter] /= total | |
| return ScoreDistribution(precision=precision, pssm=self, background=background) | |