Spaces:
No application file
No application file
| # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Classes and methods for tree construction.""" | |
| import itertools | |
| import copy | |
| import numbers | |
| from Bio.Phylo import BaseTree | |
| from Bio.Align import Alignment, MultipleSeqAlignment | |
| from Bio.Align import substitution_matrices | |
| # flake8: noqa | |
| class _Matrix: | |
| """Base class for distance matrix or scoring matrix. | |
| Accepts a list of names and a lower triangular matrix.:: | |
| matrix = [[0], | |
| [1, 0], | |
| [2, 3, 0], | |
| [4, 5, 6, 0]] | |
| represents the symmetric matrix of | |
| [0,1,2,4] | |
| [1,0,3,5] | |
| [2,3,0,6] | |
| [4,5,6,0] | |
| :Parameters: | |
| names : list | |
| names of elements, used for indexing | |
| matrix : list | |
| nested list of numerical lists in lower triangular format | |
| Examples | |
| -------- | |
| >>> from Bio.Phylo.TreeConstruction import _Matrix | |
| >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] | |
| >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] | |
| >>> m = _Matrix(names, matrix) | |
| >>> m | |
| _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) | |
| You can use two indices to get or assign an element in the matrix. | |
| >>> m[1,2] | |
| 3 | |
| >>> m['Beta','Gamma'] | |
| 3 | |
| >>> m['Beta','Gamma'] = 4 | |
| >>> m['Beta','Gamma'] | |
| 4 | |
| Further more, you can use one index to get or assign a list of elements related to that index. | |
| >>> m[0] | |
| [0, 1, 2, 4] | |
| >>> m['Alpha'] | |
| [0, 1, 2, 4] | |
| >>> m['Alpha'] = [0, 7, 8, 9] | |
| >>> m[0] | |
| [0, 7, 8, 9] | |
| >>> m[0,1] | |
| 7 | |
| Also you can delete or insert a column&row of elements by index. | |
| >>> m | |
| _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) | |
| >>> del m['Alpha'] | |
| >>> m | |
| _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) | |
| >>> m.insert('Alpha', [0, 7, 8, 9] , 0) | |
| >>> m | |
| _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) | |
| """ | |
| def __init__(self, names, matrix=None): | |
| """Initialize matrix. | |
| Arguments are a list of names, and optionally a list of lower | |
| triangular matrix data (zero matrix used by default). | |
| """ | |
| # check names | |
| if isinstance(names, list) and all(isinstance(s, str) for s in names): | |
| if len(set(names)) == len(names): | |
| self.names = names | |
| else: | |
| raise ValueError("Duplicate names found") | |
| else: | |
| raise TypeError("'names' should be a list of strings") | |
| # check matrix | |
| if matrix is None: | |
| # create a new one with 0 if matrix is not assigned | |
| matrix = [[0] * i for i in range(1, len(self) + 1)] | |
| self.matrix = matrix | |
| else: | |
| # check if all elements are numbers | |
| if ( | |
| isinstance(matrix, list) | |
| and all(isinstance(row, list) for row in matrix) | |
| and all( | |
| isinstance(item, numbers.Number) for row in matrix for item in row | |
| ) | |
| ): | |
| # check if the same length with names | |
| if len(matrix) == len(names): | |
| # check if is lower triangle format | |
| if [len(row) for row in matrix] == list(range(1, len(self) + 1)): | |
| self.matrix = matrix | |
| else: | |
| raise ValueError("'matrix' should be in lower triangle format") | |
| else: | |
| raise ValueError("'names' and 'matrix' should be the same size") | |
| else: | |
| raise TypeError("'matrix' should be a list of numerical lists") | |
| def __getitem__(self, item): | |
| """Access value(s) by the index(s) or name(s). | |
| For a _Matrix object 'dm':: | |
| dm[i] get a value list from the given 'i' to others; | |
| dm[i, j] get the value between 'i' and 'j'; | |
| dm['name'] map name to index first | |
| dm['name1', 'name2'] map name to index first | |
| """ | |
| # Handle single indexing | |
| if isinstance(item, (int, str)): | |
| index = None | |
| if isinstance(item, int): | |
| index = item | |
| elif isinstance(item, str): | |
| if item in self.names: | |
| index = self.names.index(item) | |
| else: | |
| raise ValueError("Item not found.") | |
| else: | |
| raise TypeError("Invalid index type.") | |
| # check index | |
| if index > len(self) - 1: | |
| raise IndexError("Index out of range.") | |
| return [self.matrix[index][i] for i in range(0, index)] + [ | |
| self.matrix[i][index] for i in range(index, len(self)) | |
| ] | |
| # Handle double indexing | |
| elif len(item) == 2: | |
| row_index = None | |
| col_index = None | |
| if all(isinstance(i, int) for i in item): | |
| row_index, col_index = item | |
| elif all(isinstance(i, str) for i in item): | |
| row_name, col_name = item | |
| if row_name in self.names and col_name in self.names: | |
| row_index = self.names.index(row_name) | |
| col_index = self.names.index(col_name) | |
| else: | |
| raise ValueError("Item not found.") | |
| else: | |
| raise TypeError("Invalid index type.") | |
| # check index | |
| if row_index > len(self) - 1 or col_index > len(self) - 1: | |
| raise IndexError("Index out of range.") | |
| if row_index > col_index: | |
| return self.matrix[row_index][col_index] | |
| else: | |
| return self.matrix[col_index][row_index] | |
| else: | |
| raise TypeError("Invalid index type.") | |
| def __setitem__(self, item, value): | |
| """Set value by the index(s) or name(s). | |
| Similar to __getitem__:: | |
| dm[1] = [1, 0, 3, 4] set values from '1' to others; | |
| dm[i, j] = 2 set the value from 'i' to 'j' | |
| """ | |
| # Handle single indexing | |
| if isinstance(item, (int, str)): | |
| index = None | |
| if isinstance(item, int): | |
| index = item | |
| elif isinstance(item, str): | |
| if item in self.names: | |
| index = self.names.index(item) | |
| else: | |
| raise ValueError("Item not found.") | |
| else: | |
| raise TypeError("Invalid index type.") | |
| # check index | |
| if index > len(self) - 1: | |
| raise IndexError("Index out of range.") | |
| # check and assign value | |
| if isinstance(value, list) and all( | |
| isinstance(n, numbers.Number) for n in value | |
| ): | |
| if len(value) == len(self): | |
| for i in range(0, index): | |
| self.matrix[index][i] = value[i] | |
| for i in range(index, len(self)): | |
| self.matrix[i][index] = value[i] | |
| else: | |
| raise ValueError("Value not the same size.") | |
| else: | |
| raise TypeError("Invalid value type.") | |
| # Handle double indexing | |
| elif len(item) == 2: | |
| row_index = None | |
| col_index = None | |
| if all(isinstance(i, int) for i in item): | |
| row_index, col_index = item | |
| elif all(isinstance(i, str) for i in item): | |
| row_name, col_name = item | |
| if row_name in self.names and col_name in self.names: | |
| row_index = self.names.index(row_name) | |
| col_index = self.names.index(col_name) | |
| else: | |
| raise ValueError("Item not found.") | |
| else: | |
| raise TypeError("Invalid index type.") | |
| # check index | |
| if row_index > len(self) - 1 or col_index > len(self) - 1: | |
| raise IndexError("Index out of range.") | |
| # check and assign value | |
| if isinstance(value, numbers.Number): | |
| if row_index > col_index: | |
| self.matrix[row_index][col_index] = value | |
| else: | |
| self.matrix[col_index][row_index] = value | |
| else: | |
| raise TypeError("Invalid value type.") | |
| else: | |
| raise TypeError("Invalid index type.") | |
| def __delitem__(self, item): | |
| """Delete related distances by the index or name.""" | |
| index = None | |
| if isinstance(item, int): | |
| index = item | |
| elif isinstance(item, str): | |
| index = self.names.index(item) | |
| else: | |
| raise TypeError("Invalid index type.") | |
| # remove distances related to index | |
| for i in range(index + 1, len(self)): | |
| del self.matrix[i][index] | |
| del self.matrix[index] | |
| # remove name | |
| del self.names[index] | |
| def insert(self, name, value, index=None): | |
| """Insert distances given the name and value. | |
| :Parameters: | |
| name : str | |
| name of a row/col to be inserted | |
| value : list | |
| a row/col of values to be inserted | |
| """ | |
| if isinstance(name, str): | |
| # insert at the given index or at the end | |
| if index is None: | |
| index = len(self) | |
| if not isinstance(index, int): | |
| raise TypeError("Invalid index type.") | |
| # insert name | |
| self.names.insert(index, name) | |
| # insert elements of 0, to be assigned | |
| self.matrix.insert(index, [0] * index) | |
| for i in range(index, len(self)): | |
| self.matrix[i].insert(index, 0) | |
| # assign value | |
| self[index] = value | |
| else: | |
| raise TypeError("Invalid name type.") | |
| def __len__(self): | |
| """Matrix length.""" | |
| return len(self.names) | |
| def __repr__(self): | |
| """Return Matrix as a string.""" | |
| return self.__class__.__name__ + "(names=%s, matrix=%s)" % tuple( | |
| map(repr, (self.names, self.matrix)) | |
| ) | |
| def __str__(self): | |
| """Get a lower triangular matrix string.""" | |
| matrix_string = "\n".join( | |
| [ | |
| self.names[i] | |
| + "\t" | |
| + "\t".join([format(n, "f") for n in self.matrix[i]]) | |
| for i in range(0, len(self)) | |
| ] | |
| ) | |
| matrix_string = matrix_string + "\n\t" + "\t".join(self.names) | |
| return matrix_string.expandtabs(tabsize=4) | |
| class DistanceMatrix(_Matrix): | |
| """Distance matrix class that can be used for distance based tree algorithms. | |
| All diagonal elements will be zero no matter what the users provide. | |
| """ | |
| def __init__(self, names, matrix=None): | |
| """Initialize the class.""" | |
| _Matrix.__init__(self, names, matrix) | |
| self._set_zero_diagonal() | |
| def __setitem__(self, item, value): | |
| """Set Matrix's items to values.""" | |
| _Matrix.__setitem__(self, item, value) | |
| self._set_zero_diagonal() | |
| def _set_zero_diagonal(self): | |
| """Set all diagonal elements to zero (PRIVATE).""" | |
| for i in range(0, len(self)): | |
| self.matrix[i][i] = 0 | |
| def format_phylip(self, handle): | |
| """Write data in Phylip format to a given file-like object or handle. | |
| The output stream is the input distance matrix format used with Phylip | |
| programs (e.g. 'neighbor'). See: | |
| http://evolution.genetics.washington.edu/phylip/doc/neighbor.html | |
| :Parameters: | |
| handle : file or file-like object | |
| A writeable text mode file handle or other object supporting | |
| the 'write' method, such as StringIO or sys.stdout. | |
| """ | |
| handle.write(f" {len(self.names)}\n") | |
| # Phylip needs space-separated, vertically aligned columns | |
| name_width = max(12, max(map(len, self.names)) + 1) | |
| value_fmts = ("{" + str(x) + ":.4f}" for x in range(1, len(self.matrix) + 1)) | |
| row_fmt = "{0:" + str(name_width) + "s}" + " ".join(value_fmts) + "\n" | |
| for i, (name, values) in enumerate(zip(self.names, self.matrix)): | |
| # Mirror the matrix values across the diagonal | |
| mirror_values = (self.matrix[j][i] for j in range(i + 1, len(self.matrix))) | |
| fields = itertools.chain([name], values, mirror_values) | |
| handle.write(row_fmt.format(*fields)) | |
| # Shim for compatibility with Biopython<1.70 (#1304) | |
| _DistanceMatrix = DistanceMatrix | |
| class DistanceCalculator: | |
| """Calculates the distance matrix from a DNA or protein sequence alignment. | |
| This class calculates the distance matrix from a multiple sequence alignment | |
| of DNA or protein sequences, and the given name of the substitution model. | |
| Currently only scoring matrices are used. | |
| :Parameters: | |
| model : str | |
| Name of the model matrix to be used to calculate distance. | |
| The attribute ``dna_models`` contains the available model | |
| names for DNA sequences and ``protein_models`` for protein | |
| sequences. | |
| Examples | |
| -------- | |
| Loading a small PHYLIP alignment from which to compute distances:: | |
| >>> from Bio.Phylo.TreeConstruction import DistanceCalculator | |
| >>> from Bio import AlignIO | |
| >>> aln = AlignIO.read(open('TreeConstruction/msa.phy'), 'phylip') | |
| >>> print(aln) # doctest:+NORMALIZE_WHITESPACE | |
| Alignment with 5 rows and 13 columns | |
| AACGTGGCCACAT Alpha | |
| AAGGTCGCCACAC Beta | |
| CAGTTCGCCACAA Gamma | |
| GAGATTTCCGCCT Delta | |
| GAGATCTCCGCCC Epsilon | |
| DNA calculator with 'identity' model:: | |
| >>> calculator = DistanceCalculator('identity') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> print(dm) # doctest:+NORMALIZE_WHITESPACE | |
| Alpha 0.000000 | |
| Beta 0.230769 0.000000 | |
| Gamma 0.384615 0.230769 0.000000 | |
| Delta 0.538462 0.538462 0.538462 0.000000 | |
| Epsilon 0.615385 0.384615 0.461538 0.153846 0.000000 | |
| Alpha Beta Gamma Delta Epsilon | |
| Protein calculator with 'blosum62' model:: | |
| >>> calculator = DistanceCalculator('blosum62') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> print(dm) # doctest:+NORMALIZE_WHITESPACE | |
| Alpha 0.000000 | |
| Beta 0.369048 0.000000 | |
| Gamma 0.493976 0.250000 0.000000 | |
| Delta 0.585366 0.547619 0.566265 0.000000 | |
| Epsilon 0.700000 0.355556 0.488889 0.222222 0.000000 | |
| Alpha Beta Gamma Delta Epsilon | |
| Same calculation, using the new Alignment object:: | |
| >>> from Bio.Phylo.TreeConstruction import DistanceCalculator | |
| >>> from Bio import Align | |
| >>> aln = Align.read('TreeConstruction/msa.phy', 'phylip') | |
| >>> print(aln) # doctest:+NORMALIZE_WHITESPACE | |
| Alpha 0 AACGTGGCCACAT 13 | |
| Beta 0 AAGGTCGCCACAC 13 | |
| Gamma 0 CAGTTCGCCACAA 13 | |
| Delta 0 GAGATTTCCGCCT 13 | |
| Epsilon 0 GAGATCTCCGCCC 13 | |
| <BLANKLINE> | |
| DNA calculator with 'identity' model:: | |
| >>> calculator = DistanceCalculator('identity') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> print(dm) # doctest:+NORMALIZE_WHITESPACE | |
| Alpha 0.000000 | |
| Beta 0.230769 0.000000 | |
| Gamma 0.384615 0.230769 0.000000 | |
| Delta 0.538462 0.538462 0.538462 0.000000 | |
| Epsilon 0.615385 0.384615 0.461538 0.153846 0.000000 | |
| Alpha Beta Gamma Delta Epsilon | |
| Protein calculator with 'blosum62' model:: | |
| >>> calculator = DistanceCalculator('blosum62') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> print(dm) # doctest:+NORMALIZE_WHITESPACE | |
| Alpha 0.000000 | |
| Beta 0.369048 0.000000 | |
| Gamma 0.493976 0.250000 0.000000 | |
| Delta 0.585366 0.547619 0.566265 0.000000 | |
| Epsilon 0.700000 0.355556 0.488889 0.222222 0.000000 | |
| Alpha Beta Gamma Delta Epsilon | |
| """ | |
| protein_alphabet = set("ABCDEFGHIKLMNPQRSTVWXYZ") | |
| dna_models = [] | |
| protein_models = [] | |
| # matrices available | |
| names = substitution_matrices.load() | |
| for name in names: | |
| matrix = substitution_matrices.load(name) | |
| if name == "NUC.4.4": | |
| # BLAST nucleic acid scoring matrix | |
| name = "blastn" | |
| else: | |
| name = name.lower() | |
| if protein_alphabet.issubset(set(matrix.alphabet)): | |
| protein_models.append(name) | |
| else: | |
| dna_models.append(name) | |
| del protein_alphabet | |
| del name | |
| del names | |
| del matrix | |
| models = ["identity"] + dna_models + protein_models | |
| def __init__(self, model="identity", skip_letters=None): | |
| """Initialize with a distance model.""" | |
| # Shim for backward compatibility (#491) | |
| if skip_letters: | |
| self.skip_letters = skip_letters | |
| elif model == "identity": | |
| self.skip_letters = () | |
| else: | |
| self.skip_letters = ("-", "*") | |
| if model == "identity": | |
| self.scoring_matrix = None | |
| elif model in self.models: | |
| if model == "blastn": | |
| name = "NUC.4.4" | |
| else: | |
| name = model.upper() | |
| self.scoring_matrix = substitution_matrices.load(name) | |
| else: | |
| raise ValueError( | |
| "Model not supported. Available models: " + ", ".join(self.models) | |
| ) | |
| def _pairwise(self, seq1, seq2): | |
| """Calculate pairwise distance from two sequences (PRIVATE). | |
| Returns a value between 0 (identical sequences) and 1 (completely | |
| different, or seq1 is an empty string.) | |
| """ | |
| score = 0 | |
| max_score = 0 | |
| if self.scoring_matrix is None: | |
| # Score by character identity, not skipping any special letters | |
| score = sum( | |
| l1 == l2 | |
| for l1, l2 in zip(seq1, seq2) | |
| if l1 not in self.skip_letters and l2 not in self.skip_letters | |
| ) | |
| max_score = len(seq1) | |
| else: | |
| max_score1 = 0 | |
| max_score2 = 0 | |
| for i in range(0, len(seq1)): | |
| l1 = seq1[i] | |
| l2 = seq2[i] | |
| if l1 in self.skip_letters or l2 in self.skip_letters: | |
| continue | |
| try: | |
| max_score1 += self.scoring_matrix[l1, l1] | |
| except IndexError: | |
| raise ValueError( | |
| f"Bad letter '{l1}' in sequence '{seq1.id}' at position '{i}'" | |
| ) from None | |
| try: | |
| max_score2 += self.scoring_matrix[l2, l2] | |
| except IndexError: | |
| raise ValueError( | |
| f"Bad letter '{l2}' in sequence '{seq2.id}' at position '{i}'" | |
| ) from None | |
| score += self.scoring_matrix[l1, l2] | |
| # Take the higher score if the matrix is asymmetrical | |
| max_score = max(max_score1, max_score2) | |
| if max_score == 0: | |
| return 1 # max possible scaled distance | |
| return 1 - (score / max_score) | |
| def get_distance(self, msa): | |
| """Return a DistanceMatrix for an Alignment or MultipleSeqAlignment object. | |
| :Parameters: | |
| msa : Alignment or MultipleSeqAlignment object representing a | |
| DNA or protein multiple sequence alignment. | |
| """ | |
| if isinstance(msa, Alignment): | |
| names = [s.id for s in msa.sequences] | |
| dm = DistanceMatrix(names) | |
| n = len(names) | |
| for i1 in range(n): | |
| for i2 in range(i1): | |
| dm[names[i1], names[i2]] = self._pairwise(msa[i1], msa[i2]) | |
| elif isinstance(msa, MultipleSeqAlignment): | |
| names = [s.id for s in msa] | |
| dm = DistanceMatrix(names) | |
| for seq1, seq2 in itertools.combinations(msa, 2): | |
| dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) | |
| else: | |
| raise TypeError( | |
| "Must provide an Alignment object or a MultipleSeqAlignment object." | |
| ) | |
| return dm | |
| class TreeConstructor: | |
| """Base class for all tree constructor.""" | |
| def build_tree(self, msa): | |
| """Caller to build the tree from an Alignment or MultipleSeqAlignment object. | |
| This should be implemented in subclass. | |
| """ | |
| raise NotImplementedError("Method not implemented!") | |
| class DistanceTreeConstructor(TreeConstructor): | |
| """Distance based tree constructor. | |
| :Parameters: | |
| method : str | |
| Distance tree construction method, 'nj'(default) or 'upgma'. | |
| distance_calculator : DistanceCalculator | |
| The distance matrix calculator for multiple sequence alignment. | |
| It must be provided if ``build_tree`` will be called. | |
| Examples | |
| -------- | |
| Loading a small PHYLIP alignment from which to compute distances, and then | |
| build a upgma Tree:: | |
| >>> from Bio.Phylo.TreeConstruction import DistanceTreeConstructor | |
| >>> from Bio.Phylo.TreeConstruction import DistanceCalculator | |
| >>> from Bio import AlignIO | |
| >>> aln = AlignIO.read(open('TreeConstruction/msa.phy'), 'phylip') | |
| >>> constructor = DistanceTreeConstructor() | |
| >>> calculator = DistanceCalculator('identity') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> upgmatree = constructor.upgma(dm) | |
| >>> print(upgmatree) | |
| Tree(rooted=True) | |
| Clade(branch_length=0, name='Inner4') | |
| Clade(branch_length=0.18749999999999994, name='Inner1') | |
| Clade(branch_length=0.07692307692307693, name='Epsilon') | |
| Clade(branch_length=0.07692307692307693, name='Delta') | |
| Clade(branch_length=0.11057692307692304, name='Inner3') | |
| Clade(branch_length=0.038461538461538464, name='Inner2') | |
| Clade(branch_length=0.11538461538461536, name='Gamma') | |
| Clade(branch_length=0.11538461538461536, name='Beta') | |
| Clade(branch_length=0.15384615384615383, name='Alpha') | |
| Build a NJ Tree:: | |
| >>> njtree = constructor.nj(dm) | |
| >>> print(njtree) | |
| Tree(rooted=False) | |
| Clade(branch_length=0, name='Inner3') | |
| Clade(branch_length=0.18269230769230765, name='Alpha') | |
| Clade(branch_length=0.04807692307692307, name='Beta') | |
| Clade(branch_length=0.04807692307692307, name='Inner2') | |
| Clade(branch_length=0.27884615384615385, name='Inner1') | |
| Clade(branch_length=0.051282051282051266, name='Epsilon') | |
| Clade(branch_length=0.10256410256410259, name='Delta') | |
| Clade(branch_length=0.14423076923076922, name='Gamma') | |
| Same example, using the new Alignment class:: | |
| >>> from Bio.Phylo.TreeConstruction import DistanceTreeConstructor | |
| >>> from Bio.Phylo.TreeConstruction import DistanceCalculator | |
| >>> from Bio import Align | |
| >>> aln = Align.read(open('TreeConstruction/msa.phy'), 'phylip') | |
| >>> constructor = DistanceTreeConstructor() | |
| >>> calculator = DistanceCalculator('identity') | |
| >>> dm = calculator.get_distance(aln) | |
| >>> upgmatree = constructor.upgma(dm) | |
| >>> print(upgmatree) | |
| Tree(rooted=True) | |
| Clade(branch_length=0, name='Inner4') | |
| Clade(branch_length=0.18749999999999994, name='Inner1') | |
| Clade(branch_length=0.07692307692307693, name='Epsilon') | |
| Clade(branch_length=0.07692307692307693, name='Delta') | |
| Clade(branch_length=0.11057692307692304, name='Inner3') | |
| Clade(branch_length=0.038461538461538464, name='Inner2') | |
| Clade(branch_length=0.11538461538461536, name='Gamma') | |
| Clade(branch_length=0.11538461538461536, name='Beta') | |
| Clade(branch_length=0.15384615384615383, name='Alpha') | |
| Build a NJ Tree:: | |
| >>> njtree = constructor.nj(dm) | |
| >>> print(njtree) | |
| Tree(rooted=False) | |
| Clade(branch_length=0, name='Inner3') | |
| Clade(branch_length=0.18269230769230765, name='Alpha') | |
| Clade(branch_length=0.04807692307692307, name='Beta') | |
| Clade(branch_length=0.04807692307692307, name='Inner2') | |
| Clade(branch_length=0.27884615384615385, name='Inner1') | |
| Clade(branch_length=0.051282051282051266, name='Epsilon') | |
| Clade(branch_length=0.10256410256410259, name='Delta') | |
| Clade(branch_length=0.14423076923076922, name='Gamma') | |
| """ | |
| methods = ["nj", "upgma"] | |
| def __init__(self, distance_calculator=None, method="nj"): | |
| """Initialize the class.""" | |
| if distance_calculator is None or isinstance( | |
| distance_calculator, DistanceCalculator | |
| ): | |
| self.distance_calculator = distance_calculator | |
| else: | |
| raise TypeError("Must provide a DistanceCalculator object.") | |
| if method in self.methods: | |
| self.method = method | |
| else: | |
| raise TypeError( | |
| "Bad method: " | |
| + method | |
| + ". Available methods: " | |
| + ", ".join(self.methods) | |
| ) | |
| def build_tree(self, msa): | |
| """Construct and return a Tree, Neighbor Joining or UPGMA.""" | |
| if self.distance_calculator: | |
| dm = self.distance_calculator.get_distance(msa) | |
| tree = None | |
| if self.method == "upgma": | |
| tree = self.upgma(dm) | |
| else: | |
| tree = self.nj(dm) | |
| return tree | |
| else: | |
| raise TypeError("Must provide a DistanceCalculator object.") | |
| def upgma(self, distance_matrix): | |
| """Construct and return an UPGMA tree. | |
| Constructs and returns an Unweighted Pair Group Method | |
| with Arithmetic mean (UPGMA) tree. | |
| :Parameters: | |
| distance_matrix : DistanceMatrix | |
| The distance matrix for tree construction. | |
| """ | |
| if not isinstance(distance_matrix, DistanceMatrix): | |
| raise TypeError("Must provide a DistanceMatrix object.") | |
| # make a copy of the distance matrix to be used | |
| dm = copy.deepcopy(distance_matrix) | |
| # init terminal clades | |
| clades = [BaseTree.Clade(None, name) for name in dm.names] | |
| # init minimum index | |
| min_i = 0 | |
| min_j = 0 | |
| inner_count = 0 | |
| while len(dm) > 1: | |
| min_dist = dm[1, 0] | |
| # find minimum index | |
| for i in range(1, len(dm)): | |
| for j in range(0, i): | |
| if min_dist >= dm[i, j]: | |
| min_dist = dm[i, j] | |
| min_i = i | |
| min_j = j | |
| # create clade | |
| clade1 = clades[min_i] | |
| clade2 = clades[min_j] | |
| inner_count += 1 | |
| inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) | |
| inner_clade.clades.append(clade1) | |
| inner_clade.clades.append(clade2) | |
| # assign branch length | |
| if clade1.is_terminal(): | |
| clade1.branch_length = min_dist / 2 | |
| else: | |
| clade1.branch_length = min_dist / 2 - self._height_of(clade1) | |
| if clade2.is_terminal(): | |
| clade2.branch_length = min_dist / 2 | |
| else: | |
| clade2.branch_length = min_dist / 2 - self._height_of(clade2) | |
| # update node list | |
| clades[min_j] = inner_clade | |
| del clades[min_i] | |
| # rebuild distance matrix, | |
| # set the distances of new node at the index of min_j | |
| for k in range(0, len(dm)): | |
| if k != min_i and k != min_j: | |
| dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) / 2 | |
| dm.names[min_j] = "Inner" + str(inner_count) | |
| del dm[min_i] | |
| inner_clade.branch_length = 0 | |
| return BaseTree.Tree(inner_clade) | |
| def nj(self, distance_matrix): | |
| """Construct and return a Neighbor Joining tree. | |
| :Parameters: | |
| distance_matrix : DistanceMatrix | |
| The distance matrix for tree construction. | |
| """ | |
| if not isinstance(distance_matrix, DistanceMatrix): | |
| raise TypeError("Must provide a DistanceMatrix object.") | |
| # make a copy of the distance matrix to be used | |
| dm = copy.deepcopy(distance_matrix) | |
| # init terminal clades | |
| clades = [BaseTree.Clade(None, name) for name in dm.names] | |
| # init node distance | |
| node_dist = [0] * len(dm) | |
| # init minimum index | |
| min_i = 0 | |
| min_j = 0 | |
| inner_count = 0 | |
| # special cases for Minimum Alignment Matrices | |
| if len(dm) == 1: | |
| root = clades[0] | |
| return BaseTree.Tree(root, rooted=False) | |
| elif len(dm) == 2: | |
| # minimum distance will always be [1,0] | |
| min_i = 1 | |
| min_j = 0 | |
| clade1 = clades[min_i] | |
| clade2 = clades[min_j] | |
| clade1.branch_length = dm[min_i, min_j] / 2.0 | |
| clade2.branch_length = dm[min_i, min_j] - clade1.branch_length | |
| inner_clade = BaseTree.Clade(None, "Inner") | |
| inner_clade.clades.append(clade1) | |
| inner_clade.clades.append(clade2) | |
| clades[0] = inner_clade | |
| root = clades[0] | |
| return BaseTree.Tree(root, rooted=False) | |
| while len(dm) > 2: | |
| # calculate nodeDist | |
| for i in range(0, len(dm)): | |
| node_dist[i] = 0 | |
| for j in range(0, len(dm)): | |
| node_dist[i] += dm[i, j] | |
| node_dist[i] = node_dist[i] / (len(dm) - 2) | |
| # find minimum distance pair | |
| min_dist = dm[1, 0] - node_dist[1] - node_dist[0] | |
| min_i = 0 | |
| min_j = 1 | |
| for i in range(1, len(dm)): | |
| for j in range(0, i): | |
| temp = dm[i, j] - node_dist[i] - node_dist[j] | |
| if min_dist > temp: | |
| min_dist = temp | |
| min_i = i | |
| min_j = j | |
| # create clade | |
| clade1 = clades[min_i] | |
| clade2 = clades[min_j] | |
| inner_count += 1 | |
| inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) | |
| inner_clade.clades.append(clade1) | |
| inner_clade.clades.append(clade2) | |
| # assign branch length | |
| clade1.branch_length = ( | |
| dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j] | |
| ) / 2.0 | |
| clade2.branch_length = dm[min_i, min_j] - clade1.branch_length | |
| # update node list | |
| clades[min_j] = inner_clade | |
| del clades[min_i] | |
| # rebuild distance matrix, | |
| # set the distances of new node at the index of min_j | |
| for k in range(0, len(dm)): | |
| if k != min_i and k != min_j: | |
| dm[min_j, k] = ( | |
| dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j] | |
| ) / 2.0 | |
| dm.names[min_j] = "Inner" + str(inner_count) | |
| del dm[min_i] | |
| # set the last clade as one of the child of the inner_clade | |
| root = None | |
| if clades[0] == inner_clade: | |
| clades[0].branch_length = 0 | |
| clades[1].branch_length = dm[1, 0] | |
| clades[0].clades.append(clades[1]) | |
| root = clades[0] | |
| else: | |
| clades[0].branch_length = dm[1, 0] | |
| clades[1].branch_length = 0 | |
| clades[1].clades.append(clades[0]) | |
| root = clades[1] | |
| return BaseTree.Tree(root, rooted=False) | |
| def _height_of(self, clade): | |
| """Calculate clade height -- the longest path to any terminal (PRIVATE).""" | |
| height = 0 | |
| if clade.is_terminal(): | |
| height = clade.branch_length | |
| else: | |
| height = height + max(self._height_of(c) for c in clade.clades) | |
| return height | |
| # #################### Tree Scoring and Searching Classes ##################### | |
| class Scorer: | |
| """Base class for all tree scoring methods.""" | |
| def get_score(self, tree, alignment): | |
| """Caller to get the score of a tree for the given alignment. | |
| This should be implemented in subclass. | |
| """ | |
| raise NotImplementedError("Method not implemented!") | |
| class TreeSearcher: | |
| """Base class for all tree searching methods.""" | |
| def search(self, starting_tree, alignment): | |
| """Caller to search the best tree with a starting tree. | |
| This should be implemented in subclass. | |
| """ | |
| raise NotImplementedError("Method not implemented!") | |
| class NNITreeSearcher(TreeSearcher): | |
| """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. | |
| :Parameters: | |
| scorer : ParsimonyScorer | |
| parsimony scorer to calculate the parsimony score of | |
| different trees during NNI algorithm. | |
| """ | |
| def __init__(self, scorer): | |
| """Initialize the class.""" | |
| if isinstance(scorer, Scorer): | |
| self.scorer = scorer | |
| else: | |
| raise TypeError("Must provide a Scorer object.") | |
| def search(self, starting_tree, alignment): | |
| """Implement the TreeSearcher.search method. | |
| :Parameters: | |
| starting_tree : Tree | |
| starting tree of NNI method. | |
| alignment : Alignment or MultipleSeqAlignment object | |
| multiple sequence alignment used to calculate parsimony | |
| score of different NNI trees. | |
| """ | |
| return self._nni(starting_tree, alignment) | |
| def _nni(self, starting_tree, alignment): | |
| """Search for the best parsimony tree using the NNI algorithm (PRIVATE).""" | |
| best_tree = starting_tree | |
| while True: | |
| best_score = self.scorer.get_score(best_tree, alignment) | |
| temp = best_score | |
| for t in self._get_neighbors(best_tree): | |
| score = self.scorer.get_score(t, alignment) | |
| if score < best_score: | |
| best_score = score | |
| best_tree = t | |
| # stop if no smaller score exist | |
| if best_score >= temp: | |
| break | |
| return best_tree | |
| def _get_neighbors(self, tree): | |
| """Get all neighbor trees of the given tree (PRIVATE). | |
| Currently only for binary rooted trees. | |
| """ | |
| # make child to parent dict | |
| parents = {} | |
| for clade in tree.find_clades(): | |
| if clade != tree.root: | |
| node_path = tree.get_path(clade) | |
| # cannot get the parent if the parent is root. Bug? | |
| if len(node_path) == 1: | |
| parents[clade] = tree.root | |
| else: | |
| parents[clade] = node_path[-2] | |
| neighbors = [] | |
| root_childs = [] | |
| for clade in tree.get_nonterminals(order="level"): | |
| if clade == tree.root: | |
| left = clade.clades[0] | |
| right = clade.clades[1] | |
| root_childs.append(left) | |
| root_childs.append(right) | |
| if not left.is_terminal() and not right.is_terminal(): | |
| # make changes around the left_left clade | |
| # left_left = left.clades[0] | |
| left_right = left.clades[1] | |
| right_left = right.clades[0] | |
| right_right = right.clades[1] | |
| # neighbor 1 (left_left + right_right) | |
| del left.clades[1] | |
| del right.clades[1] | |
| left.clades.append(right_right) | |
| right.clades.append(left_right) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # neighbor 2 (left_left + right_left) | |
| del left.clades[1] | |
| del right.clades[0] | |
| left.clades.append(right_left) | |
| right.clades.append(right_right) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # change back (left_left + left_right) | |
| del left.clades[1] | |
| del right.clades[0] | |
| left.clades.append(left_right) | |
| right.clades.insert(0, right_left) | |
| elif clade in root_childs: | |
| # skip root child | |
| continue | |
| else: | |
| # method for other clades | |
| # make changes around the parent clade | |
| left = clade.clades[0] | |
| right = clade.clades[1] | |
| parent = parents[clade] | |
| if clade == parent.clades[0]: | |
| sister = parent.clades[1] | |
| # neighbor 1 (parent + right) | |
| del parent.clades[1] | |
| del clade.clades[1] | |
| parent.clades.append(right) | |
| clade.clades.append(sister) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # neighbor 2 (parent + left) | |
| del parent.clades[1] | |
| del clade.clades[0] | |
| parent.clades.append(left) | |
| clade.clades.append(right) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # change back (parent + sister) | |
| del parent.clades[1] | |
| del clade.clades[0] | |
| parent.clades.append(sister) | |
| clade.clades.insert(0, left) | |
| else: | |
| sister = parent.clades[0] | |
| # neighbor 1 (parent + right) | |
| del parent.clades[0] | |
| del clade.clades[1] | |
| parent.clades.insert(0, right) | |
| clade.clades.append(sister) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # neighbor 2 (parent + left) | |
| del parent.clades[0] | |
| del clade.clades[0] | |
| parent.clades.insert(0, left) | |
| clade.clades.append(right) | |
| temp_tree = copy.deepcopy(tree) | |
| neighbors.append(temp_tree) | |
| # change back (parent + sister) | |
| del parent.clades[0] | |
| del clade.clades[0] | |
| parent.clades.insert(0, sister) | |
| clade.clades.insert(0, left) | |
| return neighbors | |
| # ######################## Parsimony Classes ########################## | |
| class ParsimonyScorer(Scorer): | |
| """Parsimony scorer with a scoring matrix. | |
| This is a combination of Fitch algorithm and Sankoff algorithm. | |
| See ParsimonyTreeConstructor for usage. | |
| :Parameters: | |
| matrix : _Matrix | |
| scoring matrix used in parsimony score calculation. | |
| """ | |
| def __init__(self, matrix=None): | |
| """Initialize the class.""" | |
| if not matrix or isinstance(matrix, _Matrix): | |
| self.matrix = matrix | |
| else: | |
| raise TypeError("Must provide a _Matrix object.") | |
| def get_score(self, tree, alignment): | |
| """Calculate parsimony score using the Fitch algorithm. | |
| Calculate and return the parsimony score given a tree and the | |
| MSA using either the Fitch algorithm (without a penalty matrix) | |
| or the Sankoff algorithm (with a matrix). | |
| """ | |
| # make sure the tree is rooted and bifurcating | |
| if not tree.is_bifurcating(): | |
| raise ValueError("The tree provided should be bifurcating.") | |
| if not tree.rooted: | |
| tree.root_at_midpoint() | |
| # sort tree terminals and alignment | |
| terms = tree.get_terminals() | |
| terms.sort(key=lambda term: term.name) | |
| alignment.sort() | |
| if isinstance(alignment, MultipleSeqAlignment): | |
| if not all(t.name == a.id for t, a in zip(terms, alignment)): | |
| raise ValueError( | |
| "Taxon names of the input tree should be the same with the alignment." | |
| ) | |
| else: # Alignment object | |
| if not all(t.name == s.id for t, s in zip(terms, alignment.sequences)): | |
| raise ValueError( | |
| "Taxon names of the input tree should be the same with the alignment." | |
| ) | |
| # term_align = dict(zip(terms, alignment)) | |
| score = 0 | |
| for i in range(len(alignment[0])): | |
| # parsimony score for column_i | |
| score_i = 0 | |
| # get column | |
| column_i = alignment[:, i] | |
| # skip non-informative column | |
| if column_i == len(column_i) * column_i[0]: | |
| continue | |
| # start calculating score_i using the tree and column_i | |
| # Fitch algorithm without the penalty matrix | |
| if not self.matrix: | |
| # init by mapping terminal clades and states in column_i | |
| clade_states = dict(zip(terms, [{c} for c in column_i])) | |
| for clade in tree.get_nonterminals(order="postorder"): | |
| clade_childs = clade.clades | |
| left_state = clade_states[clade_childs[0]] | |
| right_state = clade_states[clade_childs[1]] | |
| state = left_state & right_state | |
| if not state: | |
| state = left_state | right_state | |
| score_i += 1 | |
| clade_states[clade] = state | |
| # Sankoff algorithm with the penalty matrix | |
| else: | |
| inf = float("inf") | |
| # init score arrays for terminal clades | |
| alphabet = self.matrix.names | |
| length = len(alphabet) | |
| clade_scores = {} | |
| for j in range(len(column_i)): | |
| array = [inf] * length | |
| index = alphabet.index(column_i[j]) | |
| array[index] = 0 | |
| clade_scores[terms[j]] = array | |
| # bottom up calculation | |
| for clade in tree.get_nonterminals(order="postorder"): | |
| clade_childs = clade.clades | |
| left_score = clade_scores[clade_childs[0]] | |
| right_score = clade_scores[clade_childs[1]] | |
| array = [] | |
| for m in range(length): | |
| min_l = inf | |
| min_r = inf | |
| for n in range(length): | |
| sl = self.matrix[alphabet[m], alphabet[n]] + left_score[n] | |
| sr = self.matrix[alphabet[m], alphabet[n]] + right_score[n] | |
| if min_l > sl: | |
| min_l = sl | |
| if min_r > sr: | |
| min_r = sr | |
| array.append(min_l + min_r) | |
| clade_scores[clade] = array | |
| # minimum from root score | |
| score_i = min(array) | |
| # TODO: resolve internal states | |
| score += score_i | |
| return score | |
| class ParsimonyTreeConstructor(TreeConstructor): | |
| """Parsimony tree constructor. | |
| :Parameters: | |
| searcher : TreeSearcher | |
| tree searcher to search the best parsimony tree. | |
| starting_tree : Tree | |
| starting tree provided to the searcher. | |
| Examples | |
| -------- | |
| We will load an alignment, and then load various trees which have already been computed from it:: | |
| >>> from Bio import AlignIO, Phylo | |
| >>> aln = AlignIO.read(open('TreeConstruction/msa.phy'), 'phylip') | |
| >>> print(aln) | |
| Alignment with 5 rows and 13 columns | |
| AACGTGGCCACAT Alpha | |
| AAGGTCGCCACAC Beta | |
| CAGTTCGCCACAA Gamma | |
| GAGATTTCCGCCT Delta | |
| GAGATCTCCGCCC Epsilon | |
| Load a starting tree:: | |
| >>> starting_tree = Phylo.read('TreeConstruction/nj.tre', 'newick') | |
| >>> print(starting_tree) | |
| Tree(rooted=False, weight=1.0) | |
| Clade(branch_length=0.0, name='Inner3') | |
| Clade(branch_length=0.01421, name='Inner2') | |
| Clade(branch_length=0.23927, name='Inner1') | |
| Clade(branch_length=0.08531, name='Epsilon') | |
| Clade(branch_length=0.13691, name='Delta') | |
| Clade(branch_length=0.2923, name='Alpha') | |
| Clade(branch_length=0.07477, name='Beta') | |
| Clade(branch_length=0.17523, name='Gamma') | |
| Build the Parsimony tree from the starting tree:: | |
| >>> scorer = Phylo.TreeConstruction.ParsimonyScorer() | |
| >>> searcher = Phylo.TreeConstruction.NNITreeSearcher(scorer) | |
| >>> constructor = Phylo.TreeConstruction.ParsimonyTreeConstructor(searcher, starting_tree) | |
| >>> pars_tree = constructor.build_tree(aln) | |
| >>> print(pars_tree) | |
| Tree(rooted=True, weight=1.0) | |
| Clade(branch_length=0.0) | |
| Clade(branch_length=0.19732999999999998, name='Inner1') | |
| Clade(branch_length=0.13691, name='Delta') | |
| Clade(branch_length=0.08531, name='Epsilon') | |
| Clade(branch_length=0.04194000000000003, name='Inner2') | |
| Clade(branch_length=0.01421, name='Inner3') | |
| Clade(branch_length=0.17523, name='Gamma') | |
| Clade(branch_length=0.07477, name='Beta') | |
| Clade(branch_length=0.2923, name='Alpha') | |
| Same example, using the new Alignment class:: | |
| >>> from Bio import Align, Phylo | |
| >>> alignment = Align.read(open('TreeConstruction/msa.phy'), 'phylip') | |
| >>> print(alignment) | |
| Alpha 0 AACGTGGCCACAT 13 | |
| Beta 0 AAGGTCGCCACAC 13 | |
| Gamma 0 CAGTTCGCCACAA 13 | |
| Delta 0 GAGATTTCCGCCT 13 | |
| Epsilon 0 GAGATCTCCGCCC 13 | |
| <BLANKLINE> | |
| Load a starting tree:: | |
| >>> starting_tree = Phylo.read('TreeConstruction/nj.tre', 'newick') | |
| >>> print(starting_tree) | |
| Tree(rooted=False, weight=1.0) | |
| Clade(branch_length=0.0, name='Inner3') | |
| Clade(branch_length=0.01421, name='Inner2') | |
| Clade(branch_length=0.23927, name='Inner1') | |
| Clade(branch_length=0.08531, name='Epsilon') | |
| Clade(branch_length=0.13691, name='Delta') | |
| Clade(branch_length=0.2923, name='Alpha') | |
| Clade(branch_length=0.07477, name='Beta') | |
| Clade(branch_length=0.17523, name='Gamma') | |
| Build the Parsimony tree from the starting tree:: | |
| >>> scorer = Phylo.TreeConstruction.ParsimonyScorer() | |
| >>> searcher = Phylo.TreeConstruction.NNITreeSearcher(scorer) | |
| >>> constructor = Phylo.TreeConstruction.ParsimonyTreeConstructor(searcher, starting_tree) | |
| >>> pars_tree = constructor.build_tree(alignment) | |
| >>> print(pars_tree) | |
| Tree(rooted=True, weight=1.0) | |
| Clade(branch_length=0.0) | |
| Clade(branch_length=0.19732999999999998, name='Inner1') | |
| Clade(branch_length=0.13691, name='Delta') | |
| Clade(branch_length=0.08531, name='Epsilon') | |
| Clade(branch_length=0.04194000000000003, name='Inner2') | |
| Clade(branch_length=0.01421, name='Inner3') | |
| Clade(branch_length=0.17523, name='Gamma') | |
| Clade(branch_length=0.07477, name='Beta') | |
| Clade(branch_length=0.2923, name='Alpha') | |
| """ | |
| def __init__(self, searcher, starting_tree=None): | |
| """Initialize the class.""" | |
| self.searcher = searcher | |
| self.starting_tree = starting_tree | |
| def build_tree(self, alignment): | |
| """Build the tree. | |
| :Parameters: | |
| alignment : MultipleSeqAlignment | |
| multiple sequence alignment to calculate parsimony tree. | |
| """ | |
| # if starting_tree is none, | |
| # create a upgma tree with 'identity' scoring matrix | |
| if self.starting_tree is None: | |
| dtc = DistanceTreeConstructor(DistanceCalculator("identity"), "upgma") | |
| self.starting_tree = dtc.build_tree(alignment) | |
| return self.searcher.search(self.starting_tree, alignment) | |
| if __name__ == "__main__": | |
| from Bio._utils import run_doctest | |
| run_doctest() | |