Spaces:
No application file
No application file
| # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Classes and methods for finding consensus trees. | |
| This module contains a ``_BitString`` class to assist the consensus tree | |
| searching and some common consensus algorithms such as strict, majority rule and | |
| adam consensus. | |
| """ | |
| import random | |
| import itertools | |
| from ast import literal_eval | |
| from Bio.Phylo import BaseTree | |
| from Bio.Align import MultipleSeqAlignment | |
| class _BitString(str): | |
| """Helper class for binary string data (PRIVATE). | |
| Assistant class of binary string data used for storing and | |
| counting compatible clades in consensus tree searching. It includes | |
| some binary manipulation(&|^~) methods. | |
| _BitString is a sub-class of ``str`` object that only accepts two | |
| characters('0' and '1'), with additional functions for binary-like | |
| manipulation(&|^~). It is used to count and store the clades in | |
| multiple trees in consensus tree searching. During counting, the | |
| clades will be considered the same if their terminals(in terms of | |
| ``name`` attribute) are the same. | |
| For example, let's say two trees are provided as below to search | |
| their strict consensus tree:: | |
| tree1: (((A, B), C),(D, E)) | |
| tree2: ((A, (B, C)),(D, E)) | |
| For both trees, a _BitString object '11111' will represent their | |
| root clade. Each '1' stands for the terminal clade in the list | |
| [A, B, C, D, E](the order might not be the same, it's determined | |
| by the ``get_terminal`` method of the first tree provided). For | |
| the clade ((A, B), C) in tree1 and (A, (B, C)) in tree2, they both | |
| can be represented by '11100'. Similarly, '11000' represents clade | |
| (A, B) in tree1, '01100' represents clade (B, C) in tree2, and '00011' | |
| represents clade (D, E) in both trees. | |
| So, with the ``_count_clades`` function in this module, finally we | |
| can get the clade counts and their _BitString representation as follows | |
| (the root and terminals are omitted):: | |
| clade _BitString count | |
| ABC '11100' 2 | |
| DE '00011' 2 | |
| AB '11000' 1 | |
| BC '01100' 1 | |
| To get the _BitString representation of a clade, we can use the following | |
| code snippet:: | |
| # suppose we are provided with a tree list, the first thing to do is | |
| # to get all the terminal names in the first tree | |
| term_names = [term.name for term in trees[0].get_terminals()] | |
| # for a specific clade in any of the tree, also get its terminal names | |
| clade_term_names = [term.name for term in clade.get_terminals()] | |
| # then create a boolean list | |
| boolvals = [name in clade_term_names for name in term_names] | |
| # create the string version and pass it to _BitString | |
| bitstr = _BitString(''.join(map(str, map(int, boolvals)))) | |
| # or, equivalently: | |
| bitstr = _BitString.from_bool(boolvals) | |
| To convert back:: | |
| # get all the terminal clades of the first tree | |
| terms = [term for term in trees[0].get_terminals()] | |
| # get the index of terminal clades in bitstr | |
| index_list = bitstr.index_one() | |
| # get all terminal clades by index | |
| clade_terms = [terms[i] for i in index_list] | |
| # create a new calde and append all the terminal clades | |
| new_clade = BaseTree.Clade() | |
| new_clade.clades.extend(clade_terms) | |
| Examples | |
| -------- | |
| >>> from Bio.Phylo.Consensus import _BitString | |
| >>> bitstr1 = _BitString('11111') | |
| >>> bitstr2 = _BitString('11100') | |
| >>> bitstr3 = _BitString('01101') | |
| >>> bitstr1 | |
| _BitString('11111') | |
| >>> bitstr2 & bitstr3 | |
| _BitString('01100') | |
| >>> bitstr2 | bitstr3 | |
| _BitString('11101') | |
| >>> bitstr2 ^ bitstr3 | |
| _BitString('10001') | |
| >>> bitstr2.index_one() | |
| [0, 1, 2] | |
| >>> bitstr3.index_one() | |
| [1, 2, 4] | |
| >>> bitstr3.index_zero() | |
| [0, 3] | |
| >>> bitstr1.contains(bitstr2) | |
| True | |
| >>> bitstr2.contains(bitstr3) | |
| False | |
| >>> bitstr2.independent(bitstr3) | |
| False | |
| >>> bitstr1.iscompatible(bitstr2) | |
| True | |
| >>> bitstr2.iscompatible(bitstr3) | |
| False | |
| """ | |
| def __new__(cls, strdata): | |
| """Init from a binary string data.""" | |
| if isinstance(strdata, str) and len(strdata) == strdata.count( | |
| "0" | |
| ) + strdata.count("1"): | |
| return str.__new__(cls, strdata) | |
| else: | |
| raise TypeError( | |
| "The input should be a binary string composed of '0' and '1'" | |
| ) | |
| def __and__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = selfint & otherint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __or__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = selfint | otherint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __xor__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = selfint ^ otherint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __rand__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = otherint & selfint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __ror__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = otherint | selfint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __rxor__(self, other): | |
| selfint = literal_eval("0b" + self) | |
| otherint = literal_eval("0b" + other) | |
| resultint = otherint ^ selfint | |
| return _BitString(bin(resultint)[2:].zfill(len(self))) | |
| def __repr__(self): | |
| return "_BitString(" + str.__repr__(self) + ")" | |
| def index_one(self): | |
| """Return a list of positions where the element is '1'.""" | |
| return [i for i, n in enumerate(self) if n == "1"] | |
| def index_zero(self): | |
| """Return a list of positions where the element is '0'.""" | |
| return [i for i, n in enumerate(self) if n == "0"] | |
| def contains(self, other): | |
| """Check if current bitstr1 contains another one bitstr2. | |
| That is to say, the bitstr2.index_one() is a subset of | |
| bitstr1.index_one(). | |
| Examples: | |
| "011011" contains "011000", "011001", "000011" | |
| Be careful, "011011" also contains "000000". Actually, all _BitString | |
| objects contain all-zero _BitString of the same length. | |
| """ | |
| xorbit = self ^ other | |
| return xorbit.count("1") == self.count("1") - other.count("1") | |
| def independent(self, other): | |
| """Check if current bitstr1 is independent of another one bitstr2. | |
| That is to say the bitstr1.index_one() and bitstr2.index_one() have | |
| no intersection. | |
| Be careful, all _BitString objects are independent of all-zero _BitString | |
| of the same length. | |
| """ | |
| xorbit = self ^ other | |
| return xorbit.count("1") == self.count("1") + other.count("1") | |
| def iscompatible(self, other): | |
| """Check if current bitstr1 is compatible with another bitstr2. | |
| Two conditions are considered as compatible: | |
| 1. bitstr1.contain(bitstr2) or vice versa; | |
| 2. bitstr1.independent(bitstr2). | |
| """ | |
| return self.contains(other) or other.contains(self) or self.independent(other) | |
| def from_bool(cls, bools): | |
| return cls("".join(map(str, map(int, bools)))) | |
| def strict_consensus(trees): | |
| """Search strict consensus tree from multiple trees. | |
| :Parameters: | |
| trees : iterable | |
| iterable of trees to produce consensus tree. | |
| """ | |
| trees_iter = iter(trees) | |
| first_tree = next(trees_iter) | |
| terms = first_tree.get_terminals() | |
| bitstr_counts, tree_count = _count_clades(itertools.chain([first_tree], trees_iter)) | |
| # Store bitstrs for strict clades | |
| strict_bitstrs = [ | |
| bitstr for bitstr, t in bitstr_counts.items() if t[0] == tree_count | |
| ] | |
| strict_bitstrs.sort(key=lambda bitstr: bitstr.count("1"), reverse=True) | |
| # Create root | |
| root = BaseTree.Clade() | |
| if strict_bitstrs[0].count("1") == len(terms): | |
| root.clades.extend(terms) | |
| else: | |
| raise ValueError("Taxons in provided trees should be consistent") | |
| # make a bitstr to clades dict and store root clade | |
| bitstr_clades = {strict_bitstrs[0]: root} | |
| # create inner clades | |
| for bitstr in strict_bitstrs[1:]: | |
| clade_terms = [terms[i] for i in bitstr.index_one()] | |
| clade = BaseTree.Clade() | |
| clade.clades.extend(clade_terms) | |
| for bs, c in bitstr_clades.items(): | |
| # check if it should be the parent of current clade | |
| if bs.contains(bitstr): | |
| # remove old bitstring | |
| del bitstr_clades[bs] | |
| # update clade childs | |
| new_childs = [child for child in c.clades if child not in clade_terms] | |
| c.clades = new_childs | |
| # set current clade as child of c | |
| c.clades.append(clade) | |
| # update bitstring | |
| bs = bs ^ bitstr | |
| # update clade | |
| bitstr_clades[bs] = c | |
| break | |
| # put new clade | |
| bitstr_clades[bitstr] = clade | |
| return BaseTree.Tree(root=root) | |
| def majority_consensus(trees, cutoff=0): | |
| """Search majority rule consensus tree from multiple trees. | |
| This is a extend majority rule method, which means the you can set any | |
| cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0 to | |
| create a relaxed binary consensus tree in any condition (as long as one of | |
| the provided trees is a binary tree). The branch length of each consensus | |
| clade in the result consensus tree is the average length of all counts for | |
| that clade. | |
| :Parameters: | |
| trees : iterable | |
| iterable of trees to produce consensus tree. | |
| """ | |
| tree_iter = iter(trees) | |
| first_tree = next(tree_iter) | |
| terms = first_tree.get_terminals() | |
| bitstr_counts, tree_count = _count_clades(itertools.chain([first_tree], tree_iter)) | |
| # Sort bitstrs by descending #occurrences, then #tips, then tip order | |
| bitstrs = sorted( | |
| bitstr_counts.keys(), | |
| key=lambda bitstr: (bitstr_counts[bitstr][0], bitstr.count("1"), str(bitstr)), | |
| reverse=True, | |
| ) | |
| root = BaseTree.Clade() | |
| if bitstrs[0].count("1") == len(terms): | |
| root.clades.extend(terms) | |
| else: | |
| raise ValueError("Taxons in provided trees should be consistent") | |
| # Make a bitstr-to-clades dict and store root clade | |
| bitstr_clades = {bitstrs[0]: root} | |
| # create inner clades | |
| for bitstr in bitstrs[1:]: | |
| # apply majority rule | |
| count_in_trees, branch_length_sum = bitstr_counts[bitstr] | |
| confidence = 100.0 * count_in_trees / tree_count | |
| if confidence < cutoff * 100.0: | |
| break | |
| clade_terms = [terms[i] for i in bitstr.index_one()] | |
| clade = BaseTree.Clade() | |
| clade.clades.extend(clade_terms) | |
| clade.confidence = confidence | |
| clade.branch_length = branch_length_sum / count_in_trees | |
| bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count("1"), reverse=True) | |
| # check if current clade is compatible with previous clades and | |
| # record its possible parent and child clades. | |
| compatible = True | |
| parent_bitstr = None | |
| child_bitstrs = [] # multiple independent childs | |
| for bs in bsckeys: | |
| if not bs.iscompatible(bitstr): | |
| compatible = False | |
| break | |
| # assign the closest ancestor as its parent | |
| # as bsckeys is sorted, it should be the last one | |
| if bs.contains(bitstr): | |
| parent_bitstr = bs | |
| # assign the closest descendant as its child | |
| # the largest and independent clades | |
| if ( | |
| bitstr.contains(bs) | |
| and bs != bitstr | |
| and all(c.independent(bs) for c in child_bitstrs) | |
| ): | |
| child_bitstrs.append(bs) | |
| if not compatible: | |
| continue | |
| if parent_bitstr: | |
| # insert current clade; remove old bitstring | |
| parent_clade = bitstr_clades.pop(parent_bitstr) | |
| # update parent clade childs | |
| parent_clade.clades = [ | |
| c for c in parent_clade.clades if c not in clade_terms | |
| ] | |
| # set current clade as child of parent_clade | |
| parent_clade.clades.append(clade) | |
| # update bitstring | |
| # parent = parent ^ bitstr | |
| # update clade | |
| bitstr_clades[parent_bitstr] = parent_clade | |
| if child_bitstrs: | |
| remove_list = [] | |
| for c in child_bitstrs: | |
| remove_list.extend(c.index_one()) | |
| child_clade = bitstr_clades[c] | |
| parent_clade.clades.remove(child_clade) | |
| clade.clades.append(child_clade) | |
| remove_terms = [terms[i] for i in remove_list] | |
| clade.clades = [c for c in clade.clades if c not in remove_terms] | |
| # put new clade | |
| bitstr_clades[bitstr] = clade | |
| if (len(bitstr_clades) == len(terms) - 1) or ( | |
| len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3 | |
| ): | |
| break | |
| return BaseTree.Tree(root=root) | |
| def adam_consensus(trees): | |
| """Search Adam Consensus tree from multiple trees. | |
| :Parameters: | |
| trees : list | |
| list of trees to produce consensus tree. | |
| """ | |
| clades = [tree.root for tree in trees] | |
| return BaseTree.Tree(root=_part(clades), rooted=True) | |
| def _part(clades): | |
| """Recursive function for Adam Consensus algorithm (PRIVATE).""" | |
| new_clade = None | |
| terms = clades[0].get_terminals() | |
| term_names = [term.name for term in terms] | |
| if len(terms) == 1 or len(terms) == 2: | |
| new_clade = clades[0] | |
| else: | |
| bitstrs = {_BitString("1" * len(terms))} | |
| for clade in clades: | |
| for child in clade.clades: | |
| bitstr = _clade_to_bitstr(child, term_names) | |
| to_remove = set() | |
| to_add = set() | |
| for bs in bitstrs: | |
| if bs == bitstr: | |
| continue | |
| elif bs.contains(bitstr): | |
| to_add.add(bitstr) | |
| to_add.add(bs ^ bitstr) | |
| to_remove.add(bs) | |
| elif bitstr.contains(bs): | |
| to_add.add(bs ^ bitstr) | |
| elif not bs.independent(bitstr): | |
| to_add.add(bs & bitstr) | |
| to_add.add(bs & bitstr ^ bitstr) | |
| to_add.add(bs & bitstr ^ bs) | |
| to_remove.add(bs) | |
| # bitstrs = bitstrs | to_add | |
| bitstrs ^= to_remove | |
| if to_add: | |
| for ta in sorted(to_add, key=lambda bs: bs.count("1")): | |
| independent = True | |
| for bs in bitstrs: | |
| if not ta.independent(bs): | |
| independent = False | |
| break | |
| if independent: | |
| bitstrs.add(ta) | |
| new_clade = BaseTree.Clade() | |
| for bitstr in sorted(bitstrs): | |
| indices = bitstr.index_one() | |
| if len(indices) == 1: | |
| new_clade.clades.append(terms[indices[0]]) | |
| elif len(indices) == 2: | |
| bifur_clade = BaseTree.Clade() | |
| bifur_clade.clades.append(terms[indices[0]]) | |
| bifur_clade.clades.append(terms[indices[1]]) | |
| new_clade.clades.append(bifur_clade) | |
| elif len(indices) > 2: | |
| part_names = [term_names[i] for i in indices] | |
| next_clades = [] | |
| for clade in clades: | |
| next_clades.append(_sub_clade(clade, part_names)) | |
| # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades] | |
| new_clade.clades.append(_part(next_clades)) | |
| return new_clade | |
| def _sub_clade(clade, term_names): | |
| """Extract a compatible subclade that only contains the given terminal names (PRIVATE).""" | |
| term_clades = [clade.find_any(name) for name in term_names] | |
| sub_clade = clade.common_ancestor(term_clades) | |
| if len(term_names) != sub_clade.count_terminals(): | |
| temp_clade = BaseTree.Clade() | |
| temp_clade.clades.extend(term_clades) | |
| for c in sub_clade.find_clades(terminal=False, order="preorder"): | |
| if c == sub_clade.root: | |
| continue | |
| childs = set(c.find_clades(terminal=True)) & set(term_clades) | |
| if childs: | |
| for tc in temp_clade.find_clades(terminal=False, order="preorder"): | |
| tc_childs = set(tc.clades) | |
| tc_new_clades = tc_childs - childs | |
| if childs.issubset(tc_childs) and tc_new_clades: | |
| tc.clades = list(tc_new_clades) | |
| child_clade = BaseTree.Clade() | |
| child_clade.clades.extend(list(childs)) | |
| tc.clades.append(child_clade) | |
| sub_clade = temp_clade | |
| return sub_clade | |
| def _count_clades(trees): | |
| """Count distinct clades (different sets of terminal names) in the trees (PRIVATE). | |
| Return a tuple first a dict of bitstring (representing clade) and a tuple of its count of | |
| occurrences and sum of branch length for that clade, second the number of trees processed. | |
| :Parameters: | |
| trees : iterable | |
| An iterable that returns the trees to count | |
| """ | |
| bitstrs = {} | |
| tree_count = 0 | |
| for tree in trees: | |
| tree_count += 1 | |
| clade_bitstrs = _tree_to_bitstrs(tree) | |
| for clade in tree.find_clades(terminal=False): | |
| bitstr = clade_bitstrs[clade] | |
| if bitstr in bitstrs: | |
| count, sum_bl = bitstrs[bitstr] | |
| count += 1 | |
| sum_bl += clade.branch_length or 0 | |
| bitstrs[bitstr] = (count, sum_bl) | |
| else: | |
| bitstrs[bitstr] = (1, clade.branch_length or 0) | |
| return bitstrs, tree_count | |
| def get_support(target_tree, trees, len_trees=None): | |
| """Calculate branch support for a target tree given bootstrap replicate trees. | |
| :Parameters: | |
| target_tree : Tree | |
| tree to calculate branch support for. | |
| trees : iterable | |
| iterable of trees used to calculate branch support. | |
| len_trees : int | |
| optional count of replicates in trees. len_trees must be provided | |
| when len(trees) is not a valid operation. | |
| """ | |
| term_names = sorted(term.name for term in target_tree.find_clades(terminal=True)) | |
| bitstrs = {} | |
| size = len_trees | |
| if size is None: | |
| try: | |
| size = len(trees) | |
| except TypeError: | |
| raise TypeError( | |
| "Trees does not support len(trees), " | |
| "you must provide the number of replicates in trees " | |
| "as the optional parameter len_trees." | |
| ) from None | |
| for clade in target_tree.find_clades(terminal=False): | |
| bitstr = _clade_to_bitstr(clade, term_names) | |
| bitstrs[bitstr] = (clade, 0) | |
| for tree in trees: | |
| for clade in tree.find_clades(terminal=False): | |
| bitstr = _clade_to_bitstr(clade, term_names) | |
| if bitstr in bitstrs: | |
| c, t = bitstrs[bitstr] | |
| c.confidence = (t + 1) * 100.0 / size | |
| bitstrs[bitstr] = (c, t + 1) | |
| return target_tree | |
| def bootstrap(msa, times): | |
| """Generate bootstrap replicates from a multiple sequence alignment (OBSOLETE). | |
| :Parameters: | |
| msa : MultipleSeqAlignment | |
| multiple sequence alignment to generate replicates. | |
| times : int | |
| number of bootstrap times. | |
| """ | |
| length = len(msa[0]) | |
| i = 0 | |
| while i < times: | |
| i += 1 | |
| item = None | |
| for j in range(length): | |
| col = random.randint(0, length - 1) | |
| if not item: | |
| item = msa[:, col : col + 1] | |
| else: | |
| item += msa[:, col : col + 1] | |
| yield item | |
| def bootstrap_trees(alignment, times, tree_constructor): | |
| """Generate bootstrap replicate trees from a multiple sequence alignment. | |
| :Parameters: | |
| alignment : Alignment or MultipleSeqAlignment object | |
| multiple sequence alignment to generate replicates. | |
| times : int | |
| number of bootstrap times. | |
| tree_constructor : TreeConstructor | |
| tree constructor to be used to build trees. | |
| """ | |
| if isinstance(alignment, MultipleSeqAlignment): | |
| length = len(alignment[0]) | |
| for i in range(times): | |
| bootstrapped_alignment = None | |
| for j in range(length): | |
| col = random.randint(0, length - 1) | |
| if bootstrapped_alignment is None: | |
| bootstrapped_alignment = alignment[:, col : col + 1] | |
| else: | |
| bootstrapped_alignment += alignment[:, col : col + 1] | |
| tree = tree_constructor.build_tree(alignment) | |
| yield tree | |
| else: | |
| n, m = alignment.shape | |
| for i in range(times): | |
| cols = [random.randint(0, m - 1) for j in range(m)] | |
| tree = tree_constructor.build_tree(alignment[:, cols]) | |
| yield tree | |
| def bootstrap_consensus(alignment, times, tree_constructor, consensus): | |
| """Consensus tree of a series of bootstrap trees for a multiple sequence alignment. | |
| :Parameters: | |
| alignment : Alignment or MultipleSeqAlignment object | |
| Multiple sequence alignment to generate replicates. | |
| times : int | |
| Number of bootstrap times. | |
| tree_constructor : TreeConstructor | |
| Tree constructor to be used to build trees. | |
| consensus : function | |
| Consensus method in this module: ``strict_consensus``, | |
| ``majority_consensus``, ``adam_consensus``. | |
| """ | |
| trees = bootstrap_trees(alignment, times, tree_constructor) | |
| tree = consensus(trees) | |
| return tree | |
| def _clade_to_bitstr(clade, tree_term_names): | |
| """Create a BitString representing a clade, given ordered tree taxon names (PRIVATE).""" | |
| clade_term_names = {term.name for term in clade.find_clades(terminal=True)} | |
| return _BitString.from_bool((name in clade_term_names) for name in tree_term_names) | |
| def _tree_to_bitstrs(tree): | |
| """Create a dict of a tree's clades to corresponding BitStrings (PRIVATE).""" | |
| clades_bitstrs = {} | |
| term_names = [term.name for term in tree.find_clades(terminal=True)] | |
| for clade in tree.find_clades(terminal=False): | |
| bitstr = _clade_to_bitstr(clade, term_names) | |
| clades_bitstrs[clade] = bitstr | |
| return clades_bitstrs | |
| def _bitstring_topology(tree): | |
| """Generate a branch length dict for a tree, keyed by BitStrings (PRIVATE). | |
| Create a dict of all clades' BitStrings to the corresponding branch | |
| lengths (rounded to 5 decimal places). | |
| """ | |
| bitstrs = {} | |
| for clade, bitstr in _tree_to_bitstrs(tree).items(): | |
| bitstrs[bitstr] = round(clade.branch_length or 0.0, 5) | |
| return bitstrs | |
| def _equal_topology(tree1, tree2): | |
| """Are two trees are equal in terms of topology and branch lengths (PRIVATE). | |
| (Branch lengths checked to 5 decimal places.) | |
| """ | |
| term_names1 = {term.name for term in tree1.find_clades(terminal=True)} | |
| term_names2 = {term.name for term in tree2.find_clades(terminal=True)} | |
| return (term_names1 == term_names2) and ( | |
| _bitstring_topology(tree1) == _bitstring_topology(tree2) | |
| ) | |