Spaces:
No application file
No application file
| # Copyright 2012 by Eric Talevich. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Command-line wrapper for the tree inference program RAxML. | |
| Derived from the help page for RAxML version 7.3 by Alexandros Stamatakis, but | |
| should work for any version 7.X (and probably earlier for most options). | |
| """ | |
| from Bio.Application import _Option, _Switch, AbstractCommandline | |
class RaxmlCommandline(AbstractCommandline):
    """Command-line wrapper for the tree inference program RAxML.

    The required parameters are 'sequences' (-s), 'model' (-m) and 'name' (-n).
    The parameter 'parsimony_seed' (-p) must also be set for RAxML, but if you
    do not specify it, this wrapper will set the seed to 10000 for you.

    References
    ----------
    Stamatakis A.
    RAxML-VI-HPC: Maximum Likelihood-based Phylogenetic Analyses with
    Thousands of Taxa and Mixed Models.
    Bioinformatics 2006, 22(21):2688-2690.

    Homepage: http://sco.h-its.org/exelixis/software.html

    Examples
    --------
    >>> from Bio.Phylo.Applications import RaxmlCommandline
    >>> raxml_cline = RaxmlCommandline(sequences="Tests/Phylip/interlaced2.phy",
    ...                                model="PROTCATWAG", name="interlaced2")
    >>> print(raxml_cline)
    raxmlHPC -m PROTCATWAG -n interlaced2 -p 10000 -s Tests/Phylip/interlaced2.phy

    You would typically run the command line with raxml_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="raxmlHPC", **kwargs):
        """Initialize the class.

        Declares every RAxML option/switch known to this wrapper in
        ``self.parameters`` and then delegates to
        ``AbstractCommandline.__init__``.

        Parameters
        ----------
        cmd : str
            Name (or path) of the RAxML executable. Defaults to "raxmlHPC".
        **kwargs
            Values for the options declared below, keyed by their long
            names (for example ``sequences``, ``model``, ``name``).

        Notes
        -----
        If ``parsimony_seed`` is falsy after initialization (not given, or
        given as 0), it is set to 10000.

        """
        # NOTE: the two raw multi-line help strings below ('-f' and '-m') are
        # deliberately kept flush-left so that no source indentation leaks
        # into the help text.
        self.parameters = [
            _Option(
                ["-a", "weight_filename"],
                "Name of a column weight file to assign individual weights "
                "to each column of the alignment. Those weights must be "
                "integers separated by any type and number of whitespaces "
                "within a separate file.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-b", "bootstrap_seed"],
                "Random seed for bootstrapping.",
                equate=False,
            ),
            _Option(
                ["-c", "num_categories"],
                "Number of distinct rate categories for RAxML when "
                # Trailing space added: the next literal is concatenated
                # directly onto this one.
                "evolution model is set to GTRCAT or GTRMIX. "
                "Individual per-site rates are categorized into this "
                "many rate categories to accelerate computations. "
                "Default: 25.",
                equate=False,
            ),
            _Switch(
                ["-d", "random_starting_tree"],
                "Start ML optimization from random starting tree.",
            ),
            _Option(
                ["-e", "epsilon"],
                "Set model optimization precision in log likelihood units "
                "for final optimization of tree topology under MIX/MIXI "
                # Trailing space added (see '-c' above).
                "or GAMMA/GAMMAI. "
                "Default: 0.1 for models not using proportion of "
                "invariant sites estimate; 0.001 for models using "
                "proportion of invariant sites estimate.",
                equate=False,
            ),
            _Option(
                ["-E", "exclude_filename"],
                "An exclude file name, containing a specification of "
                "alignment positions you wish to exclude. Format is "
                "similar to Nexus, the file shall contain entries like "
                "'100-200 300-400'; to exclude a single column write, "
                "e.g., '100-100'. If you use a mixed model, an "
                "appropriately adapted model file will be written.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-f", "algorithm"],
                r"""
Select algorithm:
a: Rapid Bootstrap analysis and search for best-scoring ML
tree in one program run.
b: Draw bipartition information on a tree provided with '-t'
based on multiple trees (e.g. form a bootstrap) in a file
specified by '-z'.
c: Check if the alignment can be properly read by RAxML.
d: New rapid hill-climbing (DEFAULT).
e: Optimize model+branch lengths for given input tree under
GAMMA/GAMMAI only.
g: Compute per site log Likelihoods for one ore more trees
passed via '-z' and write them to a file that can be read
by CONSEL.
h: Compute log likelihood test (SH-test) between best tree
passed via '-t' and a bunch of other trees passed via '-z'.
i: Perform a really thorough bootstrap, refinement of final
bootstrap tree under GAMMA and a more exhaustive algorithm.
j: Generate a bunch of bootstrapped alignment files from an
original alignment file.
m: Compare bipartitions between two bunches of trees passed
via '-t' and '-z' respectively. This will return the
Pearson correlation between all bipartitions found in the
two tree files. A file called
RAxML_bipartitionFrequencies.outputFileName will be
printed that contains the pair-wise bipartition
frequencies of the two sets.
n: Compute the log likelihood score of all trees contained
in a tree file provided by '-z' under GAMMA or
GAMMA+P-Invar.
o: Old and slower rapid hill-climbing.
p: Perform pure stepwise MP addition of new sequences to an
incomplete starting tree.
s: Split up a multi-gene partitioned alignment into the
respective subalignments.
t: Do randomized tree searches on one fixed starting tree.
w: Compute ELW test on a bunch of trees passed via '-z'.
x: Compute pair-wise ML distances, ML model parameters will
be estimated on an MP starting tree or a user-defined
tree passed via '-t', only allowed for GAMMA-based models
of rate heterogeneity.
""",
                # Every algorithm is selected by a single-character code.
                checker_function=(lambda x: isinstance(x, str) and len(x) == 1),
                equate=False,
            ),
            _Option(
                ["-g", "grouping_constraint"],
                "File name of a multifurcating constraint tree. "
                "this tree does not need to be comprehensive, i.e. "
                "contain all taxa.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-i", "rearrangements"],
                "Initial rearrangement setting for the subsequent "
                "application of topological changes phase.",
                equate=False,
            ),
            _Switch(
                ["-j", "checkpoints"],
                "Write checkpoints (intermediate tree topologies).",
            ),
            _Switch(
                ["-k", "bootstrap_branch_lengths"],
                "Print bootstrapped trees with branch lengths. "
                "The bootstraps will run a bit longer, because model "
                "parameters will be optimized at the end of each run. "
                "Use with CATMIX/PROTMIX or GAMMA/GAMMAI.",
            ),
            _Option(
                ["-l", "cluster_threshold"],
                "Threshold for sequence similarity clustering. "
                "RAxML will then print out an alignment to a file "
                "called sequenceFileName.reducedBy.threshold that "
                "only contains sequences <= the specified threshold "
                "that must be between 0.0 and 1.0. RAxML uses the "
                "QT-clustering algorithm to perform this task. "
                "In addition, a file called "
                "RAxML_reducedList.outputFileName will be written "
                "that contains clustering information.",
                equate=False,
            ),
            _Option(
                ["-L", "cluster_threshold_fast"],
                "Same functionality as '-l', but uses a less "
                "exhaustive and thus faster clustering algorithm. "
                "This is intended for very large datasets with more "
                "than 20,000-30,000 sequences.",
                equate=False,
            ),
            _Option(
                ["-m", "model"],
                r"""Model of Nucleotide or Amino Acid Substitution:
NUCLEOTIDES:
GTRCAT : GTR + Optimization of substitution rates + Optimization of site-specific
evolutionary rates which are categorized into numberOfCategories distinct
rate categories for greater computational efficiency
if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program
will use GTRMIX instead
GTRGAMMA : GTR + Optimization of substitution rates + GAMMA model of rate
heterogeneity (alpha parameter will be estimated)
GTRMIX : Inference of the tree under GTRCAT
and thereafter evaluation of the final tree topology under GTRGAMMA
GTRCAT_GAMMA : Inference of the tree with site-specific evolutionary rates.
However, here rates are categorized using the 4 discrete GAMMA rates.
Evaluation of the final tree topology under GTRGAMMA
GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of invariable sites
GTRMIXI : Same as GTRMIX, but with estimate of proportion of invariable sites
GTRCAT_GAMMAI : Same as GTRCAT_GAMMA, but with estimate of proportion of invariable sites
AMINO ACIDS:
PROTCATmatrixName[F] : specified AA matrix + Optimization of substitution rates + Optimization of site-specific
evolutionary rates which are categorized into numberOfCategories distinct
rate categories for greater computational efficiency
if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program
will use PROTMIX... instead
PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of substitution rates + GAMMA model of rate
heterogeneity (alpha parameter will be estimated)
PROTMIXmatrixName[F] : Inference of the tree under specified AA matrix + CAT
and thereafter evaluation of the final tree topology under specified AA matrix + GAMMA
PROTCAT_GAMMAmatrixName[F] : Inference of the tree under specified AA matrix and site-specific evolutionary rates.
However, here rates are categorized using the 4 discrete GAMMA rates.
Evaluation of the final tree topology under specified AA matrix + GAMMA
PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but with estimate of proportion of invariable sites
PROTMIXImatrixName[F] : Same as PROTMIXmatrixName[F], but with estimate of proportion of invariable sites
PROTCAT_GAMMAImatrixName[F] : Same as PROTCAT_GAMMAmatrixName[F], but with estimate of proportion of invariable sites
Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV, CPREV, VT, BLOSUM62, MTMAM, GTR
With the optional 'F' appendix you can specify if you want to use empirical base frequencies
Please not that for mixed models you can in addition specify the per-gene AA model in
the mixed model file (see manual for details)
""",
                equate=False,
            ),
            _Switch(
                ["-M", "partition_branch_lengths"],
                "Switch on estimation of individual per-partition "
                "branch lengths. Only has effect when used in "
                "combination with 'partition_filename' ('-q'). "
                "Branch lengths for individual partitions will be "
                "printed to separate files. A weighted average of the "
                "branch lengths is computed by using the respective "
                "partition lengths. ",
            ),
            _Option(
                ["-n", "name"],
                "Name used in the output files.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-o", "outgroup"],
                "Name of a single outgroup or a comma-separated list "
                "of outgroups, eg '-o Rat' or '-o Rat,Mouse'. In case "
                "that multiple outgroups are not monophyletic the "
                "first name in the list will be selected as outgroup. "
                "Don't leave spaces between taxon names!",
                # Reject values containing whitespace (and empty values).
                checker_function=lambda x: len(x.split()) == 1,
                equate=False,
            ),
            _Option(
                ["-q", "partition_filename"],
                "File name containing the assignment of models to "
                "alignment partitions for multiple models of "
                "substitution. For the syntax of this file please "
                "consult the RAxML manual.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-p", "parsimony_seed"],
                "Random number seed for the parsimony inferences. "
                "This allows you to reproduce your results and will "
                "help developers debug the program. This option HAS "
                "NO EFFECT in the parallel MPI version.",
                equate=False,
            ),
            _Option(
                ["-P", "protein_model"],
                "File name of a user-defined AA (Protein) substitution "
                "model. This file must contain 420 entries, the first "
                "400 being the AA substitution rates (this must be a "
                "symmetric matrix) and the last 20 are the empirical "
                "base frequencies.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-r", "binary_constraint"],
                "File name of a binary constraint tree. "
                "This tree does not need to be comprehensive, i.e. "
                "contain all taxa.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-s", "sequences"],
                "Name of the alignment data file, in PHYLIP format.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-t", "starting_tree"],
                "File name of a user starting tree, in Newick format.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-T", "threads"],
                "Number of threads to run. "
                "PTHREADS VERSION ONLY! "
                "Make sure to set this at most the number of CPUs "
                "you have on your machine, otherwise, there will be "
                "a huge performance decrease!",
                equate=False,
            ),
            _Option(
                ["-u", "num_bootstrap_searches"],
                "Number of multiple bootstrap searches per replicate. "
                "Use this to obtain better ML trees for each "
                "replicate. Default: 1 ML search per bootstrap "
                "replicate.",
                equate=False,
            ),
            _Switch(["-v", "version"], "Display version information."),
            _Option(
                ["-w", "working_dir"],
                "Name of the working directory where RAxML will "
                "write its output files. Default: current directory.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-x", "rapid_bootstrap_seed"],
                "Random seed for rapid bootstrapping.",
                equate=False,
            ),
            _Switch(
                ["-y", "parsimony"],
                "Only compute a parsimony starting tree, then exit.",
            ),
            _Option(
                ["-z", "bipartition_filename"],
                "Name of a file containing multiple trees, e.g. from "
                "a bootstrap run, that shall be used to draw "
                "bipartition values onto a tree provided with '-t'. "
                "It can also be used to compute per-site log "
                "likelihoods in combination with '-f g', and to read "
                "a bunch of trees for a couple of other options "
                "('-f h', '-f m', '-f n').",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-N", "-#", "num_replicates"],
                "Number of alternative runs on distinct starting trees. "
                "In combination with the '-b' option, this will invoke a "
                "multiple bootstrap analysis. "
                # Trailing space added (see '-c' above).
                "DEFAULT: 1 single analysis. "
                "Note that '-N' has been added as an alternative since "
                "'-#' sometimes caused problems with certain MPI job "
                "submission systems, since '-#' is often used to start "
                "comments. ",
                equate=False,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
        # ENH: enforce -s, -n and -m
        # RAxML needs a parsimony seed (-p); supply a fixed default when the
        # caller did not provide one so the generated command is runnable.
        if not self.parsimony_seed:
            self.parsimony_seed = 10000
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module's
    # docstrings via Biopython's helper.
    from Bio._utils import run_doctest as _run_module_doctests

    _run_module_doctests()