Spaces:
No application file
No application file
| # Copyright 2009 by Cymon J. Cox. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Command line wrapper for the multiple alignment programme MAFFT.""" | |
| from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline | |
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Notes
    -----
    Last checked against version: MAFFT v6.717b (2009/12/03)

    References
    ----------
    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
    MAFFT: a novel method for rapid multiple sequence alignment based on
    fast Fourier transform

    Examples
    --------
    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    """

    def __init__(self, cmd="mafft", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - name or path of the MAFFT executable (default "mafft").
         - kwargs - forwarded unchanged to AbstractCommandline.__init__;
           as the class examples show, options declared below are given
           by their Python name (e.g. ``input="seqs.fasta"``).

        """
        BLOSUM_MATRICES = ("30", "45", "62", "80")

        def _is_int(value):
            """Accept any int (same rule the integer options always used)."""
            return isinstance(value, int)

        def _is_count(value):
            """Accept an int but not a bool, for options needing a real number."""
            return isinstance(value, int) and not isinstance(value, bool)

        def _is_number(value):
            """Accept floats and plain ints (e.g. ``op=2``), but never bools."""
            return isinstance(value, (int, float)) and not isinstance(value, bool)

        def _is_blosum(value):
            """Accept a supported BLOSUM number given as str or int."""
            return str(value) in BLOSUM_MATRICES

        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
            # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers. Default: on
            _Switch(
                ["--6merpair", "6merpair", "sixmerpair"],
                "Distance is calculated based on the number of shared "
                "6mers. Default: on",
            ),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of globally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (G-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--globalpair", "globalpair"],
                "All pairwise alignments are computed with the "
                "Needleman-Wunsch algorithm. Default: off",
            ),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of locally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (L-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--localpair", "localpair"],
                "All pairwise alignments are computed with the "
                "Smith-Waterman algorithm. Default: off",
            ),
            # All pairwise alignments are computed with a local algorithm with
            # the generalized affine gap cost (Altschul 1998). More accurate but
            # slower than --6merpair. Suitable when large internal gaps are
            # expected. Applicable to up to ~200 sequences. A combination with --
            # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
            # distance is used)
            _Switch(
                ["--genafpair", "genafpair"],
                "All pairwise alignments are computed with a local "
                "algorithm with the generalized affine gap cost "
                "(Altschul 1998). Default: off",
            ),
            # All pairwise alignments are computed with FASTA (Pearson and Lipman
            # 1988). FASTA is required. Default: off (6mer distance is used)
            _Switch(
                ["--fastapair", "fastapair"],
                "All pairwise alignments are computed with FASTA "
                "(Pearson and Lipman 1988). Default: off",
            ),
            # Weighting factor for the consistency term calculated from pairwise
            # alignments. Valid when either of --globalpair, --localpair, --
            # genafpair, --fastapair or --blastpair is selected. Default: 2.7
            _Option(
                ["--weighti", "weighti"],
                "Weighting factor for the consistency term calculated "
                "from pairwise alignments. Default: 2.7",
                checker_function=_is_number,
                equate=False,
            ),
            # Guide tree is built number times in the progressive stage. Valid
            # with 6mer distance. Default: 2
            _Option(
                ["--retree", "retree"],
                "Guide tree is built number times in the progressive "
                "stage. Valid with 6mer distance. Default: 2",
                checker_function=_is_int,
                equate=False,
            ),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(
                ["--maxiterate", "maxiterate"],
                "Number cycles of iterative refinement are performed. Default: 0",
                checker_function=_is_int,
                equate=False,
            ),
            # Number of threads to use. Default: 1
            _Option(
                ["--thread", "thread"],
                "Number of threads to use. Default: 1",
                checker_function=_is_int,
                equate=False,
            ),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(
                ["--fft", "fft"],
                "Use FFT approximation in group-to-group alignment. Default: on",
            ),
            # Do not use FFT approximation in group-to-group alignment. Default:
            # off
            _Switch(
                ["--nofft", "nofft"],
                "Do not use FFT approximation in group-to-group "
                "alignment. Default: off",
            ),
            # Alignment score is not checked in the iterative refinement stage.
            # Default: off (score is checked)
            _Switch(
                ["--noscore", "noscore"],
                "Alignment score is not checked in the iterative "
                "refinement stage. Default: off (score is checked)",
            ),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(
                ["--memsave", "memsave"],
                "Use the Myers-Miller (1988) algorithm. Default: "
                "automatically turned on when the alignment length "
                "exceeds 10,000 (aa/nt).",
            ),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
            # the 6mer distance. Recommended for a large number (> ~10,000) of
            # sequences are input. Default: off
            _Switch(
                ["--parttree", "parttree"],
                "Use a fast tree-building method with the 6mer "
                "distance. Default: off",
            ),
            # The PartTree algorithm is used with distances based on DP. Slightly
            # more accurate and slower than --parttree. Recommended for a large
            # number (> ~10,000) of sequences are input. Default: off
            _Switch(
                ["--dpparttree", "dpparttree"],
                "The PartTree algorithm is used with distances "
                "based on DP. Default: off",
            ),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended for
            # a large number (> ~10,000) of sequences are input. FASTA is
            # required. Default: off
            _Switch(
                ["--fastaparttree", "fastaparttree"],
                "The PartTree algorithm is used with distances based "
                "on FASTA. Default: off",
            ),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(
                ["--partsize", "partsize"],
                "The number of partitions in the PartTree algorithm. Default: 50",
                checker_function=_is_int,
                equate=False,
            ),
            # Do not make alignment larger than number sequences. Valid only with
            # the --*parttree options. Default: the number of input sequences
            # This takes a numeric value, so it must be an option rather than a
            # bare switch (a switch would emit "--groupsize" with no number).
            _Option(
                ["--groupsize", "groupsize"],
                "Do not make alignment larger than number sequences. "
                "Default: the number of input sequences",
                checker_function=_is_count,
                equate=False,
            ),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirection", "adjustdirection"],
                "Adjust direction according to the first sequence. Default off.",
            ),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
                "Adjust direction according to the first sequence, "
                "for highly diverged data; very slow. "
                "Default off.",
            ),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(
                ["--op", "op"],
                "Gap opening penalty at group-to-group alignment. Default: 1.53",
                checker_function=_is_number,
                equate=False,
            ),
            # Offset value, which works like gap extension penalty, for group-to-
            # group alignment. Default: 0.123
            _Option(
                ["--ep", "ep"],
                "Offset value, which works like gap extension penalty, "
                "for group-to-group alignment. Default: 0.123",
                checker_function=_is_number,
                equate=False,
            ),
            # Gap opening penalty at local pairwise alignment. Valid when the --
            # localpair or --genafpair option is selected. Default: -2.00
            _Option(
                ["--lop", "lop"],
                "Gap opening penalty at local pairwise alignment. Default: -2.00",
                checker_function=_is_number,
                equate=False,
            ),
            # Offset value at local pairwise alignment. Valid when the --
            # localpair or --genafpair option is selected. Default: 0.1
            _Option(
                ["--lep", "lep"],
                "Offset value at local pairwise alignment. Default: 0.1",
                checker_function=_is_number,
                equate=False,
            ),
            # Gap extension penalty at local pairwise alignment. Valid when the -
            # -localpair or --genafpair option is selected. Default: -0.1
            _Option(
                ["--lexp", "lexp"],
                "Gap extension penalty at local pairwise alignment. Default: -0.1",
                checker_function=_is_number,
                equate=False,
            ),
            # Gap opening penalty to skip the alignment. Valid when the --
            # genafpair option is selected. Default: -6.00
            _Option(
                ["--LOP", "LOP"],
                "Gap opening penalty to skip the alignment. Default: -6.00",
                checker_function=_is_number,
                equate=False,
            ),
            # Gap extension penalty to skip the alignment. Valid when the --
            # genafpair option is selected. Default: 0.00
            _Option(
                ["--LEXP", "LEXP"],
                "Gap extension penalty to skip the alignment. Default: 0.00",
                checker_function=_is_number,
                equate=False,
            ),
            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(
                ["--bl", "bl"],
                "BLOSUM number matrix is used. Default: 62",
                checker_function=_is_blosum,
                equate=False,
            ),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(
                ["--jtt", "jtt"],
                "JTT PAM number (Jones et al. 1992) matrix is used. "
                "number>0. Default: BLOSUM62",
                equate=False,
            ),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            _Option(
                ["--tm", "tm"],
                "Transmembrane PAM number (Jones et al. 1994) "
                "matrix is used. number>0. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Use a user-defined AA scoring matrix. The format of matrixfile is
            # the same to that of BLAST. Ignored when nucleotide sequences are
            # input. Default: BLOSUM62
            _Option(
                ["--aamatrix", "aamatrix"],
                "Use a user-defined AA scoring matrix. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Incorporate the AA/nuc composition information into the scoring
            # matrix. Default: off
            _Switch(
                ["--fmodel", "fmodel"],
                "Incorporate the AA/nuc composition information into "
                "the scoring matrix (True) or not (False, default)",
            ),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(
                ["--namelength", "namelength"],
                """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                checker_function=_is_int,
                equate=False,
            ),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(
                ["--clustalout", "clustalout"],
                "Output format: clustal (True) or fasta (False, default)",
            ),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(
                ["--phylipout", "phylipout"],
                "Output format: phylip (True), or fasta (False, default)",
            ),
            # Output order: same as input. Default: on
            _Switch(
                ["--inputorder", "inputorder"],
                "Output order: same as input (True, default) or alignment "
                "based (False)",
            ),
            # Output order: aligned. Default: off (inputorder)
            _Switch(
                ["--reorder", "reorder"],
                "Output order: aligned (True) or in input order (False, default)",
            ),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(
                ["--treeout", "treeout"],
                "Guide tree is output to the input.tree file (True) or "
                "not (False, default)",
            ),
            # Do not report progress. Default: off
            _Switch(
                ["--quiet", "quiet"],
                "Do not report progress (True) or not (False, default).",
            ),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(
                ["--nuc", "nuc"],
                "Assume the sequences are nucleotide (True/False). Default: auto",
            ),
            # Assume the sequences are amino acid. Default: auto
            _Switch(
                ["--amino", "amino"],
                "Assume the sequences are amino acid (True/False). Default: auto",
            ),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in the
            # form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(
                ["--seed", "seed"],
                "Seed alignments given in alignment_n (fasta format) "
                "are aligned with sequences in input.",
                filename=True,
                equate=False,
            ),
            # The input (must be FASTA format)
            _Argument(["input"], "Input file name", filename=True, is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(
                ["input1"],
                "Second input file name for the mafft-profile command",
                filename=True,
            ),
        ]
        # The parameter list must exist before the base initializer runs,
        # since cmd and the keyword arguments are handed to it here.
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When the file is executed as a script, hand off to Biopython's
    # doctest helper (presumably running the docstring examples above).
    from Bio import _utils

    _utils.run_doctest()