Spaces:
No application file
No application file
| # Copyright 2001 Brad Chapman. | |
| # Revisions copyright 2009-2010 by Peter Cock. | |
| # Revisions copyright 2010 by Phillip Garland. | |
| # All rights reserved. | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Definitions for interacting with BLAST related applications (OBSOLETE). | |
| Wrappers for the new NCBI BLAST+ tools (written in C++): | |
| - NcbiblastpCommandline - Protein-Protein BLAST | |
| - NcbiblastnCommandline - Nucleotide-Nucleotide BLAST | |
| - NcbiblastxCommandline - Translated Query-Protein Subject BLAST | |
| - NcbitblastnCommandline - Protein Query-Translated Subject BLAST | |
| - NcbitblastxCommandline - Translated Query-Protein Subject BLAST | |
| - NcbipsiblastCommandline - Position-Specific Initiated BLAST | |
| - NcbirpsblastCommandline - Reverse Position Specific BLAST | |
| - NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST | |
| - NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast | |
| - NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats | |
| - NcbimakeblastdbCommandline - Application to create BLAST databases | |
| For further details, see: | |
| Camacho et al. BLAST+: architecture and applications | |
| BMC Bioinformatics 2009, 10:421 | |
| https://doi.org/10.1186/1471-2105-10-421 | |
| We have decided to remove this module in future, and instead recommend | |
| building your command and invoking it via the subprocess module directly. | |
| """ | |
| from Bio.Application import _Option, AbstractCommandline, _Switch | |
| class _NcbibaseblastCommandline(AbstractCommandline): | |
| """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). | |
| This is provided for subclassing, it deals with shared options | |
| common to all the BLAST tools (blastn, rpsblast, rpsblast, etc | |
| AND blast_formatter). | |
| """ | |
| def __init__(self, cmd=None, **kwargs): | |
| assert cmd is not None | |
| extra_parameters = [ | |
| # Core: | |
| _Switch( | |
| ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments." | |
| ), | |
| _Switch( | |
| ["-help", "help"], | |
| "Print USAGE, DESCRIPTION and ARGUMENTS description; " | |
| "ignore other arguments.", | |
| ), | |
| _Switch( | |
| ["-version", "version"], | |
| "Print version number; ignore other arguments.", | |
| ), | |
| # Output configuration options | |
| _Option( | |
| ["-out", "out"], | |
| "Output file for alignment.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| # Formatting options: | |
| _Option( | |
| ["-outfmt", "outfmt"], | |
| "Alignment view. Typically an integer 0-14 but for some " | |
| "formats can be named columns like '6 qseqid sseqid'. " | |
| "Use 5 for XML output (differs from classic BLAST which " | |
| "used 7 for XML).", | |
| filename=True, # to ensure spaced inputs are quoted | |
| equate=False, | |
| ), | |
| # TODO - Document and test the column options | |
| _Switch(["-show_gis", "show_gis"], "Show NCBI GIs in deflines?"), | |
| _Option( | |
| ["-num_descriptions", "num_descriptions"], | |
| "Number of database sequences to show one-line descriptions for.\n\n" | |
| "Integer argument (at least zero). Default is 500. " | |
| "See also num_alignments.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-num_alignments", "num_alignments"], | |
| "Number of database sequences to show num_alignments for.\n\n" | |
| "Integer argument (at least zero). Default is 200. " | |
| "See also num_alignments.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-line_length", "line_length"], | |
| "Line length for formatting alignments " | |
| "(integer, at least 1, default 60).\n\n" | |
| "Not applicable for outfmt > 4. Added in BLAST+ 2.2.30.", | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-html", "html"], "Produce HTML output? See also the outfmt option." | |
| ), | |
| # Miscellaneous options | |
| _Switch( | |
| ["-parse_deflines", "parse_deflines"], | |
| "Should the query and subject defline(s) be parsed?", | |
| ), | |
| ] | |
| try: | |
| # Insert extra parameters - at the start just in case there | |
| # are any arguments which must come last: | |
| self.parameters = extra_parameters + self.parameters | |
| except AttributeError: | |
| # Should we raise an error? The subclass should have set this up! | |
| self.parameters = extra_parameters | |
| AbstractCommandline.__init__(self, cmd, **kwargs) | |
| def _validate_incompatibilities(self, incompatibles): | |
| """Validate parameters for incompatibilities (PRIVATE). | |
| Used by the _validate method. | |
| """ | |
| for a in incompatibles: | |
| if self._get_parameter(a): | |
| for b in incompatibles[a]: | |
| if self._get_parameter(b): | |
| raise ValueError(f"Options {a} and {b} are incompatible.") | |
| class _NcbiblastCommandline(_NcbibaseblastCommandline): | |
| """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). | |
| This is provided for subclassing, it deals with shared options | |
| common to all the BLAST tools (blastn, rpsblast, rpsblast, etc). | |
| """ | |
| def __init__(self, cmd=None, **kwargs): | |
| assert cmd is not None | |
| extra_parameters = [ | |
| # Input query options: | |
| _Option( | |
| ["-query", "query"], | |
| "The sequence to search with.", | |
| filename=True, | |
| equate=False, | |
| ), # Should this be required? | |
| _Option( | |
| ["-query_loc", "query_loc"], | |
| "Location on the query sequence (Format: start-stop).", | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option(["-db", "db"], "The database to BLAST against.", equate=False), | |
| _Option(["-evalue", "evalue"], "Expectation value cutoff.", equate=False), | |
| _Option( | |
| ["-word_size", "word_size"], | |
| "Word size for wordfinder algorithm.\n\nInteger. Minimum 2.", | |
| equate=False, | |
| ), | |
| # BLAST-2-Sequences options: | |
| # - see subclass | |
| # Formatting options: | |
| # - see baseclass | |
| # Query filtering options | |
| _Option( | |
| ["-soft_masking", "soft_masking"], | |
| "Apply filtering locations as soft masks (Boolean, Default = true).", | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-lcase_masking", "lcase_masking"], | |
| "Use lower case filtering in query and subject sequence(s)?", | |
| ), | |
| # Restrict search or results | |
| _Option( | |
| ["-gilist", "gilist"], | |
| "Restrict search of database to list of GI's.\n\n" | |
| "Incompatible with: negative_gilist, seqidlist, negative_seqidlist, " | |
| "remote, subject, subject_loc", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-negative_gilist", "negative_gilist"], | |
| "Restrict search of database to everything except the listed GIs.\n\n" | |
| "Incompatible with: gilist, seqidlist, remote, subject, subject_loc", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-seqidlist", "seqidlist"], | |
| "Restrict search of database to list of SeqID's.\n\n" | |
| "Incompatible with: gilist, negative_gilist, remote, subject, " | |
| "subject_loc", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-negative_seqidlist", "negative_seqidlist"], | |
| "Restrict search of database to everything except listed SeqID's.\n\n" | |
| "Incompatible with: gilist, seqidlist, remote, subject, subject_loc", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-entrez_query", "entrez_query"], | |
| "Restrict search with the given Entrez query (requires remote).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-qcov_hsp_perc", "qcov_hsp_perc"], | |
| "Percent query coverage per hsp (float, 0 to 100).\n\n" | |
| "Added in BLAST+ 2.2.30.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_target_seqs", "max_target_seqs"], | |
| "Maximum number of aligned sequences to keep (integer, at least one).", | |
| equate=False, | |
| ), | |
| # Statistical options | |
| _Option( | |
| ["-dbsize", "dbsize"], | |
| "Effective length of the database (integer).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-searchsp", "searchsp"], | |
| "Effective length of the search space (integer).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_hsps_per_subject", "max_hsps_per_subject"], | |
| "Override max number of HSPs per subject saved for ungapped searches " | |
| "(integer).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_hsps", "max_hsps"], | |
| "Set max number of HSPs saved per subject sequence\n\n" | |
| "Ddefault 0 means no limit.", | |
| equate=False, | |
| ), | |
| _Switch(["-sum_statistics", "sum_statistics"], "Use sum statistics."), | |
| # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch? | |
| _Option( | |
| ["-sum_stats", "sum_stats"], | |
| "Use sum statistics (boolean).\n\nAdded in BLAST+ 2.2.30.", | |
| equate=False, | |
| ), | |
| # Extension options | |
| _Option( | |
| ["-xdrop_ungap", "xdrop_ungap"], | |
| "X-dropoff value (in bits) for ungapped extensions (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-xdrop_gap", "xdrop_gap"], | |
| "X-dropoff value (in bits) for preliminary gapped extensions (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-xdrop_gap_final", "xdrop_gap_final"], | |
| "X-dropoff value (in bits) for final gapped alignment (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-window_size", "window_size"], | |
| "Multiple hits window size, use 0 to specify 1-hit algorithm " | |
| "(integer).", | |
| equate=False, | |
| ), | |
| # Search strategy options | |
| _Option( | |
| ["-import_search_strategy", "import_search_strategy"], | |
| "Search strategy to use.\n\n" | |
| "Incompatible with: export_search_strategy", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-export_search_strategy", "export_search_strategy"], | |
| "File name to record the search strategy used.\n\n" | |
| "Incompatible with: import_search_strategy", | |
| filename=True, | |
| equate=False, | |
| ), | |
| # Miscellaneous options | |
| _Option( | |
| ["-num_threads", "num_threads"], | |
| "Number of threads to use in the BLAST search.\n\n" | |
| "Integer, at least one. Default is one. Incompatible with: remote", | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-remote", "remote"], | |
| "Execute search remotely?\n\n" | |
| "Incompatible with: gilist, negative_gilist, subject_loc, " | |
| "num_threads, ...", | |
| ), | |
| ] | |
| try: | |
| # Insert extra parameters - at the start just in case there | |
| # are any arguments which must come last: | |
| self.parameters = extra_parameters + self.parameters | |
| except AttributeError: | |
| # Should we raise an error? The subclass should have set this up! | |
| self.parameters = extra_parameters | |
| _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = { | |
| "remote": ["gilist", "negative_gilist", "num_threads"], | |
| "import_search_strategy": ["export_search_strategy"], | |
| "gilist": ["negative_gilist"], | |
| "seqidlist": ["gilist", "negative_gilist", "remote"], | |
| } | |
| self._validate_incompatibilities(incompatibles) | |
| if self.entrez_query and not self.remote: | |
| raise ValueError("Option entrez_query requires remote option.") | |
| AbstractCommandline._validate(self) | |
| class _Ncbiblast2SeqCommandline(_NcbiblastCommandline): | |
| """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). | |
| This is provided for subclassing, it deals with shared options | |
| common to all the BLAST tools supporting two-sequence BLAST | |
| (blastn, psiblast, etc) but not rpsblast or rpstblastn. | |
| """ | |
| def __init__(self, cmd=None, **kwargs): | |
| assert cmd is not None | |
| extra_parameters = [ | |
| # General search options: | |
| _Option( | |
| ["-gapopen", "gapopen"], "Cost to open a gap (integer).", equate=False | |
| ), | |
| _Option( | |
| ["-gapextend", "gapextend"], | |
| "Cost to extend a gap (integer).", | |
| equate=False, | |
| ), | |
| # BLAST-2-Sequences options: | |
| _Option( | |
| ["-subject", "subject"], | |
| "Subject sequence(s) to search.\n\n" | |
| "Incompatible with: db, gilist, seqidlist, negative_gilist, " | |
| "negative_seqidlist, db_soft_mask, db_hard_mask\n\n" | |
| "See also subject_loc.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-subject_loc", "subject_loc"], | |
| "Location on the subject sequence (Format: start-stop).\n\n" | |
| "Incompatible with: db, gilist, seqidlist, negative_gilist, " | |
| "negative_seqidlist, db_soft_mask, db_hard_mask, remote.\n\n" | |
| "See also subject.", | |
| equate=False, | |
| ), | |
| # Restrict search or results: | |
| _Option( | |
| ["-culling_limit", "culling_limit"], | |
| "Hit culling limit (integer).\n\n" | |
| "If the query range of a hit is enveloped by that of at " | |
| "least this many higher-scoring hits, delete the hit.\n\n" | |
| "Incompatible with: best_hit_overhang, best_hit_score_edge.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-best_hit_overhang", "best_hit_overhang"], | |
| "Best Hit algorithm overhang value (float, recommended value: 0.1)\n\n" | |
| "Float between 0.0 and 0.5 inclusive. " | |
| "Incompatible with: culling_limit.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-best_hit_score_edge", "best_hit_score_edge"], | |
| "Best Hit algorithm score edge value (float).\n\n" | |
| "Float between 0.0 and 0.5 inclusive. Recommended value: 0.1\n\n" | |
| "Incompatible with: culling_limit.", | |
| equate=False, | |
| ), | |
| ] | |
| try: | |
| # Insert extra parameters - at the start just in case there | |
| # are any arguments which must come last: | |
| self.parameters = extra_parameters + self.parameters | |
| except AttributeError: | |
| # Should we raise an error? The subclass should have set this up! | |
| self.parameters = extra_parameters | |
| _NcbiblastCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = { | |
| "subject_loc": ["db", "gilist", "negative_gilist", "seqidlist", "remote"], | |
| "culling_limit": ["best_hit_overhang", "best_hit_score_edge"], | |
| "subject": ["db", "gilist", "negative_gilist", "seqidlist"], | |
| } | |
| self._validate_incompatibilities(incompatibles) | |
| _NcbiblastCommandline._validate(self) | |
| class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline): | |
| """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). | |
| This is provided for subclassing, it deals with shared options | |
| common to the main BLAST tools blastp, blastn, blastx, tblastx, tblastn | |
| but not psiblast, rpsblast or rpstblastn. | |
| """ | |
| def __init__(self, cmd=None, **kwargs): | |
| assert cmd is not None | |
| extra_parameters = [ | |
| # Restrict search or results: | |
| _Option( | |
| ["-db_soft_mask", "db_soft_mask"], | |
| "Filtering algorithm for soft masking (integer).\n\n" | |
| "Filtering algorithm ID to apply to BLAST database as soft masking. " | |
| "Incompatible with: db_hard_mask, subject, subject_loc", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-db_hard_mask", "db_hard_mask"], | |
| "Filtering algorithm for hard masking (integer).\n\n" | |
| "Filtering algorithm ID to apply to BLAST database as hard masking. " | |
| "Incompatible with: db_soft_mask, subject, subject_loc", | |
| equate=False, | |
| ), | |
| ] | |
| try: | |
| # Insert extra parameters - at the start just in case there | |
| # are any arguments which must come last: | |
| self.parameters = extra_parameters + self.parameters | |
| except AttributeError: | |
| # Should we raise an error? The subclass should have set this up! | |
| self.parameters = extra_parameters | |
| _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = { | |
| "db_soft_mask": ["db_hard_mask", "subject", "subject_loc"], | |
| "db_hard_mask": ["db_soft_mask", "subject", "subject_loc"], | |
| } | |
| self._validate_incompatibilities(incompatibles) | |
| _Ncbiblast2SeqCommandline._validate(self) | |
| class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline): | |
| """Create a commandline for the NCBI BLAST+ program blastp (for proteins). | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastall tool with separate tools for each of the searches. | |
| This wrapper therefore replaces BlastallCommandline with option -p blastp. | |
| >>> from Bio.Blast.Applications import NcbiblastpCommandline | |
| >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr", | |
| ... evalue=0.001, remote=True, ungapped=True) | |
| >>> cline | |
| NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True) | |
| >>> print(cline) | |
| blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="blastp", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # General search options: | |
| _Option( | |
| ["-task", "task"], | |
| "Task to execute (string, blastp (default), blastp-fast or blastp-short).", | |
| checker_function=lambda value: value | |
| in ["blastp", "blastp-fast", "blastp-short"], | |
| equate=False, | |
| ), | |
| _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics (string, default 2, i.e. True).\n\n" | |
| "0, F or f: no composition-based statistics\n\n" | |
| "2, T or t, D or d : Composition-based score adjustment as in " | |
| "Bioinformatics 21:902-911, 2005, conditioned on sequence " | |
| "properties\n\n" | |
| "Note that tblastn also supports values of 1 and 3.", | |
| checker_function=lambda value: value in "0Ft2TtDd", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable\n' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), | |
| # Miscellaneous options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| ] | |
| _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) | |
| class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline): | |
| """Wrapper for the NCBI BLAST+ program blastn (for nucleotides). | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastall tool with separate tools for each of the searches. | |
| This wrapper therefore replaces BlastallCommandline with option -p blastn. | |
| For example, to run a search against the "nt" nucleotide database using the | |
| FASTA nucleotide file "m_code.fasta" as the query, with an expectation value | |
| cut off of 0.001, saving the output to a file in XML format: | |
| >>> from Bio.Blast.Applications import NcbiblastnCommandline | |
| >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus", | |
| ... evalue=0.001, out="m_cold.xml", outfmt=5) | |
| >>> cline | |
| NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus') | |
| >>> print(cline) | |
| blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="blastn", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # Input query options: | |
| _Option( | |
| ["-strand", "strand"], | |
| "Query strand(s) to search against database/subject.\n\n" | |
| 'Values allowed are "both" (default), "minus", "plus".', | |
| checker_function=lambda value: value in ["both", "minus", "plus"], | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option( | |
| ["-task", "task"], | |
| "Task to execute (string, default 'megablast')\n\n" | |
| "Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' " | |
| "(the default), or 'vecscreen'.", | |
| checker_function=lambda value: value | |
| in ["blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen"], | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-penalty", "penalty"], | |
| "Penalty for a nucleotide mismatch (integer, at most zero).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-reward", "reward"], | |
| "Reward for a nucleotide match (integer, at least zero).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-use_index", "use_index"], | |
| "Use MegaBLAST database index (Boolean, Default = False)", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-index_name", "index_name"], | |
| "MegaBLAST database index name.", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-dust", "dust"], | |
| "Filter query sequence with DUST (string).\n\n" | |
| "Format: 'yes', 'level window linker', or 'no' to disable.\n\n" | |
| "Default = '20 64 1'.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-filtering_db", "filtering_db"], | |
| "BLAST database containing filtering elements (i.e. repeats).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-window_masker_taxid", "window_masker_taxid"], | |
| "Enable WindowMasker filtering using a Taxonomic ID (integer).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-window_masker_db", "window_masker_db"], | |
| "Enable WindowMasker filtering using this repeats database (string).", | |
| equate=False, | |
| ), | |
| # Restrict search or results: | |
| _Option( | |
| ["-perc_identity", "perc_identity"], | |
| "Percent identity (real, 0 to 100 inclusive).", | |
| equate=False, | |
| ), | |
| # Discontiguous MegaBLAST options | |
| _Option( | |
| ["-template_type", "template_type"], | |
| "Discontiguous MegaBLAST template type (string).\n\n" | |
| "Allowed values: 'coding', 'coding_and_optimal' or 'optimal'.\n" | |
| "Requires: template_length.", | |
| checker_function=lambda value: value | |
| in ["coding", "coding_and_optimal", "optimal"], | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-template_length", "template_length"], | |
| "Discontiguous MegaBLAST template length (integer).\n\n" | |
| "Allowed values: 16, 18, 21.\n\n" | |
| "Requires: template_type.", | |
| checker_function=lambda value: value in [16, 18, 21, "16", "18", "21"], | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Switch( | |
| ["-no_greedy", "no_greedy"], | |
| "Use non-greedy dynamic programming extension", | |
| ), | |
| _Option( | |
| ["-min_raw_gapped_score", "min_raw_gapped_score"], | |
| "Minimum raw gapped score to keep an alignment in the " | |
| "preliminary gapped and traceback stages (integer).", | |
| equate=False, | |
| ), | |
| _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), | |
| _Option( | |
| ["-off_diagonal_range", "off_diagonal_range"], | |
| "Number of off-diagonals to search for the 2nd hit (integer).\n\n" | |
| "Expects a positive integer, or 0 (default) to turn off." | |
| "Added in BLAST 2.2.23+", | |
| equate=False, | |
| ), | |
| ] | |
| _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| if (self.template_type and not self.template_length) or ( | |
| self.template_length and not self.template_type | |
| ): | |
| raise ValueError( | |
| "Options template_type and template_type require each other." | |
| ) | |
| _NcbiblastMain2SeqCommandline._validate(self) | |
| class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline): | |
| """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database). | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastall tool with separate tools for each of the searches. | |
| This wrapper therefore replaces BlastallCommandline with option -p blastx. | |
| >>> from Bio.Blast.Applications import NcbiblastxCommandline | |
| >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001) | |
| >>> cline | |
| NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001) | |
| >>> print(cline) | |
| blastx -query m_cold.fasta -db nr -evalue 0.001 | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="blastx", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # Input query options: | |
| _Option( | |
| ["-task", "task"], | |
| "Task to execute (string, blastx (default) or blastx-fast).", | |
| checker_function=lambda value: value in ["blastx", "blastx-fast"], | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-strand", "strand"], | |
| "Query strand(s) to search against database/subject.\n\n" | |
| 'Values allowed are "both" (default), "minus", "plus".', | |
| checker_function=lambda value: value in ["both", "minus", "plus"], | |
| equate=False, | |
| ), | |
| # Input query options: | |
| _Option( | |
| ["-query_gencode", "query_gencode"], | |
| "Genetic code to use to translate query (integer, default 1).", | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option( | |
| ["-frame_shift_penalty", "frame_shift_penalty"], | |
| "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n" | |
| "This was removed in BLAST 2.2.27+", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_intron_length", "max_intron_length"], | |
| "Maximum intron length (integer).\n\n" | |
| "Length of the largest intron allowed in a translated nucleotide " | |
| "sequence when linking multiple distinct alignments (a negative " | |
| "value disables linking). Default zero.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-matrix", "matrix"], | |
| "Scoring matrix name (default BLOSUM62).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics for blastp, blastx, or tblastn.\n\n" | |
| "D or d: default (equivalent to 2 )\n\n" | |
| "0 or F or f: no composition-based statistics\n\n" | |
| "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n" | |
| "2 or T or t : Composition-based score adjustment as in " | |
| "Bioinformatics 21:902-911, 2005, conditioned on sequence " | |
| "properties\n\n" | |
| "3: Composition-based score adjustment as in Bioinformatics " | |
| "21:902-911, 2005, unconditionally.\n\n" | |
| "For programs other than tblastn, must either be absent or be " | |
| "D, F or 0\n\n" | |
| "Default = 2.", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable.' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| ] | |
| _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) | |
| class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline): | |
| """Wrapper for the NCBI BLAST+ program tblastn. | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastall tool with separate tools for each of the searches. | |
| This wrapper therefore replaces BlastallCommandline with option -p tblastn. | |
| >>> from Bio.Blast.Applications import NcbitblastnCommandline | |
| >>> cline = NcbitblastnCommandline(help=True) | |
| >>> cline | |
| NcbitblastnCommandline(cmd='tblastn', help=True) | |
| >>> print(cline) | |
| tblastn -help | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="tblastn", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # General search options: | |
| _Option( | |
| ["-task", "task"], | |
| "Task to execute (string, tblastn (default) or tblastn-fast).", | |
| checker_function=lambda value: value in ["tblastn", "tblastn-fast"], | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-db_gencode", "db_gencode"], | |
| "Genetic code to use to translate query (integer, default 1).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-frame_shift_penalty", "frame_shift_penalty"], | |
| "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n" | |
| "This was removed in BLAST 2.2.27+", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_intron_length", "max_intron_length"], | |
| "Maximum intron length (integer).\n\n" | |
| "Length of the largest intron allowed in a translated nucleotide " | |
| "sequence when linking multiple distinct alignments (a negative " | |
| "value disables linking). Default zero.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-matrix", "matrix"], | |
| "Scoring matrix name (default BLOSUM62).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics (string, default 2, i.e. True).\n\n" | |
| "0, F or f: no composition-based statistics\n\n" | |
| "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n" | |
| "2, T or t, D or d : Composition-based score adjustment as in " | |
| "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" | |
| "3: Composition-based score adjustment as in Bioinformatics 21:902-911, " | |
| "2005, unconditionally\n\n" | |
| "Note that only tblastn supports values of 1 and 3.", | |
| checker_function=lambda value: value in "0Ft12TtDd3", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable.\n\n' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), | |
| # Miscellaneous options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| # PSI-TBLASTN options: | |
| _Option( | |
| ["-in_pssm", "in_pssm"], | |
| "PSI-BLAST checkpoint file.\n\nIncompatible with: remote, query", | |
| filename=True, | |
| equate=False, | |
| ), | |
| ] | |
| _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) | |
| class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline): | |
| """Wrapper for the NCBI BLAST+ program tblastx. | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastall tool with separate tools for each of the searches. | |
| This wrapper therefore replaces BlastallCommandline with option -p tblastx. | |
| >>> from Bio.Blast.Applications import NcbitblastxCommandline | |
| >>> cline = NcbitblastxCommandline(help=True) | |
| >>> cline | |
| NcbitblastxCommandline(cmd='tblastx', help=True) | |
| >>> print(cline) | |
| tblastx -help | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="tblastx", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # Input query options: | |
| _Option( | |
| ["-strand", "strand"], | |
| "Query strand(s) to search against database/subject.\n\n" | |
| 'Values allowed are "both" (default), "minus", "plus".', | |
| checker_function=lambda value: value in ["both", "minus", "plus"], | |
| equate=False, | |
| ), | |
| # Input query options: | |
| _Option( | |
| ["-query_gencode", "query_gencode"], | |
| "Genetic code to use to translate query (integer, default 1).", | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option( | |
| ["-db_gencode", "db_gencode"], | |
| "Genetic code to use to translate query (integer, default 1).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_intron_length", "max_intron_length"], | |
| "Maximum intron length (integer).\n\n" | |
| "Length of the largest intron allowed in a translated nucleotide " | |
| "sequence when linking multiple distinct alignments (a negative " | |
| "value disables linking). Default zero.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-matrix", "matrix"], | |
| "Scoring matrix name (default BLOSUM62).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable.\n\n' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| ] | |
| _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) | |
| class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline): | |
| """Wrapper for the NCBI BLAST+ program psiblast. | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old blastpgp tool with a similar tool psiblast. This wrapper | |
| therefore replaces BlastpgpCommandline, the wrapper for blastpgp. | |
| >>> from Bio.Blast.Applications import NcbipsiblastCommandline | |
| >>> cline = NcbipsiblastCommandline(help=True) | |
| >>> cline | |
| NcbipsiblastCommandline(cmd='psiblast', help=True) | |
| >>> print(cline) | |
| psiblast -help | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="psiblast", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # General search options: | |
| _Option( | |
| ["-matrix", "matrix"], | |
| "Scoring matrix name (default BLOSUM62).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics (string, default 2, i.e. True).\n\n" | |
| "0, F or f: no composition-based statistics\n\n" | |
| "2, T or t, D or d : Composition-based score adjustment as in " | |
| "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" | |
| "Note that tblastn also supports values of 1 and 3.", | |
| checker_function=lambda value: value in "0Ft2TtDd", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable. ' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Option( | |
| ["-gap_trigger", "gap_trigger"], | |
| "Number of bits to trigger gapping (float, default 22).", | |
| equate=False, | |
| ), | |
| # Miscellaneous options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| # PSI-BLAST options: | |
| _Option( | |
| ["-num_iterations", "num_iterations"], | |
| "Number of iterations to perform (integer, at least one).\n\n" | |
| "Default is one. Incompatible with: remote", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-out_pssm", "out_pssm"], | |
| "File name to store checkpoint file.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-out_ascii_pssm", "out_ascii_pssm"], | |
| "File name to store ASCII version of PSSM.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-save_pssm_after_last_round", "save_pssm_after_last_round"], | |
| "Save PSSM after the last database search.", | |
| ), | |
| _Switch( | |
| ["-save_each_pssm", "save_each_pssm"], | |
| "Save PSSM after each iteration\n\n" | |
| "File name is given in -save_pssm or -save_ascii_pssm options.", | |
| ), | |
| _Option( | |
| ["-in_msa", "in_msa"], | |
| "File name of multiple sequence alignment to restart PSI-BLAST.\n\n" | |
| "Incompatible with: in_pssm, query", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-msa_master_idx", "msa_master_idx"], | |
| "Index of sequence to use as master in MSA.\n\n" | |
| "Index (1-based) of sequence to use as the master in the multiple " | |
| "sequence alignment. If not specified, the first sequence is used.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-in_pssm", "in_pssm"], | |
| "PSI-BLAST checkpoint file.\n\n" | |
| "Incompatible with: in_msa, query, phi_pattern", | |
| filename=True, | |
| equate=False, | |
| ), | |
| # PSSM engine options: | |
| _Option( | |
| ["-pseudocount", "pseudocount"], | |
| "Pseudo-count value used when constructing PSSM.\n\n" | |
| "Integer. Default is zero.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-inclusion_ethresh", "inclusion_ethresh"], | |
| "E-value inclusion threshold for pairwise alignments (float, default 0.002).", | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-ignore_msa_master", "ignore_msa_master"], | |
| "Ignore the master sequence when creating PSSM.\n\n" | |
| "Requires: in_msa\n" | |
| "Incompatible with: msa_master_idx, in_pssm, query, query_loc, " | |
| "phi_pattern", | |
| ), | |
| # PHI-BLAST options: | |
| _Option( | |
| ["-phi_pattern", "phi_pattern"], | |
| "File name containing pattern to search.\n\n" | |
| "Incompatible with: in_pssm", | |
| filename=True, | |
| equate=False, | |
| ), | |
| ] | |
| _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = { | |
| "num_iterations": ["remote"], | |
| "in_msa": ["in_pssm", "query"], | |
| "in_pssm": ["in_msa", "query", "phi_pattern"], | |
| "ignore_msa_master": [ | |
| "msa_master_idx", | |
| "in_pssm", | |
| "query", | |
| "query_loc", | |
| "phi_pattern", | |
| ], | |
| } | |
| self._validate_incompatibilities(incompatibles) | |
| _Ncbiblast2SeqCommandline._validate(self) | |
| class NcbirpsblastCommandline(_NcbiblastCommandline): | |
| """Wrapper for the NCBI BLAST+ program rpsblast. | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old rpsblast tool with a similar tool of the same name. This | |
| wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast. | |
| >>> from Bio.Blast.Applications import NcbirpsblastCommandline | |
| >>> cline = NcbirpsblastCommandline(help=True) | |
| >>> cline | |
| NcbirpsblastCommandline(cmd='rpsblast', help=True) | |
| >>> print(cline) | |
| rpsblast -help | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="rpsblast", **kwargs): | |
| """Initialize the class.""" | |
| # TODO - remove the -word_size argument as per BLAST+ 2.2.30 | |
| # (BLAST team say it should never have been included, since | |
| # the word size is set when building the domain database.) | |
| # This likely means reviewing the class hierarchy again. | |
| self.parameters = [ | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable.' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Restrict search or results: | |
| _Option( | |
| ["-culling_limit", "culling_limit"], | |
| "Hit culling limit (integer).\n\n" | |
| "If the query range of a hit is enveloped by that of at " | |
| "least this many higher-scoring hits, delete the hit. " | |
| "Incompatible with: best_hit_overhang, best_hit_score_edge.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-best_hit_overhang", "best_hit_overhang"], | |
| "Best Hit algorithm overhang value (recommended value: 0.1).\n\n" | |
| "Float between 0.0 and 0.5 inclusive. " | |
| "Incompatible with: culling_limit.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-best_hit_score_edge", "best_hit_score_edge"], | |
| "Best Hit algorithm score edge value (recommended value: 0.1).\n\n" | |
| "Float between 0.0 and 0.5 inclusive. " | |
| "Incompatible with: culling_limit.", | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics.\n\n" | |
| "D or d: default (equivalent to 0)\n\n" | |
| "0 or F or f: Simplified Composition-based statistics as in " | |
| "Bioinformatics 15:1000-1011, 1999\n\n" | |
| "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, " | |
| "2001\n\n" | |
| "Default = 0.", | |
| checker_function=lambda value: value in "Dd0Ff1Tt", | |
| equate=False, | |
| ), | |
| # Misc options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| ] | |
| _NcbiblastCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]} | |
| self._validate_incompatibilities(incompatibles) | |
| _NcbiblastCommandline._validate(self) | |
| class NcbirpstblastnCommandline(_NcbiblastCommandline): | |
| """Wrapper for the NCBI BLAST+ program rpstblastn. | |
| With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI | |
| replaced the old rpsblast tool with a similar tool of the same name, and a | |
| separate tool rpstblastn for Translated Reverse Position Specific BLAST. | |
| >>> from Bio.Blast.Applications import NcbirpstblastnCommandline | |
| >>> cline = NcbirpstblastnCommandline(help=True) | |
| >>> cline | |
| NcbirpstblastnCommandline(cmd='rpstblastn', help=True) | |
| >>> print(cline) | |
| rpstblastn -help | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="rpstblastn", **kwargs): | |
| """Initialize the class.""" | |
| # TODO - remove the -word_size argument as per BLAST+ 2.2.30 | |
| # (BLAST team say it should never have been included, since | |
| # the word size is set when building the domain database.) | |
| # This likely means reviewing the class hierarchy again. | |
| self.parameters = [ | |
| # Input query options: | |
| _Option( | |
| ["-strand", "strand"], | |
| "Query strand(s) to search against database/subject.\n\n" | |
| 'Values allowed are "both" (default), "minus", "plus".', | |
| checker_function=lambda value: value in ["both", "minus", "plus"], | |
| equate=False, | |
| ), | |
| # Input query options: | |
| _Option( | |
| ["-query_gencode", "query_gencode"], | |
| "Genetic code to use to translate query (integer, default 1).", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable. ' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # General search options: | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics.\n\n" | |
| "D or d: default (equivalent to 0)\n\n" | |
| "0 or F or f: Simplified Composition-based statistics as in " | |
| "Bioinformatics 15:1000-1011, 1999\n\n" | |
| "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, " | |
| "2001\n\n" | |
| "Default = 0.", | |
| checker_function=lambda value: value in "Dd0Ff1Tt", | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), | |
| # Miscellaneous options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| ] | |
| _NcbiblastCommandline.__init__(self, cmd, **kwargs) | |
| class NcbiblastformatterCommandline(_NcbibaseblastCommandline): | |
| """Wrapper for the NCBI BLAST+ program blast_formatter. | |
| With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++ | |
| instead of C), the NCBI added the ASN.1 output format option to all the | |
| search tools, and extended the blast_formatter to support this as input. | |
| The blast_formatter command allows you to convert the ASN.1 output into | |
| the other output formats (XML, tabular, plain text, HTML). | |
| >>> from Bio.Blast.Applications import NcbiblastformatterCommandline | |
| >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml") | |
| >>> cline | |
| NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn') | |
| >>> print(cline) | |
| blast_formatter -out example.xml -outfmt 5 -archive example.asn | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| Note that this wrapper is for the version of blast_formatter from BLAST | |
| 2.2.24+ (or later) which is when the NCBI first announced the inclusion | |
| this tool. There was actually an early version in BLAST 2.2.23+ (and | |
| possibly in older releases) but this did not have the -archive option | |
| (instead -rid is a mandatory argument), and is not supported by this | |
| wrapper. | |
| """ | |
| def __init__(self, cmd="blast_formatter", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # Input options | |
| _Option( | |
| ["-rid", "rid"], | |
| "BLAST Request ID (RID), not compatible with archive arg.", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-archive", "archive"], | |
| "Archive file of results, not compatible with rid arg.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| # Restrict search or results | |
| _Option( | |
| ["-max_target_seqs", "max_target_seqs"], | |
| "Maximum number of aligned sequences to keep.", | |
| checker_function=lambda value: value >= 1, | |
| equate=False, | |
| ), | |
| ] | |
| _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) | |
| def _validate(self): | |
| incompatibles = {"rid": ["archive"]} | |
| self._validate_incompatibilities(incompatibles) | |
| _NcbibaseblastCommandline._validate(self) | |
| class NcbideltablastCommandline(_Ncbiblast2SeqCommandline): | |
| """Create a commandline for the NCBI BLAST+ program deltablast (for proteins). | |
| This is a wrapper for the deltablast command line command included in | |
| the NCBI BLAST+ software (not present in the original BLAST). | |
| >>> from Bio.Blast.Applications import NcbideltablastCommandline | |
| >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr", | |
| ... evalue=0.001, remote=True) | |
| >>> cline | |
| NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True) | |
| >>> print(cline) | |
| deltablast -query rosemary.pro -db nr -evalue 0.001 -remote | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="deltablast", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # General search options: | |
| _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), | |
| _Option( | |
| ["-threshold", "threshold"], | |
| "Minimum score for words to be added to the BLAST lookup table (float).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-comp_based_stats", "comp_based_stats"], | |
| "Use composition-based statistics (string, default 2, i.e. True).\n\n" | |
| "0, F or f: no composition-based statistics.\n\n" | |
| "2, T or t, D or d : Composition-based score adjustment as in " | |
| "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" | |
| "Note that tblastn also supports values of 1 and 3.", | |
| checker_function=lambda value: value in "0Ft2TtDd", | |
| equate=False, | |
| ), | |
| # Query filtering options: | |
| _Option( | |
| ["-seg", "seg"], | |
| "Filter query sequence with SEG (string).\n\n" | |
| 'Format: "yes", "window locut hicut", or "no" to disable. ' | |
| 'Default is "12 2.2 2.5"', | |
| equate=False, | |
| ), | |
| # Extension options: | |
| _Option( | |
| ["-gap_trigger", "gap_trigger"], | |
| "Number of bits to trigger gapping. Default = 22.", | |
| equate=False, | |
| ), | |
| # Miscellaneous options: | |
| _Switch( | |
| ["-use_sw_tback", "use_sw_tback"], | |
| "Compute locally optimal Smith-Waterman alignments?", | |
| ), | |
| # PSI-BLAST options | |
| _Option( | |
| ["-num_iterations", "num_iterations"], | |
| "Number of iterations to perform. (integer >=1, Default is 1).\n\n" | |
| "Incompatible with: remote", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-out_pssm", "out_pssm"], | |
| "File name to store checkpoint file.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-out_ascii_pssm", "out_ascii_pssm"], | |
| "File name to store ASCII version of PSSM.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-save_pssm_after_last_round", "save_pssm_after_last_round"], | |
| "Save PSSM after the last database search.", | |
| ), | |
| _Switch( | |
| ["-save_each_pssm", "save_each_pssm"], | |
| "Save PSSM after each iteration.\n\n" | |
| "File name is given in -save_pssm or -save_ascii_pssm options.", | |
| ), | |
| # PSSM engine options | |
| _Option( | |
| ["-pseudocount", "pseudocount"], | |
| "Pseudo-count value used when constructing PSSM (integer, default 0).", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-domain_inclusion_ethresh", "domain_inclusion_ethresh"], | |
| "E-value inclusion threshold for alignments with conserved domains.\n\n" | |
| "(float, Default is 0.05)", | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-inclusion_ethresh", "inclusion_ethresh"], | |
| "Pairwise alignment e-value inclusion threshold (float, default 0.002).", | |
| equate=False, | |
| ), | |
| # DELTA-BLAST options | |
| _Option( | |
| ["-rpsdb", "rpsdb"], | |
| "BLAST domain database name (dtring, Default = 'cdd_delta').", | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-show_domain_hits", "show_domain_hits"], | |
| "Show domain hits?\n\nIncompatible with: remote, subject", | |
| ), | |
| ] | |
| _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) | |
| class NcbimakeblastdbCommandline(AbstractCommandline): | |
| """Wrapper for the NCBI BLAST+ program makeblastdb. | |
| This is a wrapper for the NCBI BLAST+ makeblastdb application | |
| to create BLAST databases. By default, this creates a blast database | |
| with the same name as the input file. The default output location | |
| is the same directory as the input. | |
| >>> from Bio.Blast.Applications import NcbimakeblastdbCommandline | |
| >>> cline = NcbimakeblastdbCommandline(dbtype="prot", | |
| ... input_file="NC_005816.faa") | |
| >>> cline | |
| NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa') | |
| >>> print(cline) | |
| makeblastdb -dbtype prot -in NC_005816.faa | |
| You would typically run the command line with cline() or via the Python | |
| subprocess module, as described in the Biopython tutorial. | |
| """ | |
| def __init__(self, cmd="makeblastdb", **kwargs): | |
| """Initialize the class.""" | |
| self.parameters = [ | |
| # Basic input options | |
| _Switch( | |
| ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments." | |
| ), | |
| _Switch( | |
| ["-help", "help"], | |
| "Print USAGE, DESCRIPTION and ARGUMENTS description; " | |
| "ignore other arguments.", | |
| ), | |
| _Switch( | |
| ["-version", "version"], | |
| "Print version number; ignore other arguments.", | |
| ), | |
| # Output configuration options | |
| _Option( | |
| ["-out", "out"], | |
| "Output file for alignment.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| # makeblastdb specific options | |
| _Option( | |
| ["-blastdb_version", "blastdb_version"], | |
| "Version of BLAST database to be created. " | |
| "Tip: use BLAST database version 4 on 32 bit CPU. " | |
| "Default = 5", | |
| equate=False, | |
| checker_function=lambda x: x == 4 or x == 5, | |
| ), | |
| _Option( | |
| ["-dbtype", "dbtype"], | |
| "Molecule type of target db ('nucl' or 'prot').", | |
| equate=False, | |
| is_required=True, | |
| checker_function=lambda x: x == "nucl" or x == "prot", | |
| ), | |
| _Option( | |
| ["-in", "input_file"], | |
| "Input file/database name.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-input_type", "input_type"], | |
| "Type of the data specified in input_file.\n\n" | |
| "Default = 'fasta'. Added in BLAST 2.2.26.", | |
| filename=False, | |
| equate=False, | |
| checker_function=self._input_type_checker, | |
| ), | |
| _Option( | |
| ["-title", "title"], | |
| "Title for BLAST database.", | |
| filename=False, | |
| equate=False, | |
| ), | |
| _Switch( | |
| ["-parse_seqids", "parse_seqids"], | |
| "Option to parse seqid for FASTA input if set.\n\n" | |
| "For all other input types, seqids are parsed automatically", | |
| ), | |
| _Switch( | |
| ["-hash_index", "hash_index"], "Create index of sequence hash values." | |
| ), | |
| _Option( | |
| ["-mask_data", "mask_data"], | |
| "Comma-separated list of input files containing masking " | |
| "data as produced by NCBI masking applications " | |
| "(e.g. dustmasker, segmasker, windowmasker).", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-mask_id", "mask_id"], | |
| "Comma-separated list of strings to uniquely identify the " | |
| "masking algorithm.", | |
| filename=False, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-mask_desc", "mask_desc"], | |
| "Comma-separated list of free form strings to describe " | |
| "the masking algorithm details.", | |
| filename=False, | |
| equate=False, | |
| ), | |
| _Switch(["-gi_mask", "gi_mask"], "Create GI indexed masking data."), | |
| _Option( | |
| ["-gi_mask_name", "gi_mask_name"], | |
| "Comma-separated list of masking data output files.", | |
| filename=False, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-max_file_sz", "max_file_sz"], | |
| "Maximum file size for BLAST database files. Default = '1GB'.", | |
| filename=False, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-logfile", "logfile"], | |
| "File to which the program log should be redirected.", | |
| filename=True, | |
| equate=False, | |
| ), | |
| _Option( | |
| ["-taxid", "taxid"], | |
| "Taxonomy ID to assign to all sequences.", | |
| filename=False, | |
| equate=False, | |
| checker_function=lambda x: type(x)(int(x)) == x, | |
| ), | |
| _Option( | |
| ["-taxid_map", "taxid_map"], | |
| "Text file mapping sequence IDs to taxonomy IDs.\n\n" | |
| "Format:<SequenceId> <TaxonomyId><newline>", | |
| filename=True, | |
| equate=False, | |
| ), | |
| ] | |
| AbstractCommandline.__init__(self, cmd, **kwargs) | |
| def _input_type_checker(self, command): | |
| return command in ("asn1_bin", "asn1_txt", "blastdb", "fasta") | |
| def _validate(self): | |
| incompatibles = { | |
| "mask_id": ["gi_mask"], | |
| "gi_mask": ["mask_id"], | |
| "taxid": ["taxid_map"], | |
| } | |
| # Copied from _NcbibaseblastCommandline class above. | |
| # Code repeated here for python2 and 3 compatibility, | |
| # because this is not a _NcbibaseblastCommandline subclass. | |
| for a in incompatibles: | |
| if self._get_parameter(a): | |
| for b in incompatibles[a]: | |
| if self._get_parameter(b): | |
| raise ValueError(f"Options {a} and {b} are incompatible.") | |
| if self.mask_id and not self.mask_data: | |
| raise ValueError("Option mask_id requires mask_data to be set.") | |
| if self.mask_desc and not self.mask_id: | |
| raise ValueError("Option mask_desc requires mask_id to be set.") | |
| if self.gi_mask and not self.parse_seqids: | |
| raise ValueError("Option gi_mask requires parse_seqids to be set.") | |
| if self.gi_mask_name and not (self.mask_data and self.gi_mask): | |
| raise ValueError( | |
| "Option gi_mask_name requires mask_data and gi_mask to be set." | |
| ) | |
| if self.taxid_map and not self.parse_seqids: | |
| raise ValueError("Option taxid_map requires parse_seqids to be set.") | |
| AbstractCommandline._validate(self) | |
| def _test(): | |
| """Run the Bio.Blast.Applications module's doctests (PRIVATE).""" | |
| import doctest | |
| doctest.testmod(verbose=1) | |
| if __name__ == "__main__": | |
| # Run the doctests | |
| _test() | |