Spaces:
No application file
No application file
| # Copyright 1999 by Jeffrey Chang. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| # | |
| # Patched by Brad Chapman. | |
| # Chris Wroe added modifications for work in myGrid | |
| """Code to invoke the NCBI BLAST server over the internet. | |
| This module provides code to work with the WWW version of BLAST | |
| provided by the NCBI. https://blast.ncbi.nlm.nih.gov/ | |
| Variables: | |
| - email Set the Blast email parameter (default is None). | |
| - tool Set the Blast tool parameter (default is ``biopython``). | |
| """ | |
| import warnings | |
| from io import StringIO | |
| import time | |
| from urllib.parse import urlencode | |
| from urllib.request import build_opener, install_opener | |
| from urllib.request import urlopen, urlretrieve, urlparse, urlcleanup | |
| from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler | |
| from urllib.request import Request | |
| from Bio import BiopythonWarning | |
# Optional contact e-mail address. qblast() adds it to the search submission
# only when the request targets NCBI_BLAST_URL; a value of None is dropped
# from the request like any other unset parameter.
email = None
# Tool name sent alongside ``email`` in submissions to NCBI_BLAST_URL.
tool = "biopython"
# Address of the public NCBI BLAST CGI; the default ``url_base`` of qblast().
NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query="(none)",
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    short_query=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type="XML",
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
    username="blast",
    password=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)
     - username, password
                      When ``password`` is not None, HTTP basic authentication
                      for ``url_base`` is configured by installing a global
                      urllib opener (this affects later ``urlopen`` calls in
                      the same process).

    Apart from checking ``program`` against the list of known programs, this
    function does no checking of the validity of the parameters and passes
    the values to the server as is. Parameters left as None are not sent.
    More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    Returns a ``StringIO`` handle holding the text of the results page.

    Raises ValueError if ``program`` is not one of the supported programs, or
    if no request identifier can be extracted from the server's reply to the
    search submission.

    The function blocks until the results are available. Successive polls are
    spaced using the timestamp stored in ``qblast._previous``, which is shared
    by all calls in the process.
    """
    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    if program not in programs:
        raise ValueError(
            f"Program specified is {program}. Expected one of {', '.join(programs)}"
        )

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == "blastn":
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.'
                " We bypass the problem by manually adjusting the search parameters."
                " Thus, results may slightly differ from web page searches.",
                BiopythonWarning,
            )

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = {
        "AUTO_FORMAT": auto_format,
        "COMPOSITION_BASED_STATISTICS": composition_based_statistics,
        "DATABASE": database,
        "DB_GENETIC_CODE": db_genetic_code,
        "ENDPOINTS": endpoints,
        "ENTREZ_QUERY": entrez_query,
        "EXPECT": expect,
        "FILTER": filter,
        "GAPCOSTS": gapcosts,
        "GENETIC_CODE": genetic_code,
        "HITLIST_SIZE": hitlist_size,
        "I_THRESH": i_thresh,
        "LAYOUT": layout,
        "LCASE_MASK": lcase_mask,
        "MEGABLAST": megablast,
        "MATRIX_NAME": matrix_name,
        "NUCL_PENALTY": nucl_penalty,
        "NUCL_REWARD": nucl_reward,
        "OTHER_ADVANCED": other_advanced,
        "PERC_IDENT": perc_ident,
        "PHI_PATTERN": phi_pattern,
        "PROGRAM": program,
        # ('PSSM': pssm: - It is possible to use PSI-BLAST via this API?
        "QUERY": sequence,
        "QUERY_FILE": query_file,
        "QUERY_BELIEVE_DEFLINE": query_believe_defline,
        "QUERY_FROM": query_from,
        "QUERY_TO": query_to,
        # 'RESULTS_FILE': ...: - Can we use this parameter?
        "SEARCHSP_EFF": searchsp_eff,
        "SERVICE": service,
        "SHORT_QUERY_ADJUST": short_query,
        "TEMPLATE_TYPE": template_type,
        "TEMPLATE_LENGTH": template_length,
        "THRESHOLD": threshold,
        "UNGAPPED_ALIGNMENT": ungapped_alignment,
        "WORD_SIZE": word_size,
        "CMD": "Put",
    }

    if password is not None:
        # handle authentication for BLAST cloud
        # NOTE: install_opener() replaces the process-wide default opener.
        password_mgr = HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url_base, username, password)
        handler = HTTPBasicAuthHandler(password_mgr)
        opener = build_opener(handler)
        install_opener(opener)

    if url_base == NCBI_BLAST_URL:
        parameters.update({"email": email, "tool": tool})

    # Unset (None) parameters are simply left out of the request.
    parameters = {key: value for key, value in parameters.items() if value is not None}
    message = urlencode(parameters).encode()

    request = Request(url_base, message, {"User-Agent": "BiopythonClient"})

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    # The response is closed as soon as the request identifier has been read.
    with urlopen(request) as handle:
        # Only the RID is needed below; the estimated run time is not used.
        rid, _ = _parse_qblast_ref_page(handle)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = {
        "ALIGNMENTS": alignments,
        "ALIGNMENT_VIEW": alignment_view,
        "DESCRIPTIONS": descriptions,
        "ENTREZ_LINKS_NEW_WINDOW": entrez_links_new_window,
        "EXPECT_LOW": expect_low,
        "EXPECT_HIGH": expect_high,
        "FORMAT_ENTREZ_QUERY": format_entrez_query,
        "FORMAT_OBJECT": format_object,
        "FORMAT_TYPE": format_type,
        "NCBI_GI": ncbi_gi,
        "RID": rid,
        "RESULTS_FILE": results_file,
        "SERVICE": service,
        "SHOW_OVERVIEW": show_overview,
        "CMD": "Get",
    }
    parameters = {key: value for key, value in parameters.items() if value is not None}
    message = urlencode(parameters).encode()

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    status_tag = "Status="
    while True:
        current = time.time()
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
        with urlopen(request) as handle:
            results = handle.read().decode()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        start = results.find(status_tag)
        if start == -1:
            break
        start += len(status_tag)
        end = results.find("\n", start)
        if end == -1:
            # Status line is the last line and lacks a trailing newline.
            end = len(results)
        if results[start:end].strip().upper() == "READY":
            break

    return StringIO(results)


# Time of the most recent poll, shared by all qblast() calls to space requests.
qblast._previous = 0
| def _parse_qblast_ref_page(handle): | |
| """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). | |
| The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably | |
| 'Request Time of Execution' and RID would be 'Request Identifier'. | |
| """ | |
| s = handle.read().decode() | |
| i = s.find("RID =") | |
| if i == -1: | |
| rid = None | |
| else: | |
| j = s.find("\n", i) | |
| rid = s[i + len("RID =") : j].strip() | |
| i = s.find("RTOE =") | |
| if i == -1: | |
| rtoe = None | |
| else: | |
| j = s.find("\n", i) | |
| rtoe = s[i + len("RTOE =") : j].strip() | |
| if not rid and not rtoe: | |
| # Can we reliably extract the error message from the HTML page? | |
| # e.g. "Message ID#24 Error: Failed to read the Blast query: | |
| # Nucleotide FASTA provided for protein sequence" | |
| # or "Message ID#32 Error: Query contains no data: Query | |
| # contains no sequence data" | |
| # | |
| # This used to occur inside a <div class="error msInf"> entry: | |
| i = s.find('<div class="error msInf">') | |
| if i != -1: | |
| msg = s[i + len('<div class="error msInf">') :].strip() | |
| msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip() | |
| if msg: | |
| raise ValueError(f"Error message from NCBI: {msg}") | |
| # In spring 2010 the markup was like this: | |
| i = s.find('<p class="error">') | |
| if i != -1: | |
| msg = s[i + len('<p class="error">') :].strip() | |
| msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip() | |
| if msg: | |
| raise ValueError(f"Error message from NCBI: {msg}") | |
| # Generic search based on the way the error messages start: | |
| i = s.find("Message ID#") | |
| if i != -1: | |
| # Break the message at the first HTML tag | |
| msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip() | |
| raise ValueError(f"Error message from NCBI: {msg}") | |
| # We didn't recognise the error layout :( | |
| # print(s) | |
| raise ValueError( | |
| "No RID and no RTOE found in the 'please wait' page, " | |
| "there was probably an error in your request but we " | |
| "could not extract a helpful error message." | |
| ) | |
| elif not rid: | |
| # Can this happen? | |
| raise ValueError( | |
| f"No RID found in the 'please wait' page. (although RTOE = {rtoe!r})" | |
| ) | |
| elif not rtoe: | |
| # Can this happen? | |
| raise ValueError( | |
| f"No RTOE found in the 'please wait' page. (although RID = {rid!r})" | |
| ) | |
| try: | |
| return rid, int(rtoe) | |
| except ValueError: | |
| raise ValueError( | |
| f"A non-integer RTOE found in the 'please wait' page, {rtoe!r}" | |
| ) from None | |