Spaces:

chenzihong
/

GraphGen

Build error

App Files Files Community

github-actions[bot] committed on Dec 25, 2025

Commit

31df32c

1 Parent(s): ac15317

Auto-sync from demo at Thu Dec 25 14:16:44 UTC 2025

Browse files

Files changed (12) hide show

graphgen/bases/base_operator.py +6 -2
graphgen/bases/base_reader.py +2 -0
graphgen/bases/base_searcher.py +3 -3
graphgen/models/searcher/db/ncbi_searcher.py +260 -84
graphgen/models/searcher/db/rnacentral_searcher.py +174 -80
graphgen/models/searcher/db/uniprot_searcher.py +105 -79
graphgen/models/searcher/web/bing_search.py +6 -0
graphgen/models/searcher/web/google_search.py +6 -0
graphgen/operators/__init__.py +2 -2
graphgen/operators/search/__init__.py +1 -1
graphgen/operators/search/search_all.py +0 -83
graphgen/operators/search/search_service.py +163 -0

graphgen/bases/base_operator.py CHANGED Viewed

@@ -6,11 +6,12 @@ from typing import Iterable, Union
 import pandas as pd
 import ray
-from graphgen.utils import CURRENT_LOGGER_VAR, set_logger
 class BaseOperator(ABC):
     def __init__(self, working_dir: str = "cache", op_name: str = None):
         log_dir = os.path.join(working_dir, "logs")
         self.op_name = op_name or self.__class__.__name__
@@ -39,6 +40,9 @@ class BaseOperator(ABC):
     def __call__(
         self, batch: pd.DataFrame
     ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
         logger_token = CURRENT_LOGGER_VAR.set(self.logger)
         try:
             result = self.process(batch)

 import pandas as pd
 import ray
 class BaseOperator(ABC):
     def __init__(self, working_dir: str = "cache", op_name: str = None):
+        # lazy import to avoid circular import
+        from graphgen.utils import set_logger
         log_dir = os.path.join(working_dir, "logs")
         self.op_name = op_name or self.__class__.__name__
     def __call__(
         self, batch: pd.DataFrame
     ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
+        # lazy import to avoid circular import
+        from graphgen.utils import CURRENT_LOGGER_VAR
         logger_token = CURRENT_LOGGER_VAR.set(self.logger)
         try:
             result = self.process(batch)

graphgen/bases/base_reader.py CHANGED Viewed

@@ -39,6 +39,8 @@ class BaseReader(ABC):
             "table",
             "equation",
             "protein",
         ], f"Unsupported item type: {item_type}"
         if item_type == "text":
             content = item.get(self.text_column, "").strip()

             "table",
             "equation",
             "protein",
+            "dna",
+            "rna",
         ], f"Unsupported item type: {item_type}"
         if item_type == "text":
             content = item.get(self.text_column, "").strip()

graphgen/bases/base_searcher.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
 class BaseSearcher(ABC):
@@ -8,11 +8,11 @@ class BaseSearcher(ABC):
     """
     @abstractmethod
-    async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]:
         """
         Search for data based on the given query.
         :param query: The searcher query.
         :param kwargs: Additional keyword arguments for the searcher.
-        :return: List of dictionaries containing the searcher results.
         """

 from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
 class BaseSearcher(ABC):
     """
     @abstractmethod
+    def search(self, query: str, **kwargs) -> Optional[Dict[str, Any]]:
         """
         Search for data based on the given query.
         :param query: The searcher query.
         :param kwargs: Additional keyword arguments for the searcher.
+        :return: Dictionary containing the searcher result, or None if not found.
         """

graphgen/models/searcher/db/ncbi_searcher.py CHANGED Viewed

@@ -1,10 +1,7 @@
-import asyncio
 import os
 import re
 import subprocess
 import tempfile
-from concurrent.futures import ThreadPoolExecutor
-from functools import lru_cache
 from http.client import IncompleteRead
 from typing import Dict, Optional
@@ -22,15 +19,6 @@ from graphgen.bases import BaseSearcher
 from graphgen.utils import logger
-@lru_cache(maxsize=None)
-def _get_pool():
-    return ThreadPoolExecutor(max_workers=10)
-# ensure only one NCBI request at a time
-_ncbi_lock = asyncio.Lock()
 class NCBISearch(BaseSearcher):
     """
     NCBI Search client to search DNA/GenBank/Entrez databases.
@@ -49,6 +37,8 @@ class NCBISearch(BaseSearcher):
         email: str = "email@example.com",
         api_key: str = "",
         tool: str = "GraphGen",
     ):
         """
         Initialize the NCBI Search client.
@@ -59,8 +49,8 @@ class NCBISearch(BaseSearcher):
             email (str): Email address for NCBI API requests.
             api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/.
             tool (str): Tool name for NCBI API requests.
         """
-        super().__init__()
         Entrez.timeout = 60  # 60 seconds timeout
         Entrez.email = email
         Entrez.tool = tool
@@ -70,9 +60,23 @@ class NCBISearch(BaseSearcher):
         Entrez.sleep_between_tries = 5
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
-        if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
-            logger.error("Local BLAST database files not found. Please check the path.")
-            self.use_local_blast = False
     @staticmethod
     def _nested_get(data: dict, *keys, default=None):
@@ -84,17 +88,21 @@ class NCBISearch(BaseSearcher):
         return data
     @staticmethod
-    def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
         """Infer molecule_type_detail from accession prefix or gene type."""
         if accession:
-            if accession.startswith(("NM_", "XM_")):
-                return "mRNA"
-            if accession.startswith(("NC_", "NT_")):
-                return "genomic DNA"
-            if accession.startswith(("NR_", "XR_")):
-                return "RNA"
-            if accession.startswith("NG_"):
-                return "genomic region"
         # Fallback: infer from gene type if available
         if gene_type is not None:
             gene_type_map = {
@@ -126,20 +134,25 @@ class NCBISearch(BaseSearcher):
         gene_synonyms = []
         if isinstance(synonyms_raw, list):
             for syn in synonyms_raw:
-                gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn))
         elif synonyms_raw:
             gene_synonyms.append(str(synonyms_raw))
         # Extract location info
         label = locus.get("Gene-commentary_label", "")
-        chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None
         seq_interval = self._nested_get(
             locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={}
         )
         genomic_location = (
             f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}"
-            if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to')
             else None
         )
@@ -153,7 +166,6 @@ class NCBISearch(BaseSearcher):
             None,
         )
         # Fallback: if no type 3 accession, try any available accession
-        # This is needed for genes that don't have mRNA transcripts but have other sequence records
         if not representative_accession:
             representative_accession = next(
                 (
@@ -170,7 +182,8 @@ class NCBISearch(BaseSearcher):
                 comment.get("Gene-commentary_comment")
                 for comment in data.get("Entrezgene_comments", [])
                 if isinstance(comment, dict)
-                and "function" in str(comment.get("Gene-commentary_heading", "")).lower()
             ),
             None,
         )
@@ -194,7 +207,9 @@ class NCBISearch(BaseSearcher):
                 "5": "snRNA",
                 "6": "ncRNA",
                 "7": "other",
-            }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"),
             "chromosome": chromosome_match.group(1) if chromosome_match else None,
             "genomic_location": genomic_location,
             "function": function,
@@ -209,25 +224,33 @@ class NCBISearch(BaseSearcher):
             "_representative_accession": representative_accession,
         }
-    def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
         """Get gene information by Gene ID."""
         def _extract_metadata_from_genbank(result: dict, accession: str):
             """Extract metadata from GenBank format (title, features, organism, etc.)."""
-            with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
                 record = SeqIO.read(handle, "genbank")
                 result["title"] = record.description
                 result["molecule_type_detail"] = (
-                    "mRNA" if accession.startswith(("NM_", "XM_")) else
-                    "genomic DNA" if accession.startswith(("NC_", "NT_")) else
-                    "RNA" if accession.startswith(("NR_", "XR_")) else
-                    "genomic region" if accession.startswith("NG_") else "N/A"
                 )
                 for feature in record.features:
                     if feature.type == "source":
-                        if 'chromosome' in feature.qualifiers:
-                            result["chromosome"] = feature.qualifiers['chromosome'][0]
                         if feature.location:
                             start = int(feature.location.start) + 1
@@ -236,48 +259,91 @@ class NCBISearch(BaseSearcher):
                         break
-                if not result.get("organism") and 'organism' in record.annotations:
-                    result["organism"] = record.annotations['organism']
             return result
         def _extract_sequence_from_fasta(result: dict, accession: str):
             """Extract sequence from FASTA format (more reliable than GenBank for CON-type records)."""
             try:
-                with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
                     fasta_record = SeqIO.read(fasta_handle, "fasta")
                     result["sequence"] = str(fasta_record.seq)
                     result["sequence_length"] = len(fasta_record.seq)
             except Exception as fasta_exc:
                 logger.warning(
                     "Failed to extract sequence from accession %s using FASTA format: %s",
-                    accession, fasta_exc
                 )
                 result["sequence"] = None
                 result["sequence_length"] = None
             return result
         try:
             with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
                 gene_record = Entrez.read(handle)
-                if not gene_record:
-                    return None
-                result = self._gene_record_to_dict(gene_record, gene_id)
-                if accession := (preferred_accession or result.get("_representative_accession")):
-                    result = _extract_metadata_from_genbank(result, accession)
-                    result = _extract_sequence_from_fasta(result, accession)
-                result.pop("_representative_accession", None)
-                return result
         except (RequestException, IncompleteRead):
             raise
         except Exception as exc:
             logger.error("Gene ID %s not found: %s", gene_id, exc)
             return None
     def get_by_accession(self, accession: str) -> Optional[dict]:
         """Get sequence information by accession number."""
         def _extract_gene_id(link_handle):
             """Extract GeneID from elink results."""
             links = Entrez.read(link_handle)
@@ -301,9 +367,11 @@ class NCBISearch(BaseSearcher):
                 return None
             result = self.get_by_gene_id(gene_id, preferred_accession=accession)
             if result:
                 result["id"] = accession
                 result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}"
             return result
         except (RequestException, IncompleteRead):
             raise
@@ -311,6 +379,12 @@ class NCBISearch(BaseSearcher):
             logger.error("Accession %s not found: %s", accession, exc)
             return None
     def get_best_hit(self, keyword: str) -> Optional[dict]:
         """Search NCBI Gene database with a keyword and return the best hit."""
         if not keyword.strip():
@@ -318,33 +392,113 @@ class NCBISearch(BaseSearcher):
         try:
             for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
-                with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle:
                     search_results = Entrez.read(search_handle)
-                    if len(gene_id := search_results.get("IdList", [])) > 0:
-                        return self.get_by_gene_id(gene_id)
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
             logger.error("Keyword %s not found: %s", keyword, e)
         return None
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
-        """Perform local BLAST search using local BLAST database."""
         try:
-            with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
                 tmp.write(f">query\n{seq}\n")
                 tmp_name = tmp.name
             cmd = [
-                "blastn", "-db", self.local_blast_db, "-query", tmp_name,
-                "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
             ]
-            logger.debug("Running local blastn: %s", " ".join(cmd))
-            out = subprocess.check_output(cmd, text=True).strip()
             os.remove(tmp_name)
             return out.split("\n", maxsplit=1)[0] if out else None
         except Exception as exc:
             logger.error("Local blastn failed: %s", exc)
             return None
     def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
@@ -358,8 +512,9 @@ class NCBISearch(BaseSearcher):
                 seq = sequence.strip().replace(" ", "").replace("\n", "")
             return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None
-        def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]:
             """Process network BLAST result and return dictionary or None."""
             if not blast_record.alignments:
                 logger.info("No BLAST hits found for the given sequence.")
@@ -383,7 +538,9 @@ class NCBISearch(BaseSearcher):
                 "title": best_alignment.title,
                 "sequence_length": len(seq),
                 "e_value": best_hsp.expect,
-                "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0,
                 "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}",
             }
@@ -393,15 +550,31 @@ class NCBISearch(BaseSearcher):
                 return None
             # Try local BLAST first if enabled
-            if self.use_local_blast and (accession := self._local_blast(seq, threshold)):
-                logger.debug("Local BLAST found accession: %s", accession)
-                return self.get_by_accession(accession)
-            # Fall back to network BLAST
             logger.debug("Falling back to NCBIWWW.qblast")
-            with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle:
-                return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold)
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
@@ -414,8 +587,9 @@ class NCBISearch(BaseSearcher):
         retry=retry_if_exception_type((RequestException, IncompleteRead)),
         reraise=True,
     )
-    async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]:
         """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence."""
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
             return None
@@ -423,19 +597,21 @@ class NCBISearch(BaseSearcher):
         query = query.strip()
         logger.debug("NCBI search query: %s", query)
-        loop = asyncio.get_running_loop()
-        # limit concurrent requests (NCBI rate limit: max 3 requests per second)
-        async with _ncbi_lock:
-            # Auto-detect query type and execute in thread pool
-            if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
-                result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
-            elif re.fullmatch(r"^\d+$", query):
-                result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
-            elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
-                result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query)
-            else:
-                result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
         if result:
             result["_search_query"] = query

 import os
 import re
 import subprocess
 import tempfile
 from http.client import IncompleteRead
 from typing import Dict, Optional
 from graphgen.utils import logger
 class NCBISearch(BaseSearcher):
     """
     NCBI Search client to search DNA/GenBank/Entrez databases.
         email: str = "email@example.com",
         api_key: str = "",
         tool: str = "GraphGen",
+        blast_num_threads: int = 4,
+        threshold: float = 0.01,
     ):
         """
         Initialize the NCBI Search client.
             email (str): Email address for NCBI API requests.
             api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/.
             tool (str): Tool name for NCBI API requests.
+            blast_num_threads (int): Number of threads for BLAST search.
         """
         Entrez.timeout = 60  # 60 seconds timeout
         Entrez.email = email
         Entrez.tool = tool
         Entrez.sleep_between_tries = 5
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
+        self.blast_num_threads = blast_num_threads
+        self.threshold = threshold
+        if self.use_local_blast:
+            # Check for single-file database (.nhr) or multi-file database (.00.nhr)
+            db_exists = os.path.isfile(f"{self.local_blast_db}.nhr") or os.path.isfile(
+                f"{self.local_blast_db}.00.nhr"
+            )
+            if not db_exists:
+                logger.error(
+                    "Local BLAST database files not found. Please check the path."
+                )
+                logger.error(
+                    "Expected: %s.nhr or %s.00.nhr",
+                    self.local_blast_db,
+                    self.local_blast_db,
+                )
+                self.use_local_blast = False
     @staticmethod
     def _nested_get(data: dict, *keys, default=None):
         return data
     @staticmethod
+    def _infer_molecule_type_detail(
+        accession: Optional[str], gene_type: Optional[int] = None
+    ) -> Optional[str]:
         """Infer molecule_type_detail from accession prefix or gene type."""
         if accession:
+            # Map accession prefixes to molecule types
+            prefix_map = {
+                ("NM_", "XM_"): "mRNA",
+                ("NC_", "NT_"): "genomic DNA",
+                ("NR_", "XR_"): "RNA",
+                ("NG_",): "genomic region",
+            }
+            for prefixes, mol_type in prefix_map.items():
+                if accession.startswith(prefixes):
+                    return mol_type
         # Fallback: infer from gene type if available
         if gene_type is not None:
             gene_type_map = {
         gene_synonyms = []
         if isinstance(synonyms_raw, list):
             for syn in synonyms_raw:
+                gene_synonyms.append(
+                    syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)
+                )
         elif synonyms_raw:
             gene_synonyms.append(str(synonyms_raw))
         # Extract location info
         label = locus.get("Gene-commentary_label", "")
+        chromosome_match = (
+            re.search(r"Chromosome\s+(\S+)", str(label)) if label else None
+        )
         seq_interval = self._nested_get(
             locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={}
         )
         genomic_location = (
             f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}"
+            if seq_interval.get("Seq-interval_from")
+            and seq_interval.get("Seq-interval_to")
             else None
         )
             None,
         )
         # Fallback: if no type 3 accession, try any available accession
         if not representative_accession:
             representative_accession = next(
                 (
                 comment.get("Gene-commentary_comment")
                 for comment in data.get("Entrezgene_comments", [])
                 if isinstance(comment, dict)
+                and "function"
+                in str(comment.get("Gene-commentary_heading", "")).lower()
             ),
             None,
         )
                 "5": "snRNA",
                 "6": "ncRNA",
                 "7": "other",
+            }.get(
+                str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"
+            ),
             "chromosome": chromosome_match.group(1) if chromosome_match else None,
             "genomic_location": genomic_location,
             "function": function,
             "_representative_accession": representative_accession,
         }
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((RequestException, IncompleteRead)),
+        reraise=True,
+    )
+    def get_by_gene_id(
+        self, gene_id: str, preferred_accession: Optional[str] = None
+    ) -> Optional[dict]:
         """Get gene information by Gene ID."""
         def _extract_metadata_from_genbank(result: dict, accession: str):
             """Extract metadata from GenBank format (title, features, organism, etc.)."""
+            with Entrez.efetch(
+                db="nuccore", id=accession, rettype="gb", retmode="text"
+            ) as handle:
                 record = SeqIO.read(handle, "genbank")
                 result["title"] = record.description
                 result["molecule_type_detail"] = (
+                    self._infer_molecule_type_detail(accession) or "N/A"
                 )
                 for feature in record.features:
                     if feature.type == "source":
+                        if "chromosome" in feature.qualifiers:
+                            result["chromosome"] = feature.qualifiers["chromosome"][0]
                         if feature.location:
                             start = int(feature.location.start) + 1
                         break
+                if not result.get("organism") and "organism" in record.annotations:
+                    result["organism"] = record.annotations["organism"]
             return result
         def _extract_sequence_from_fasta(result: dict, accession: str):
             """Extract sequence from FASTA format (more reliable than GenBank for CON-type records)."""
             try:
+                with Entrez.efetch(
+                    db="nuccore", id=accession, rettype="fasta", retmode="text"
+                ) as fasta_handle:
                     fasta_record = SeqIO.read(fasta_handle, "fasta")
                     result["sequence"] = str(fasta_record.seq)
                     result["sequence_length"] = len(fasta_record.seq)
             except Exception as fasta_exc:
                 logger.warning(
                     "Failed to extract sequence from accession %s using FASTA format: %s",
+                    accession,
+                    fasta_exc,
                 )
                 result["sequence"] = None
                 result["sequence_length"] = None
             return result
+        def _extract_sequence(result: dict, accession: str):
+            """
+            Extract sequence using the appropriate method based on configuration.
+            If use_local_blast=True, use local database. Otherwise, use NCBI API.
+            Always fetches sequence (no option to skip).
+            """
+            # If using local BLAST, use local database
+            if self.use_local_blast:
+                sequence = self._extract_sequence_from_local_db(accession)
+                if sequence:
+                    result["sequence"] = sequence
+                    result["sequence_length"] = len(sequence)
+                else:
+                    # Failed to extract from local DB, set to None (no fallback to API)
+                    result["sequence"] = None
+                    result["sequence_length"] = None
+                    logger.warning(
+                        "Failed to extract sequence from local DB for accession %s. "
+                        "Not falling back to NCBI API as use_local_blast=True.",
+                        accession,
+                    )
+            else:
+                # Use NCBI API to fetch sequence
+                result = _extract_sequence_from_fasta(result, accession)
+            return result
         try:
             with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
                 gene_record = Entrez.read(handle)
+            if not gene_record:
+                return None
+            result = self._gene_record_to_dict(gene_record, gene_id)
+            if accession := (
+                preferred_accession or result.get("_representative_accession")
+            ):
+                result = _extract_metadata_from_genbank(result, accession)
+                # Extract sequence using appropriate method
+                result = _extract_sequence(result, accession)
+            result.pop("_representative_accession", None)
+            return result
         except (RequestException, IncompleteRead):
             raise
         except Exception as exc:
             logger.error("Gene ID %s not found: %s", gene_id, exc)
             return None
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((RequestException, IncompleteRead)),
+        reraise=True,
+    )
     def get_by_accession(self, accession: str) -> Optional[dict]:
         """Get sequence information by accession number."""
         def _extract_gene_id(link_handle):
             """Extract GeneID from elink results."""
             links = Entrez.read(link_handle)
                 return None
             result = self.get_by_gene_id(gene_id, preferred_accession=accession)
             if result:
                 result["id"] = accession
                 result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}"
             return result
         except (RequestException, IncompleteRead):
             raise
             logger.error("Accession %s not found: %s", accession, exc)
             return None
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((RequestException, IncompleteRead)),
+        reraise=True,
+    )
     def get_best_hit(self, keyword: str) -> Optional[dict]:
         """Search NCBI Gene database with a keyword and return the best hit."""
         if not keyword.strip():
         try:
             for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
+                with Entrez.esearch(
+                    db="gene", term=search_term, retmax=1, sort="relevance"
+                ) as search_handle:
                     search_results = Entrez.read(search_handle)
+                if len(gene_id := search_results.get("IdList", [])) > 0:
+                    result = self.get_by_gene_id(gene_id[0])
+                    return result
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
             logger.error("Keyword %s not found: %s", keyword, e)
         return None
+    def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]:
+        """Extract sequence from local BLAST database using blastdbcmd."""
+        try:
+            cmd = [
+                "blastdbcmd",
+                "-db",
+                self.local_blast_db,
+                "-entry",
+                accession,
+                "-outfmt",
+                "%s",  # Only sequence, no header
+            ]
+            sequence = subprocess.check_output(
+                cmd,
+                text=True,
+                timeout=10,  # 10 second timeout for local extraction
+                stderr=subprocess.DEVNULL,
+            ).strip()
+            return sequence if sequence else None
+        except subprocess.TimeoutExpired:
+            logger.warning(
+                "Timeout extracting sequence from local DB for accession %s", accession
+            )
+            return None
+        except Exception as exc:
+            logger.warning(
+                "Failed to extract sequence from local DB for accession %s: %s",
+                accession,
+                exc,
+            )
+            return None
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
         """
         Run ``blastn`` against the local BLAST database and return the best hit.

         The query is written to a temporary FASTA file that is always removed
         before returning, whether the search succeeds, fails or times out.

         :param seq: Query sequence, written verbatim under a ``>query`` header.
         :param threshold: E-value cut-off passed to ``blastn -evalue``.
         :return: First line of the BLAST output (the subject accession, since
             the output format is ``6 sacc``), or None when BLAST prints
             nothing, times out, or fails for any other reason.
         """
         tmp_name = None
         try:
             with tempfile.NamedTemporaryFile(
                 mode="w+", suffix=".fa", delete=False
             ) as tmp:
                 # Record the path before writing so that a failed write is
                 # still cleaned up by the ``finally`` clause below.
                 tmp_name = tmp.name
                 tmp.write(f">query\n{seq}\n")

             # BLAST options:
             # - num_threads: use multiple threads for a faster search
             # - outfmt "6 sacc": only emit the subject accession
             # - max_target_seqs 1: only the best hit is needed
             # - evalue: significance threshold
             cmd = [
                 "blastn",
                 "-db",
                 self.local_blast_db,
                 "-query",
                 tmp_name,
                 "-evalue",
                 str(threshold),
                 "-max_target_seqs",
                 "1",
                 "-num_threads",
                 str(self.blast_num_threads),
                 "-outfmt",
                 "6 sacc",  # Only accession, tab-separated
             ]
             logger.debug(
                 "Running local blastn (threads=%d): %s",
                 self.blast_num_threads,
                 " ".join(cmd),
             )

             out = subprocess.check_output(
                 cmd,
                 text=True,
                 timeout=300,  # 5 minute timeout so a stuck search cannot hang
                 stderr=subprocess.DEVNULL,  # Suppress BLAST warnings to reduce I/O
             ).strip()
             return out.split("\n", maxsplit=1)[0] if out else None
         except subprocess.TimeoutExpired:
             logger.warning("BLAST search timed out after 5 minutes for sequence")
             return None
         except Exception as exc:
             logger.error("Local blastn failed: %s", exc)
             return None
         finally:
             # Single best-effort cleanup point for every exit path; a failure
             # to delete the temp file must not discard a valid BLAST result.
             if tmp_name is not None:
                 try:
                     os.remove(tmp_name)
                 except OSError as cleanup_err:
                     logger.debug(
                         "Could not remove temporary file %s: %s",
                         tmp_name,
                         cleanup_err,
                     )
     def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
                 seq = sequence.strip().replace(" ", "").replace("\n", "")
             return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None
+        def _process_network_blast_result(
+            blast_record, seq: str, threshold: float
+        ) -> Optional[dict]:
             """Process network BLAST result and return dictionary or None."""
             if not blast_record.alignments:
                 logger.info("No BLAST hits found for the given sequence.")
                 "title": best_alignment.title,
                 "sequence_length": len(seq),
                 "e_value": best_hsp.expect,
+                "identity": best_hsp.identities / best_hsp.align_length
+                if best_hsp.align_length > 0
+                else 0,
                 "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}",
             }
                 return None
             # Try local BLAST first if enabled
+            if self.use_local_blast:
+                accession = self._local_blast(seq, threshold)
+                if accession:
+                    logger.debug("Local BLAST found accession: %s", accession)
+                    # When using local BLAST, skip sequence fetching by default (faster, fewer API calls)
+                    # Sequence is already known from the query, so we only need metadata
+                    result = self.get_by_accession(accession)
+                    return result
+                logger.info(
+                    "Local BLAST found no match for sequence. "
+                    "API fallback disabled when using local database."
+                )
+                return None
+            # Fall back to network BLAST only if local BLAST is not enabled
             logger.debug("Falling back to NCBIWWW.qblast")
+            with NCBIWWW.qblast(
+                "blastn", "nr", seq, hitlist_size=1, expect=threshold
+            ) as result_handle:
+                result = _process_network_blast_result(
+                    NCBIXML.read(result_handle), seq, threshold
+                )
+            return result
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
         retry=retry_if_exception_type((RequestException, IncompleteRead)),
         reraise=True,
     )
+    def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]:
         """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence."""
+        threshold = threshold or self.threshold
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
             return None
         query = query.strip()
         logger.debug("NCBI search query: %s", query)
+        # Auto-detect query type and execute
+        # All methods call NCBI API (rate limit: max 3 requests per second)
+        # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API
+        if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
+            # FASTA sequence
+            result = self.get_by_fasta(query, threshold)
+        elif re.fullmatch(r"^\d+$", query):
+            # Gene ID
+            result = self.get_by_gene_id(query)
+        elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
+            # Accession
+            result = self.get_by_accession(query)
+        else:
+            # Keyword
+            result = self.get_best_hit(query)
         if result:
             result["_search_query"] = query

graphgen/models/searcher/db/rnacentral_searcher.py CHANGED Viewed

@@ -1,15 +1,11 @@
-import asyncio
 import os
 import re
 import subprocess
-from concurrent.futures import ThreadPoolExecutor
-from functools import lru_cache
 import tempfile
-from typing import Dict, Optional, List, Any, Set
-import hashlib
 import requests
-import aiohttp
 from tenacity import (
     retry,
     retry_if_exception_type,
@@ -21,10 +17,6 @@ from graphgen.bases import BaseSearcher
 from graphgen.utils import logger
-@lru_cache(maxsize=None)
-def _get_pool():
-    return ThreadPoolExecutor(max_workers=10)
 class RNACentralSearch(BaseSearcher):
     """
     RNAcentral Search client to search RNA databases.
@@ -35,12 +27,22 @@ class RNACentralSearch(BaseSearcher):
     API Documentation: https://rnacentral.org/api/v1
     """
-    def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"):
-        super().__init__()
         self.base_url = "https://rnacentral.org/api/v1"
         self.headers = {"Accept": "application/json"}
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
             logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
@@ -49,7 +51,7 @@ class RNACentralSearch(BaseSearcher):
     def _rna_data_to_dict(
         rna_id: str,
         rna_data: Dict[str, Any],
-        xrefs_data: Optional[List[Dict[str, Any]]] = None
     ) -> Dict[str, Any]:
         organisms, gene_names, so_terms = set(), set(), set()
         modifications: List[Any] = []
@@ -58,7 +60,8 @@ class RNACentralSearch(BaseSearcher):
             acc = xref.get("accession", {})
             if s := acc.get("species"):
                 organisms.add(s)
-            if g := acc.get("gene", "").strip():
                 gene_names.add(g)
             if m := xref.get("modifications"):
                 modifications.extend(m)
@@ -137,7 +140,9 @@ class RNACentralSearch(BaseSearcher):
         # Normalize sequence
         normalized_seq = sequence.replace("U", "T").replace("u", "t").upper()
         if not re.fullmatch(r"[ATCGN]+", normalized_seq):
-            raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...")
         return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
@@ -151,12 +156,21 @@ class RNACentralSearch(BaseSearcher):
             url = f"{self.base_url}/rna/{rna_id}"
             url += "?flat=true"
-            resp = requests.get(url, headers=self.headers, timeout=30)
             resp.raise_for_status()
             rna_data = resp.json()
             xrefs_data = rna_data.get("xrefs", [])
-            return self._rna_data_to_dict(rna_id, rna_data, xrefs_data)
         except requests.RequestException as e:
             logger.error("Network error getting RNA ID %s: %s", rna_id, e)
             return None
@@ -164,6 +178,12 @@ class RNACentralSearch(BaseSearcher):
             logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e)
             return None
     def get_best_hit(self, keyword: str) -> Optional[dict]:
         """
         Search RNAcentral with a keyword and return the best hit.
@@ -178,7 +198,9 @@ class RNACentralSearch(BaseSearcher):
         try:
             url = f"{self.base_url}/rna"
             params = {"search": keyword, "format": "json"}
-            resp = requests.get(url, params=params, headers=self.headers, timeout=30)
             resp.raise_for_status()
             data = resp.json()
@@ -206,76 +228,146 @@ class RNACentralSearch(BaseSearcher):
             return None
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
-        """Perform local BLAST search using local BLAST database."""
         try:
-            with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
                 tmp.write(f">query\n{seq}\n")
                 tmp_name = tmp.name
             cmd = [
-                "blastn", "-db", self.local_blast_db, "-query", tmp_name,
-                "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
             ]
-            logger.debug("Running local blastn for RNA: %s", " ".join(cmd))
-            out = subprocess.check_output(cmd, text=True).strip()
             os.remove(tmp_name)
             return out.split("\n", maxsplit=1)[0] if out else None
         except Exception as exc:
             logger.error("Local blastn failed: %s", exc)
             return None
-    def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
-        """
-        Search RNAcentral with an RNA sequence.
-        Tries local BLAST first if enabled, falls back to RNAcentral API.
-        Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information.
-        :param sequence: RNA sequence (FASTA format or raw sequence).
-        :param threshold: E-value threshold for BLAST search.
-        :return: A dictionary containing complete RNA information or None if not found.
-        """
-        def _extract_sequence(sequence: str) -> Optional[str]:
-            """Extract and normalize RNA sequence from input."""
-            if sequence.startswith(">"):
-                seq_lines = sequence.strip().split("\n")
-                seq = "".join(seq_lines[1:])
-            else:
-                seq = sequence.strip().replace(" ", "").replace("\n", "")
-            return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None
         try:
-            seq = _extract_sequence(sequence)
             if not seq:
                 logger.error("Empty or invalid RNA sequence provided.")
                 return None
-            # Try local BLAST first if enabled
             if self.use_local_blast:
-                accession = self._local_blast(seq, threshold)
-                if accession:
-                    logger.debug("Local BLAST found accession: %s", accession)
-                    return self.get_by_rna_id(accession)
-            # Fall back to RNAcentral API if local BLAST didn't find result
-            logger.debug("Falling back to RNAcentral API.")
-            md5_hash = self._calculate_md5(seq)
-            search_url = f"{self.base_url}/rna"
-            params = {"md5": md5_hash, "format": "json"}
-            resp = requests.get(search_url, params=params, headers=self.headers, timeout=60)
-            resp.raise_for_status()
-            search_results = resp.json()
-            results = search_results.get("results", [])
-            if not results:
-                logger.info("No exact match found in RNAcentral for sequence")
-                return None
-            rna_id = results[0].get("rnacentral_id")
-            if not rna_id:
-                logger.error("No RNAcentral ID found in search results.")
-                return None
-            return self.get_by_rna_id(rna_id)
         except Exception as e:
             logger.error("Sequence search failed: %s", e)
             return None
@@ -283,11 +375,12 @@ class RNACentralSearch(BaseSearcher):
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=2, max=10),
-        retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)),
         reraise=True,
     )
-    async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]:
         """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence."""
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
             return None
@@ -295,19 +388,20 @@ class RNACentralSearch(BaseSearcher):
         query = query.strip()
         logger.debug("RNAcentral search query: %s", query)
-        loop = asyncio.get_running_loop()
-        # check if RNA sequence (AUCG characters, contains U)
-        if query.startswith(">") or (
-            re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper()
-        ):
-            result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
         # check if RNAcentral ID (typically starts with URS)
         elif re.fullmatch(r"URS\d+", query, re.I):
-            result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query)
         else:
             # otherwise treat as keyword
-            result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
         if result:
             result["_search_query"] = query

+import hashlib
 import os
 import re
 import subprocess
 import tempfile
+from typing import Any, Dict, List, Optional, Set
 import requests
 from tenacity import (
     retry,
     retry_if_exception_type,
 from graphgen.utils import logger
 class RNACentralSearch(BaseSearcher):
     """
     RNAcentral Search client to search RNA databases.
     API Documentation: https://rnacentral.org/api/v1
     """
+    def __init__(
+        self,
+        use_local_blast: bool = False,
+        local_blast_db: str = "rna_db",
+        api_timeout: int = 30,
+        blast_num_threads: int = 4,
+        threshold: float = 0.01,
+    ):
         self.base_url = "https://rnacentral.org/api/v1"
         self.headers = {"Accept": "application/json"}
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
+        self.api_timeout = api_timeout
+        self.blast_num_threads = blast_num_threads  # Number of threads for BLAST search
+        self.threshold = threshold  # E-value threshold for BLAST search
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
             logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
     def _rna_data_to_dict(
         rna_id: str,
         rna_data: Dict[str, Any],
+        xrefs_data: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         organisms, gene_names, so_terms = set(), set(), set()
         modifications: List[Any] = []
             acc = xref.get("accession", {})
             if s := acc.get("species"):
                 organisms.add(s)
+            gene_value = acc.get("gene")
+            if isinstance(gene_value, str) and (g := gene_value.strip()):
                 gene_names.add(g)
             if m := xref.get("modifications"):
                 modifications.extend(m)
         # Normalize sequence
         normalized_seq = sequence.replace("U", "T").replace("u", "t").upper()
         if not re.fullmatch(r"[ATCGN]+", normalized_seq):
+            raise ValueError(
+                f"Invalid sequence characters after normalization: {normalized_seq[:50]}..."
+            )
         return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
             url = f"{self.base_url}/rna/{rna_id}"
             url += "?flat=true"
+            resp = requests.get(url, headers=self.headers, timeout=self.api_timeout)
             resp.raise_for_status()
             rna_data = resp.json()
             xrefs_data = rna_data.get("xrefs", [])
+            result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data)
+            return result
+        except requests.Timeout as e:
+            logger.warning(
+                "Timeout getting RNA ID %s (timeout=%ds): %s",
+                rna_id,
+                self.api_timeout,
+                e,
+            )
+            return None
         except requests.RequestException as e:
             logger.error("Network error getting RNA ID %s: %s", rna_id, e)
             return None
             logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e)
             return None
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((requests.Timeout, requests.RequestException)),
+        reraise=False,
+    )
     def get_best_hit(self, keyword: str) -> Optional[dict]:
         """
         Search RNAcentral with a keyword and return the best hit.
         try:
             url = f"{self.base_url}/rna"
             params = {"search": keyword, "format": "json"}
+            resp = requests.get(
+                url, params=params, headers=self.headers, timeout=self.api_timeout
+            )
             resp.raise_for_status()
             data = resp.json()
             return None
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
         """
         Run ``blastn`` against the local BLAST database and return the best hit.

         The query is written to a temporary FASTA file that is always removed
         before returning, whether the search succeeds, fails or times out.

         :param seq: Query sequence, written verbatim under a ``>query`` header.
         :param threshold: E-value cut-off passed to ``blastn -evalue``.
         :return: First line of the BLAST output (the subject accession, since
             the output format is ``6 sacc``), or None when BLAST prints
             nothing, times out, or fails for any other reason.
         """
         tmp_name = None
         try:
             # Use temporary file for query sequence
             with tempfile.NamedTemporaryFile(
                 mode="w+", suffix=".fa", delete=False
             ) as tmp:
                 # Record the path before writing so that a failed write is
                 # still cleaned up by the ``finally`` clause below.
                 tmp_name = tmp.name
                 tmp.write(f">query\n{seq}\n")

             # BLAST options:
             # - num_threads: use multiple threads for a faster search
             # - outfmt "6 sacc": only emit the subject accession
             # - max_target_seqs 1: only the best hit is needed
             # - evalue: significance threshold
             cmd = [
                 "blastn",
                 "-db",
                 self.local_blast_db,
                 "-query",
                 tmp_name,
                 "-evalue",
                 str(threshold),
                 "-max_target_seqs",
                 "1",
                 "-num_threads",
                 str(self.blast_num_threads),
                 "-outfmt",
                 "6 sacc",  # Only accession, tab-separated
             ]
             logger.debug(
                 "Running local blastn for RNA (threads=%d): %s",
                 self.blast_num_threads,
                 " ".join(cmd),
             )

             out = subprocess.check_output(
                 cmd,
                 text=True,
                 timeout=300,  # 5 minute timeout so a stuck search cannot hang
                 stderr=subprocess.DEVNULL,  # Suppress BLAST warnings to reduce I/O
             ).strip()
             return out.split("\n", maxsplit=1)[0] if out else None
         except subprocess.TimeoutExpired:
             logger.warning("BLAST search timed out after 5 minutes for sequence")
             return None
         except Exception as exc:
             logger.error("Local blastn failed: %s", exc)
             return None
         finally:
             # Single best-effort cleanup point for every exit path; a failure
             # to delete the temp file must not discard a valid BLAST result.
             if tmp_name is not None:
                 try:
                     os.remove(tmp_name)
                 except OSError as cleanup_err:
                     logger.debug(
                         "Could not remove temporary file %s: %s",
                         tmp_name,
                         cleanup_err,
                     )
+    @staticmethod
+    def _extract_rna_sequence(sequence: str) -> Optional[str]:
+        """Extract and normalize RNA sequence from input."""
+        if sequence.startswith(">"):
+            seq_lines = sequence.strip().split("\n")
+            seq = "".join(seq_lines[1:])
+        else:
+            seq = sequence.strip().replace(" ", "").replace("\n", "")
+        # Accept both U (original RNA) and T
+        return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None
+    def _search_with_local_blast(self, seq: str, threshold: float) -> Optional[dict]:
+        """Search using local BLAST database."""
+        accession = self._local_blast(seq, threshold)
+        if not accession:
+            logger.info(
+                "Local BLAST found no match for sequence. "
+                "API fallback disabled when using local database."
+            )
+            return None
+        logger.debug("Local BLAST found accession: %s", accession)
+        detailed = self.get_by_rna_id(accession)
+        if detailed:
+            return detailed
+        logger.info(
+            "Local BLAST found accession %s but could not retrieve metadata from API.",
+            accession,
+        )
+        return None
+    def _search_with_api(self, seq: str) -> Optional[dict]:
+        """Search using RNAcentral API with MD5 hash."""
+        logger.debug("Falling back to RNAcentral API.")
+        md5_hash = self._calculate_md5(seq)
+        search_url = f"{self.base_url}/rna"
+        params = {"md5": md5_hash, "format": "json"}
+        resp = requests.get(
+            search_url, params=params, headers=self.headers, timeout=60
+        )
+        resp.raise_for_status()
+        search_results = resp.json()
+        results = search_results.get("results", [])
+        if not results:
+            logger.info("No exact match found in RNAcentral for sequence")
+            return None
+        rna_id = results[0].get("rnacentral_id")
+        if not rna_id:
+            logger.error("No RNAcentral ID found in search results.")
+            return None
+        detailed = self.get_by_rna_id(rna_id)
+        if detailed:
+            return detailed
+        # Fallback: use search result data if get_by_rna_id returns None
+        logger.debug(
+            "Using search result data for %s (get_by_rna_id returned None)", rna_id
+        )
+        return self._rna_data_to_dict(rna_id, results[0])
+    def get_by_fasta(
+        self, sequence: str, threshold: float = 0.01
+    ) -> Optional[dict]:
+        """Search RNAcentral with an RNA sequence."""
         try:
+            seq = self._extract_rna_sequence(sequence)
             if not seq:
                 logger.error("Empty or invalid RNA sequence provided.")
                 return None
             if self.use_local_blast:
+                return self._search_with_local_blast(seq, threshold)
+            return self._search_with_api(seq)
         except Exception as e:
             logger.error("Sequence search failed: %s", e)
             return None
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((requests.Timeout, requests.RequestException)),
         reraise=True,
     )
+    def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]:
         """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence."""
+        threshold = threshold or self.threshold
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
             return None
         query = query.strip()
         logger.debug("RNAcentral search query: %s", query)
+        # check if RNA sequence (AUCG or ATCG characters, contains U or T)
+        # Note: Sequences with T are also RNA sequences
+        is_rna_sequence = query.startswith(">") or (
+            re.fullmatch(r"[AUCGTN\s]+", query, re.I)
+            and ("U" in query.upper() or "T" in query.upper())
+        )
+        if is_rna_sequence:
+            result = self.get_by_fasta(query, threshold)
         # check if RNAcentral ID (typically starts with URS)
         elif re.fullmatch(r"URS\d+", query, re.I):
+            result = self.get_by_rna_id(query)
         else:
             # otherwise treat as keyword
+            result = self.get_best_hit(query)
         if result:
             result["_search_query"] = query

graphgen/models/searcher/db/uniprot_searcher.py CHANGED Viewed

@@ -1,10 +1,7 @@
-import asyncio
 import os
 import re
 import subprocess
 import tempfile
-from concurrent.futures import ThreadPoolExecutor
-from functools import lru_cache
 from io import StringIO
 from typing import Dict, Optional
@@ -22,15 +19,6 @@ from graphgen.bases import BaseSearcher
 from graphgen.utils import logger
-@lru_cache(maxsize=None)
-def _get_pool():
-    return ThreadPoolExecutor(max_workers=10)
-# ensure only one BLAST searcher at a time
-_blast_lock = asyncio.Lock()
 class UniProtSearch(BaseSearcher):
     """
     UniProt Search client to searcher with UniProt.
@@ -39,10 +27,18 @@ class UniProtSearch(BaseSearcher):
     3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async.
     """
-    def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"):
-        super().__init__()
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"):
             logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
@@ -61,7 +57,7 @@ class UniProtSearch(BaseSearcher):
     @staticmethod
     def _swissprot_to_dict(record: SwissProt.Record) -> dict:
-        """error
         Convert a SwissProt.Record to a dictionary.
         """
         functions = []
@@ -104,75 +100,88 @@ class UniProtSearch(BaseSearcher):
             logger.error("Keyword %s not found: %s", keyword, e)
         return None
-    def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
         """
-        Search UniProt with a FASTA sequence and return the best hit.
         :param fasta_sequence: The FASTA sequence.
-        :param threshold: E-value threshold for BLAST searcher.
-        :return: A dictionary containing the best hit information or None if not found.
         """
         try:
             if fasta_sequence.startswith(">"):
                 seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq)
             else:
                 seq = fasta_sequence.strip()
         except Exception as e:  # pylint: disable=broad-except
             logger.error("Invalid FASTA sequence: %s", e)
             return None
-        if not seq:
-            logger.error("Empty FASTA sequence provided.")
             return None
-        accession = None
-        if self.use_local_blast:
-            accession = self._local_blast(seq, threshold)
-            if accession:
-                logger.debug("Local BLAST found accession: %s", accession)
-        if not accession:
-            logger.debug("Falling back to NCBIWWW.qblast.")
-            # UniProtKB/Swiss-Prot BLAST API
-            try:
-                logger.debug(
-                    "Performing BLAST searcher for the given sequence: %s", seq
-                )
-                result_handle = NCBIWWW.qblast(
-                    program="blastp",
-                    database="swissprot",
-                    sequence=seq,
-                    hitlist_size=1,
-                    expect=threshold,
-                )
-                blast_record = NCBIXML.read(result_handle)
-            except RequestException:
-                raise
-            except Exception as e:  # pylint: disable=broad-except
-                logger.error("BLAST searcher failed: %s", e)
-                return None
-            if not blast_record.alignments:
-                logger.info("No BLAST hits found for the given sequence.")
-                return None
-            best_alignment = blast_record.alignments[0]
-            best_hsp = best_alignment.hsps[0]
-            if best_hsp.expect > threshold:
-                logger.info("No BLAST hits below the threshold E-value.")
-                return None
-            hit_id = best_alignment.hit_id
-            # like sp|P01308.1|INS_HUMAN
-            accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
-        return self.get_by_accession(accession)
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
         """
         Perform local BLAST search using local BLAST database.
-        :param seq: The protein sequence.
-        :param threshold: E-value threshold for BLAST searcher.
-        :return: The accession number of the best hit or None if not found.
         """
         try:
             with tempfile.NamedTemporaryFile(
@@ -181,6 +190,11 @@ class UniProtSearch(BaseSearcher):
                 tmp.write(f">query\n{seq}\n")
                 tmp_name = tmp.name
             cmd = [
                 "blastp",
                 "-db",
@@ -191,11 +205,30 @@ class UniProtSearch(BaseSearcher):
                 str(threshold),
                 "-max_target_seqs",
                 "1",
                 "-outfmt",
-                "6 sacc",  # only return accession
             ]
-            logger.debug("Running local blastp: %s", " ".join(cmd))
-            out = subprocess.check_output(cmd, text=True).strip()
             os.remove(tmp_name)
             if out:
                 return out.split("\n", maxsplit=1)[0]
@@ -210,16 +243,14 @@ class UniProtSearch(BaseSearcher):
         retry=retry_if_exception_type(RequestException),
         reraise=True,
     )
-    async def search(
-        self, query: str, threshold: float = 0.7, **kwargs
-    ) -> Optional[Dict]:
         """
         Search UniProt with either an accession number, keyword, or FASTA sequence.
         :param query: The searcher query (accession number, keyword, or FASTA sequence).
         :param threshold: E-value threshold for BLAST searcher.
         :return: A dictionary containing the best hit information or None if not found.
         """
         # auto detect query type
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
@@ -228,26 +259,21 @@ class UniProtSearch(BaseSearcher):
         logger.debug("UniProt searcher query: %s", query)
-        loop = asyncio.get_running_loop()
         # check if fasta sequence
         if query.startswith(">") or re.fullmatch(
             r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
         ):
-            async with _blast_lock:
-                result = await loop.run_in_executor(
-                    _get_pool(), self.get_by_fasta, query, threshold
-                )
         # check if accession number
-        elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
-            result = await loop.run_in_executor(
-                _get_pool(), self.get_by_accession, query
-            )
         else:
             # otherwise treat as keyword
-            result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
         if result:
             result["_search_query"] = query

 import os
 import re
 import subprocess
 import tempfile
 from io import StringIO
 from typing import Dict, Optional
 from graphgen.utils import logger
 class UniProtSearch(BaseSearcher):
     """
     UniProt Search client to searcher with UniProt.
     3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async.
     """
+    def __init__(
+        self,
+        use_local_blast: bool = False,
+        local_blast_db: str = "sp_db",
+        blast_num_threads: int = 4,
+        threshold: float = 0.01,
+    ):
+        """
+        Configure the UniProt searcher.
+        :param use_local_blast: Whether sequence queries use the local BLAST database
+            instead of NCBIWWW.
+        :param local_blast_db: Path prefix of the local BLAST database.
+        :param blast_num_threads: Value passed to blastp as ``-num_threads``.
+        :param threshold: Default E-value threshold used by ``search`` when the caller
+            does not supply one.
+        """
         self.use_local_blast = use_local_blast
         self.local_blast_db = local_blast_db
+        self.blast_num_threads = blast_num_threads  # Number of threads for BLAST search
+        self.threshold = threshold
+        # The "<db>.phr" file is used as the existence probe for the local database;
+        # when it is missing, local BLAST is switched off rather than failing later.
         if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"):
             logger.error("Local BLAST database files not found. Please check the path.")
             self.use_local_blast = False
     @staticmethod
     def _swissprot_to_dict(record: SwissProt.Record) -> dict:
+        """
         Convert a SwissProt.Record to a dictionary.
         """
         functions = []
             logger.error("Keyword %s not found: %s", keyword, e)
         return None
+    def _parse_fasta_sequence(self, fasta_sequence: str) -> Optional[str]:
         """
+        Parse and extract sequence from FASTA format.
         :param fasta_sequence: The FASTA sequence.
+        :return: Extracted sequence string or None if invalid.
         """
         try:
             if fasta_sequence.startswith(">"):
                 seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq)
             else:
                 seq = fasta_sequence.strip()
+            return seq if seq else None
         except Exception as e:  # pylint: disable=broad-except
             logger.error("Invalid FASTA sequence: %s", e)
             return None
+    def _search_with_local_blast(self, seq: str, threshold: float) -> Optional[Dict]:
+        """
+        Search using local BLAST database.
+        Runs ``_local_blast`` to obtain an accession and, if one is found, resolves it
+        through ``get_by_accession``. There is no fallback to the network BLAST path.
+        :param seq: The sequence to search for.
+        :param threshold: E-value threshold forwarded to ``_local_blast``.
+        :return: The result of ``get_by_accession`` for the hit, or None if there is no hit.
+        """
+        accession = self._local_blast(seq, threshold)
+        if not accession:
+            logger.info(
+                "Local BLAST found no match for sequence. "
+                "API fallback disabled when using local database."
+            )
             return None
+        logger.debug("Local BLAST found accession: %s", accession)
+        return self.get_by_accession(accession)
+    def _search_with_network_blast(self, seq: str, threshold: float) -> Optional[Dict]:
+        """Search using network BLAST (NCBIWWW)."""
+        logger.debug("Falling back to NCBIWWW.qblast.")
+        try:
+            logger.debug("Performing BLAST searcher for the given sequence: %s", seq)
+            result_handle = NCBIWWW.qblast(
+                program="blastp",
+                database="swissprot",
+                sequence=seq,
+                hitlist_size=1,
+                expect=threshold,
+            )
+            blast_record = NCBIXML.read(result_handle)
+        except RequestException:
+            raise
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("BLAST searcher failed: %s", e)
+            return None
+        if not blast_record.alignments:
+            logger.info("No BLAST hits found for the given sequence.")
+            return None
+        best_alignment = blast_record.alignments[0]
+        best_hsp = best_alignment.hsps[0]
+        if best_hsp.expect > threshold:
+            logger.info("No BLAST hits below the threshold E-value.")
+            return None
+        # like sp|P01308.1|INS_HUMAN
+        hit_id = best_alignment.hit_id
+        accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
+        return self.get_by_accession(accession)
+    def get_by_fasta(
+        self, fasta_sequence: str, threshold: float
+    ) -> Optional[Dict]:
+        """
+        Search UniProt with a FASTA sequence and return the best hit.
+        The input is first reduced to a plain sequence by ``_parse_fasta_sequence``;
+        the search then goes to the local BLAST path when ``use_local_blast`` is set
+        and to the network BLAST path otherwise.
+        :param fasta_sequence: FASTA text or a raw sequence string.
+        :param threshold: E-value threshold forwarded to the selected search method.
+        :return: The selected search method's result, or None if no sequence could be extracted.
+        """
+        seq = self._parse_fasta_sequence(fasta_sequence)
+        if not seq:
+            logger.error("Empty FASTA sequence provided.")
+            return None
+        # Exactly one backend is used per call; the two paths are never chained.
+        search_method = (
+            self._search_with_local_blast if self.use_local_blast
+            else self._search_with_network_blast
+        )
+        return search_method(seq, threshold)
     def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
         """
         Perform local BLAST search using local BLAST database.
+        Optimized with multi-threading and faster output format.
         """
         try:
             with tempfile.NamedTemporaryFile(
                 tmp.write(f">query\n{seq}\n")
                 tmp_name = tmp.name
+            # Optimized BLAST command with:
+            # - num_threads: Use multiple threads for faster search
+            # - outfmt 6 sacc: Only return accession (minimal output)
+            # - max_target_seqs 1: Only need the best hit
+            # - evalue: Threshold for significance
             cmd = [
                 "blastp",
                 "-db",
                 str(threshold),
                 "-max_target_seqs",
                 "1",
+                "-num_threads",
+                str(self.blast_num_threads),
                 "-outfmt",
+                "6 sacc",  # Only accession, tab-separated
             ]
+            logger.debug(
+                "Running local blastp (threads=%d): %s",
+                self.blast_num_threads,
+                " ".join(cmd),
+            )
+            # Run BLAST with timeout to avoid hanging
+            try:
+                out = subprocess.check_output(
+                    cmd,
+                    text=True,
+                    timeout=300,  # 5 minute timeout for BLAST search
+                    stderr=subprocess.DEVNULL,  # Suppress BLAST warnings to reduce I/O
+                ).strip()
+            except subprocess.TimeoutExpired:
+                logger.warning("BLAST search timed out after 5 minutes for sequence")
+                os.remove(tmp_name)
+                return None
             os.remove(tmp_name)
             if out:
                 return out.split("\n", maxsplit=1)[0]
         retry=retry_if_exception_type(RequestException),
         reraise=True,
     )
+    def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]:
         """
         Search UniProt with either an accession number, keyword, or FASTA sequence.
         :param query: The searcher query (accession number, keyword, or FASTA sequence).
         :param threshold: E-value threshold for BLAST searcher.
         :return: A dictionary containing the best hit information or None if not found.
         """
+        threshold = threshold or self.threshold
         # auto detect query type
         if not query or not isinstance(query, str):
             logger.error("Empty or non-string input.")
         logger.debug("UniProt searcher query: %s", query)
         # check if fasta sequence
         if query.startswith(">") or re.fullmatch(
             r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
         ):
+            result = self.get_by_fasta(query, threshold)
         # check if accession number
+        # UniProt accession IDs: 6-10 characters, must start with a letter
+        # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric)
+        elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I):
+            result = self.get_by_accession(query)
         else:
             # otherwise treat as keyword
+            result = self.get_best_hit(query)
         if result:
             result["_search_query"] = query

graphgen/models/searcher/web/bing_search.py CHANGED Viewed

@@ -1,3 +1,9 @@
 import requests
 from fastapi import HTTPException

+"""
+To use Bing Web Search API,
+follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
+and obtain your Bing subscription key.
+"""
 import requests
 from fastapi import HTTPException

graphgen/models/searcher/web/google_search.py CHANGED Viewed

@@ -1,3 +1,9 @@
 import requests
 from fastapi import HTTPException

+"""
+To use Google Web Search API,
+follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
+to get your Google searcher api key.
+"""
 import requests
 from fastapi import HTTPException

graphgen/operators/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from .judge import JudgeService
 from .partition import PartitionService
 from .quiz import QuizService
 from .read import read
-from .search import search_all
 operators = {
     "read": read,
@@ -15,7 +15,7 @@ operators = {
     "quiz": QuizService,
     "judge": JudgeService,
     "extract": ExtractService,
-    "search": search_all,
     "partition": PartitionService,
     "generate": GenerateService,
 }

 from .partition import PartitionService
 from .quiz import QuizService
 from .read import read
+from .search import SearchService
 operators = {
     "read": read,
     "quiz": QuizService,
     "judge": JudgeService,
     "extract": ExtractService,
+    "search": SearchService,
     "partition": PartitionService,
     "generate": GenerateService,
 }

graphgen/operators/search/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- from .search_all import search_all


1	+ from .search_service import SearchService

graphgen/operators/search/search_all.py DELETED Viewed

@@ -1,83 +0,0 @@
-"""
-To use Google Web Search API,
-follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
-to get your Google searcher api key.
-To use Bing Web Search API,
-follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
-and obtain your Bing subscription key.
-"""
-from graphgen.utils import logger, run_concurrent
-async def search_all(
-    seed_data: dict,
-    search_config: dict,
-) -> dict:
-    """
-    Perform searches across multiple search types and aggregate the results.
-    :param seed_data: A dictionary containing seed data with entity names.
-    :param search_config: A dictionary specifying which data sources to use for searching.
-    :return: A dictionary with
-    """
-    results = {}
-    data_sources = search_config.get("data_sources", [])
-    for data_source in data_sources:
-        data = list(seed_data.values())
-        data = [d["content"] for d in data if "content" in d]
-        data = list(set(data))  # Remove duplicates
-        if data_source == "uniprot":
-            from graphgen.models import UniProtSearch
-            uniprot_search_client = UniProtSearch(
-                **search_config.get("uniprot_params", {})
-            )
-            uniprot_results = await run_concurrent(
-                uniprot_search_client.search,
-                data,
-                desc="Searching UniProt database",
-                unit="keyword",
-            )
-            results[data_source] = uniprot_results
-        elif data_source == "ncbi":
-            from graphgen.models import NCBISearch
-            ncbi_search_client = NCBISearch(
-                **search_config.get("ncbi_params", {})
-            )
-            ncbi_results = await run_concurrent(
-                ncbi_search_client.search,
-                data,
-                desc="Searching NCBI database",
-                unit="keyword",
-            )
-            results[data_source] = ncbi_results
-        elif data_source == "rnacentral":
-            from graphgen.models import RNACentralSearch
-            rnacentral_search_client = RNACentralSearch(
-                **search_config.get("rnacentral_params", {})
-            )
-            rnacentral_results = await run_concurrent(
-                rnacentral_search_client.search,
-                data,
-                desc="Searching RNAcentral database",
-                unit="keyword",
-            )
-            results[data_source] = rnacentral_results
-        else:
-            logger.error("Data source %s not supported.", data_source)
-            continue
-    return results

graphgen/operators/search/search_service.py ADDED Viewed

	@@ -0,0 +1,163 @@

+from functools import partial
+from typing import Optional
+import pandas as pd
+from graphgen.bases import BaseOperator
+from graphgen.common import init_storage
+from graphgen.utils import compute_content_hash, logger, run_concurrent
+class SearchService(BaseOperator):
+    """
+    Service class for performing searches across multiple data sources.
+    Provides search functionality for UniProt, NCBI, and RNAcentral databases.
+    """
+    def __init__(
+        self,
+        working_dir: str = "cache",
+        kv_backend: str = "rocksdb",
+        data_sources: list = None,
+        **kwargs,
+    ):
+        super().__init__(working_dir=working_dir, op_name="search_service")
+        self.working_dir = working_dir
+        self.data_sources = data_sources or []
+        self.kwargs = kwargs
+        self.search_storage = init_storage(
+            backend=kv_backend, working_dir=working_dir, namespace="search"
+        )
+        self.searchers = {}
+    def _init_searchers(self):
+        """
+        Initialize all searchers (deferred import to avoid circular imports).
+        """
+        for datasource in self.data_sources:
+            if datasource in self.searchers:
+                continue
+            if datasource == "uniprot":
+                from graphgen.models import UniProtSearch
+                params = self.kwargs.get("uniprot_params", {})
+                self.searchers[datasource] = UniProtSearch(**params)
+            elif datasource == "ncbi":
+                from graphgen.models import NCBISearch
+                params = self.kwargs.get("ncbi_params", {})
+                self.searchers[datasource] = NCBISearch(**params)
+            elif datasource == "rnacentral":
+                from graphgen.models import RNACentralSearch
+                params = self.kwargs.get("rnacentral_params", {})
+                self.searchers[datasource] = RNACentralSearch(**params)
+            else:
+                logger.error(f"Unknown data source: {datasource}, skipping")
+    @staticmethod
+    async def _perform_search(
+        seed: dict, searcher_obj, data_source: str
+    ) -> Optional[dict]:
+        """
+        Perform search for a single seed using the specified searcher.
+        :param seed: The seed document with 'content' field
+        :param searcher_obj: The searcher instance
+        :param data_source: The data source name
+        :return: Search result with metadata
+        """
+        query = seed.get("content", "")
+        if not query:
+            logger.warning("Empty query for seed: %s", seed)
+            return None
+        result = searcher_obj.search(query)
+        if result:
+            result["_doc_id"] = compute_content_hash(str(data_source) + query, "doc-")
+            result["data_source"] = data_source
+            result["type"] = seed.get("type", "text")
+        return result
+    def _process_single_source(
+        self, data_source: str, seed_data: list[dict]
+    ) -> list[dict]:
+        """
+        process a single data source: check cache, search missing, update cache.
+        """
+        searcher = self.searchers[data_source]
+        seeds_with_ids = []
+        for seed in seed_data:
+            query = seed.get("content", "")
+            if not query:
+                continue
+            doc_id = compute_content_hash(str(data_source) + query, "doc-")
+            seeds_with_ids.append((doc_id, seed))
+        if not seeds_with_ids:
+            return []
+        doc_ids = [doc_id for doc_id, _ in seeds_with_ids]
+        cached_results = self.search_storage.get_by_ids(doc_ids)
+        to_search_seeds = []
+        final_results = []
+        for (doc_id, seed), cached in zip(seeds_with_ids, cached_results):
+            if cached is not None:
+                if "_doc_id" not in cached:
+                    cached["_doc_id"] = doc_id
+                final_results.append(cached)
+            else:
+                to_search_seeds.append(seed)
+        if to_search_seeds:
+            new_results = run_concurrent(
+                partial(
+                    self._perform_search, searcher_obj=searcher, data_source=data_source
+                ),
+                to_search_seeds,
+                desc=f"Searching {data_source} database",
+                unit="keyword",
+            )
+            new_results = [res for res in new_results if res is not None]
+            if new_results:
+                upsert_data = {res["_doc_id"]: res for res in new_results}
+                self.search_storage.upsert(upsert_data)
+                logger.info(
+                    f"Saved {len(upsert_data)} new results to {data_source} cache"
+                )
+            final_results.extend(new_results)
+        return final_results
+    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
+        docs = batch.to_dict(orient="records")
+        self._init_searchers()
+        seed_data = [doc for doc in docs if doc and "content" in doc]
+        if not seed_data:
+            logger.warning("No valid seeds in batch")
+            return pd.DataFrame([])
+        all_results = []
+        for data_source in self.data_sources:
+            if data_source not in self.searchers:
+                logger.error(f"Data source {data_source} not initialized, skipping")
+                continue
+            source_results = self._process_single_source(data_source, seed_data)
+            all_results.extend(source_results)
+        if not all_results:
+            logger.warning("No search results generated for this batch")
+        return pd.DataFrame(all_results)