Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
c91a105
1
Parent(s):
cebee97
Auto-sync from demo at Tue Feb 10 05:21:56 UTC 2026
Browse files
graphgen/models/__init__.py
CHANGED
|
@@ -42,6 +42,7 @@ if TYPE_CHECKING:
|
|
| 42 |
TXTReader,
|
| 43 |
)
|
| 44 |
from .rephraser import StyleControlledRephraser
|
|
|
|
| 45 |
from .searcher.db.ncbi_searcher import NCBISearch
|
| 46 |
from .searcher.db.rnacentral_searcher import RNACentralSearch
|
| 47 |
from .searcher.db.uniprot_searcher import UniProtSearch
|
|
@@ -95,6 +96,7 @@ _import_map = {
|
|
| 95 |
"TXTReader": ".reader",
|
| 96 |
"HuggingFaceReader": ".reader",
|
| 97 |
# Searcher
|
|
|
|
| 98 |
"NCBISearch": ".searcher.db.ncbi_searcher",
|
| 99 |
"RNACentralSearch": ".searcher.db.rnacentral_searcher",
|
| 100 |
"UniProtSearch": ".searcher.db.uniprot_searcher",
|
|
|
|
| 42 |
TXTReader,
|
| 43 |
)
|
| 44 |
from .rephraser import StyleControlledRephraser
|
| 45 |
+
from .searcher.db.interpro_searcher import InterProSearch
|
| 46 |
from .searcher.db.ncbi_searcher import NCBISearch
|
| 47 |
from .searcher.db.rnacentral_searcher import RNACentralSearch
|
| 48 |
from .searcher.db.uniprot_searcher import UniProtSearch
|
|
|
|
| 96 |
"TXTReader": ".reader",
|
| 97 |
"HuggingFaceReader": ".reader",
|
| 98 |
# Searcher
|
| 99 |
+
"InterProSearch": ".searcher.db.interpro_searcher",
|
| 100 |
"NCBISearch": ".searcher.db.ncbi_searcher",
|
| 101 |
"RNACentralSearch": ".searcher.db.rnacentral_searcher",
|
| 102 |
"UniProtSearch": ".searcher.db.uniprot_searcher",
|
graphgen/models/searcher/db/interpro_searcher.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from requests.exceptions import RequestException
|
| 6 |
+
from tenacity import (
|
| 7 |
+
retry,
|
| 8 |
+
retry_if_exception_type,
|
| 9 |
+
stop_after_attempt,
|
| 10 |
+
wait_exponential,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from graphgen.bases import BaseSearcher
|
| 14 |
+
from graphgen.utils import logger
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class InterProSearch(BaseSearcher):
    """
    InterPro Search client to search protein domains and functional annotations.
    Supports:
    1) Get protein domain information by UniProt accession number.

    API Documentation: https://www.ebi.ac.uk/interpro/api/
    """

    # Root of the InterPro REST API; every request URL is built from it.
    BASE_URL = "https://www.ebi.ac.uk/interpro/api"

    # Loose "looks like a UniProt accession" pattern: one letter followed by
    # 5-9 alphanumerics (6-10 chars total), case-insensitive, e.g. P01308.
    # Compiled once instead of on every validation call.
    _UNIPROT_ACCESSION_RE = re.compile(r"[A-Z][A-Z0-9]{5,9}", re.I)

    def __init__(
        self,
        api_timeout: int = 30,
    ):
        """
        Initialize the InterPro Search client.

        Args:
            api_timeout (int): Request timeout in seconds.
        """
        self.api_timeout = api_timeout

    @staticmethod
    def _is_uniprot_accession(text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) looks like a UniProt accession."""
        return bool(InterProSearch._UNIPROT_ACCESSION_RE.fullmatch(text.strip()))

    def _get_json(self, url: str, failure_message: str, subject: str) -> Optional[Dict]:
        """
        GET ``url`` and return its decoded JSON object.

        Args:
            url (str): Fully built request URL.
            failure_message (str): Logging template taking ``(subject, status_code)``,
                emitted as a warning when the status is not 200.
            subject (str): Accession the request is about; used only for logging.

        Returns:
            The decoded JSON object, or None when the status code is not 200
            or the payload is not a JSON object.

        Raises:
            requests.exceptions.RequestException: Propagated from ``requests.get``
                so that the retry policy on ``search`` can act on it.
        """
        response = requests.get(url, timeout=self.api_timeout)
        if response.status_code != 200:
            logger.warning(failure_message, subject, response.status_code)
            return None

        payload = response.json()
        if not isinstance(payload, dict):
            # Callers use dict access on the payload; a null/list body would
            # otherwise surface as an AttributeError further down.
            logger.warning("Unexpected InterPro response type for %s", subject)
            return None
        return payload

    def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
        """
        Search InterPro database by UniProt accession number.

        Queries the InterPro API for the entries matched to the given UniProt
        accession, then issues one extra request per returned entry to attach
        its details under the ``"entry_details"`` key.

        Args:
            accession (str): UniProt accession number.

        Returns:
            Dictionary with keys ``molecule_type``, ``database``, ``id``,
            ``content`` (the list of API results) and ``url``; None if the
            accession is invalid or the request did not succeed.
        """
        if not isinstance(accession, str) or not self._is_uniprot_accession(accession):
            logger.error("Invalid accession provided")
            return None

        accession = accession.strip().upper()

        data = self._get_json(
            f"{self.BASE_URL}/entry/interpro/protein/uniprot/{accession}/",
            "Failed to search InterPro for accession %s: %d",
            accession,
        )
        if data is None:
            return None

        # ``or`` guards against explicit JSON nulls, which ``.get(key, default)``
        # would hand back unchanged.
        entries = data.get("results") or []
        for entry in entries:
            interpro_acc = (entry.get("metadata") or {}).get("accession")
            if not interpro_acc:
                continue
            entry_details = self.get_entry_details(interpro_acc)
            if entry_details:
                entry["entry_details"] = entry_details

        return {
            "molecule_type": "protein",
            "database": "InterPro",
            "id": accession,
            "content": entries,
            "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/",
        }

    def get_entry_details(self, interpro_accession: str) -> Optional[Dict]:
        """
        Get detailed information for a specific InterPro entry.

        Args:
            interpro_accession (str): InterPro accession number (e.g., IPR000001).

        Returns:
            Dictionary with entry details or None if not found.
        """
        # Local import: keeps this block self-contained without touching the
        # module-level import section.
        from urllib.parse import quote

        if not interpro_accession or not isinstance(interpro_accession, str):
            return None

        cleaned = interpro_accession.strip()
        if not cleaned:
            # A blank accession would turn the URL into the "list all entries"
            # endpoint rather than a single-entry lookup.
            return None

        # Escape the path segment so characters such as "/", "?" or "#" in the
        # input cannot change which endpoint is requested.
        return self._get_json(
            f"{self.BASE_URL}/entry/interpro/{quote(cleaned, safe='')}/",
            "Failed to get InterPro entry %s: %d",
            interpro_accession,
        )

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=5),
        retry=retry_if_exception_type(RequestException),
        reraise=True,
    )
    def search(self, query: str, **kwargs) -> Optional[Dict]:
        """
        Search InterPro for protein domain information by UniProt accession.

        The call is retried (up to 3 attempts, exponential wait) when a
        ``RequestException`` is raised; the last exception is re-raised.

        Args:
            query (str): UniProt accession number (e.g., P01308, Q96KN2).
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary with domain information (plus a ``_search_query`` key
            holding the stripped query) or None if not found.
        """
        if not query or not isinstance(query, str):
            logger.error("Empty or non-string input")
            return None

        query = query.strip()
        logger.debug("InterPro search query: %s", query[:100])

        result = self.search_by_uniprot_id(query)
        if result:
            result["_search_query"] = query

        return result
|
graphgen/operators/search/search_service.py
CHANGED
|
@@ -58,6 +58,11 @@ class SearchService(BaseOperator):
|
|
| 58 |
|
| 59 |
params = self.kwargs.get("rnacentral_params", {})
|
| 60 |
self.searcher = RNACentralSearch(**params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
else:
|
| 62 |
logger.error(f"Unknown data source: {self.data_source}")
|
| 63 |
|
|
|
|
| 58 |
|
| 59 |
params = self.kwargs.get("rnacentral_params", {})
|
| 60 |
self.searcher = RNACentralSearch(**params)
|
| 61 |
+
elif self.data_source == "interpro":
|
| 62 |
+
from graphgen.models import InterProSearch
|
| 63 |
+
|
| 64 |
+
params = self.kwargs.get("interpro_params", {})
|
| 65 |
+
self.searcher = InterProSearch(**params)
|
| 66 |
else:
|
| 67 |
logger.error(f"Unknown data source: {self.data_source}")
|
| 68 |
|