github-actions[bot] committed on
Commit
c91a105
·
1 Parent(s): cebee97

Auto-sync from demo at Tue Feb 10 05:21:56 UTC 2026

Browse files
graphgen/models/__init__.py CHANGED
@@ -42,6 +42,7 @@ if TYPE_CHECKING:
42
  TXTReader,
43
  )
44
  from .rephraser import StyleControlledRephraser
 
45
  from .searcher.db.ncbi_searcher import NCBISearch
46
  from .searcher.db.rnacentral_searcher import RNACentralSearch
47
  from .searcher.db.uniprot_searcher import UniProtSearch
@@ -95,6 +96,7 @@ _import_map = {
95
  "TXTReader": ".reader",
96
  "HuggingFaceReader": ".reader",
97
  # Searcher
 
98
  "NCBISearch": ".searcher.db.ncbi_searcher",
99
  "RNACentralSearch": ".searcher.db.rnacentral_searcher",
100
  "UniProtSearch": ".searcher.db.uniprot_searcher",
 
42
  TXTReader,
43
  )
44
  from .rephraser import StyleControlledRephraser
45
+ from .searcher.db.interpro_searcher import InterProSearch
46
  from .searcher.db.ncbi_searcher import NCBISearch
47
  from .searcher.db.rnacentral_searcher import RNACentralSearch
48
  from .searcher.db.uniprot_searcher import UniProtSearch
 
96
  "TXTReader": ".reader",
97
  "HuggingFaceReader": ".reader",
98
  # Searcher
99
+ "InterProSearch": ".searcher.db.interpro_searcher",
100
  "NCBISearch": ".searcher.db.ncbi_searcher",
101
  "RNACentralSearch": ".searcher.db.rnacentral_searcher",
102
  "UniProtSearch": ".searcher.db.uniprot_searcher",
graphgen/models/searcher/db/interpro_searcher.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, Optional
3
+
4
+ import requests
5
+ from requests.exceptions import RequestException
6
+ from tenacity import (
7
+ retry,
8
+ retry_if_exception_type,
9
+ stop_after_attempt,
10
+ wait_exponential,
11
+ )
12
+
13
+ from graphgen.bases import BaseSearcher
14
+ from graphgen.utils import logger
15
+
16
+
17
class InterProSearch(BaseSearcher):
    """
    InterPro Search client to search protein domains and functional annotations.

    Supports:
    1) Get protein domain information by UniProt accession number.

    API Documentation: https://www.ebi.ac.uk/interpro/api/
    """

    # Root of the InterPro REST API. Kept at class level because it is a
    # constant; it is still reachable as ``self.BASE_URL`` on every instance.
    BASE_URL = "https://www.ebi.ac.uk/interpro/api"

    def __init__(
        self,
        api_timeout: int = 30,
    ):
        """
        Initialize the InterPro Search client.

        Args:
            api_timeout (int): Request timeout in seconds, passed to every
                ``requests.get`` call made by this client.
        """
        self.api_timeout = api_timeout

    @staticmethod
    def _is_uniprot_accession(text: str) -> bool:
        """Check if text looks like a UniProt accession number.

        The check is deliberately loose: after stripping surrounding
        whitespace, the text must be one letter followed by 5-9 letters or
        digits (6-10 characters in total), case-insensitively
        (e.g., P01308, Q96KN2).
        """
        return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I))

    def _get_json(self, url: str, failure_message: str, subject: str):
        """
        GET ``url`` and return the decoded JSON body.

        Args:
            url (str): Fully built request URL.
            failure_message (str): ``%``-style log template taking the
                subject and the HTTP status code, used for non-200 replies.
            subject (str): Accession the request is about (for logging only).

        Returns:
            The decoded JSON payload, or None when the HTTP status is not 200.

        Raises:
            requests.exceptions.RequestException: Propagated from
                ``requests.get`` so that the retry policy on ``search`` can
                react to it.
        """
        response = requests.get(url, timeout=self.api_timeout)
        if response.status_code != 200:
            logger.warning(failure_message, subject, response.status_code)
            return None
        return response.json()

    def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
        """
        Search InterPro database by UniProt accession number.

        This method queries the EBI API for the InterPro entries attached to
        a UniProt entry, then fetches the details of each entry found.

        Args:
            accession (str): UniProt accession number.

        Returns:
            Dictionary with keys ``molecule_type``, ``database``, ``id``,
            ``content`` (the list of entries from the response, each
            optionally enriched with ``entry_details``) and ``url``; or None
            if the accession is invalid or the lookup did not return HTTP 200.
        """
        # The isinstance check also rejects None; empty and whitespace-only
        # strings are rejected by the accession pattern itself.
        if not isinstance(accession, str) or not self._is_uniprot_accession(accession):
            logger.error("Invalid accession provided")
            return None

        accession = accession.strip().upper()

        # Query InterPro REST API for UniProt entry
        url = f"{self.BASE_URL}/entry/interpro/protein/uniprot/{accession}/"
        data = self._get_json(
            url,
            "Failed to search InterPro for accession %s: %d",
            accession,
        )
        if data is None:
            return None
        if not isinstance(data, dict):
            # Guard against a payload shape we cannot read ``results`` from.
            logger.warning("Unexpected InterPro payload for accession %s", accession)
            return None

        # ``or []`` / ``or {}`` also cover keys that are present but null.
        # NOTE(review): only the "results" of this single response are read;
        # if the API paginates, later pages are not fetched - confirm against
        # the API documentation.
        entries = data.get("results") or []

        # Get entry details for each InterPro entry found
        for entry in entries:
            metadata = entry.get("metadata") or {}
            interpro_acc = metadata.get("accession")
            if not interpro_acc:
                continue
            entry_details = self.get_entry_details(interpro_acc)
            if entry_details:
                entry["entry_details"] = entry_details

        return {
            "molecule_type": "protein",
            "database": "InterPro",
            "id": accession,
            "content": entries,
            "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/",
        }

    def get_entry_details(self, interpro_accession: str) -> Optional[Dict]:
        """
        Get detailed information for a specific InterPro entry.

        Args:
            interpro_accession (str): InterPro accession number (e.g., IPR000001).

        Returns:
            Dictionary with entry details or None if the accession is empty,
            not a string, or the request did not return HTTP 200.
        """
        if not interpro_accession or not isinstance(interpro_accession, str):
            return None

        url = f"{self.BASE_URL}/entry/interpro/{interpro_accession}/"
        return self._get_json(
            url,
            "Failed to get InterPro entry %s: %d",
            interpro_accession,
        )

    # Up to 3 attempts on RequestException, waiting between 2 and 5 seconds
    # between attempts; the last exception is re-raised. Because the policy
    # wraps the whole search, a RequestException raised by any request
    # (including a per-entry detail fetch) restarts the full lookup.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=5),
        retry=retry_if_exception_type(RequestException),
        reraise=True,
    )
    def search(self, query: str, **kwargs) -> Optional[Dict]:
        """
        Search InterPro for protein domain information by UniProt accession.

        Args:
            query (str): UniProt accession number (e.g., P01308, Q96KN2).
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary with domain information (plus a ``_search_query`` key
            holding the stripped query) or None if not found.

        Raises:
            requests.exceptions.RequestException: If a request still fails
                after the final retry attempt.
        """
        if not query or not isinstance(query, str):
            logger.error("Empty or non-string input")
            return None

        query = query.strip()
        logger.debug("InterPro search query: %s", query[:100])

        # Search by UniProt ID
        logger.debug("Searching for UniProt accession: %s", query)
        result = self.search_by_uniprot_id(query)

        if result:
            result["_search_query"] = query

        return result
graphgen/operators/search/search_service.py CHANGED
@@ -58,6 +58,11 @@ class SearchService(BaseOperator):
58
 
59
  params = self.kwargs.get("rnacentral_params", {})
60
  self.searcher = RNACentralSearch(**params)
 
 
 
 
 
61
  else:
62
  logger.error(f"Unknown data source: {self.data_source}")
63
 
 
58
 
59
  params = self.kwargs.get("rnacentral_params", {})
60
  self.searcher = RNACentralSearch(**params)
61
+ elif self.data_source == "interpro":
62
+ from graphgen.models import InterProSearch
63
+
64
+ params = self.kwargs.get("interpro_params", {})
65
+ self.searcher = InterProSearch(**params)
66
  else:
67
  logger.error(f"Unknown data source: {self.data_source}")
68