darkfrostx committed on
Commit
a2b5b6d
·
verified ·
1 Parent(s): 3ed83bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +368 -126
app.py CHANGED
@@ -1,14 +1,24 @@
1
- from fastapi import FastAPI, Query
2
  from fastapi.middleware.cors import CORSMiddleware
3
- from fastapi.responses import RedirectResponse, JSONResponse
4
- import httpx, asyncio, time, hashlib, json, os, math
5
- from typing import Dict, Any, Tuple, Optional, List
 
 
6
 
7
  APP_NAME = "neuro-mechanism-backend"
8
- CALLER_ID = "neuro-mech-backend-demo"
 
 
9
 
10
  app = FastAPI(title=APP_NAME)
11
 
 
 
 
 
 
 
12
  @app.get("/", include_in_schema=False)
13
  def root():
14
  return RedirectResponse(url="/docs")
@@ -21,8 +31,17 @@ def health():
21
  def endpoints():
22
  return JSONResponse({
23
  "GET": [
24
- "/mechanism_graph?receptor=HTR2A&symptom=apathy",
25
- "/heuristics/regions_from_string?receptor=HTR2A&limit=40",
 
 
 
 
 
 
 
 
 
26
  "/lit/eupmc?query=HTR2A%20AND%20apathy&pageSize=5",
27
  "/string/network?identifiers=HTR2A&species=9606",
28
  "/gpcrdb/protein?entry=htr2a_human",
@@ -34,12 +53,6 @@ def endpoints():
34
  ]
35
  })
36
 
37
- app.add_middleware(
38
- CORSMiddleware,
39
- allow_origins=["*"], allow_credentials=True,
40
- allow_methods=["*"], allow_headers=["*"]
41
- )
42
-
43
  UA = {"User-Agent": f"{APP_NAME}/1.2 (HF Space)"}
44
 
45
  # ----------------- tiny in-memory TTL cache -----------------
@@ -62,7 +75,12 @@ class TTLCache:
62
  async with httpx.AsyncClient(headers=UA, timeout=30) as client:
63
  r = await client.get(url, params=params)
64
  r.raise_for_status()
65
- data = r.json()
 
 
 
 
 
66
  async with self._lock:
67
  if len(self.store) > self.max_items:
68
  self.store.pop(next(iter(self.store)))
@@ -71,11 +89,10 @@ class TTLCache:
71
 
72
  CACHE = TTLCache()
73
 
74
- # ----------------- polite throttling for STRING -----------------
75
  _last_string_call = 0.0
76
  async def throttle_string():
77
- """Courtesy throttle ~1 call/sec for STRING API."""
78
- # See STRING API etiquette.
79
  global _last_string_call
80
  now = time.time()
81
  wait = 1.05 - (now - _last_string_call)
@@ -83,14 +100,26 @@ async def throttle_string():
83
  await asyncio.sleep(wait)
84
  _last_string_call = time.time()
85
 
 
86
  async def get_json_cached(url: str, params: Optional[dict], ttl: int):
87
  return await CACHE.get(url, params, ttl)
88
 
89
- # ----------------- basic pass-throughs -----------------
 
 
 
 
 
 
 
 
 
90
  @app.get("/lit/eupmc")
91
- async def europe_pmc_search(query: str, pageSize: int = 5):
 
 
92
  url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
93
- params = {"query": query, "format": "json", "pageSize": pageSize}
94
  return await get_json_cached(url, params, ttl=600)
95
 
96
  @app.get("/lit/pubmed_esearch")
@@ -131,157 +160,370 @@ async def uniprot_search(query: str, size: int = 5):
131
  @app.get("/gpcrdb/protein")
132
  async def gpcrdb_protein(entry: str):
133
  url = f"https://gpcrdb.org/services/protein/{entry}"
134
- try:
135
- return await get_json_cached(url, None, ttl=86400)
136
- except Exception:
137
- # never blow up the aggregator
138
- return {}
139
 
140
  @app.get("/string/network")
141
  async def string_network(identifiers: str, species: int = 9606, limit: int = 50):
 
142
  await throttle_string()
143
  url = "https://string-db.org/api/json/network"
144
  params = {"identifiers": identifiers, "species": species, "caller_identity": CALLER_ID, "limit": limit}
145
- try:
146
- return await get_json_cached(url, params, ttl=3600)
147
- except Exception:
148
- return []
149
-
150
- # ----------------- REGION heuristic (improved) -----------------
151
 
152
- # synonyms to widen recall; add more as needed
153
- REGION_SYNONYMS = {
154
- "prefrontal cortex": ["PFC", "vmPFC", "dlPFC", "ventromedial prefrontal cortex", "dorsolateral prefrontal cortex"],
155
- "anterior cingulate cortex": ["ACC", "dACC", "rACC"],
156
- "nucleus accumbens": ["NAc", "accumbens", "ventral striatum"],
 
 
157
  "ventral tegmental area": ["VTA"],
158
- "substantia nigra": ["SN", "SNc"],
159
- "hippocampus": ["HC"],
160
- "amygdala": [],
161
- "insula": ["insular cortex"],
162
- "thalamus": [],
163
- "hypothalamus": [],
164
- "dorsal striatum": ["caudate", "putamen"],
165
- "cerebellum": []
166
  }
167
 
168
- REGION_TERMS_DEFAULT = list(REGION_SYNONYMS.keys())
169
-
170
- def _quote_if_phrase(s: str) -> str:
171
- s = s.strip()
172
- # phrase? keep quotes; single token? no quotes to broaden match
173
- return f'"{s}"' if (" " in s and not s.startswith('"')) else s
174
-
175
- async def eupmc_hitcount(q: str) -> int:
176
- url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
177
- params = {"query": q, "format": "json", "pageSize": 0}
178
- data = await get_json_cached(url, params, ttl=1800)
179
- return int(data.get("hitCount", 0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  def collect_gene_symbols_from_string(edges: List[dict], focus: str) -> List[str]:
182
  genes = set()
183
  f = focus.upper()
184
- for e in edges:
185
- for k in ("preferredName_A", "preferredName_B"):
186
  g = e.get(k)
187
- if g and g.upper() != f:
188
  genes.add(g)
189
  return list(genes)
190
 
 
 
 
 
 
 
 
 
 
 
191
  @app.get("/heuristics/regions_from_string")
192
  async def regions_from_string(
193
  receptor: str = Query(..., description="e.g., HTR2A"),
194
  species: int = 9606,
195
  limit: int = 40,
196
- regions: Optional[str] = Query(None, description="comma-separated regions; default common set")
 
197
  ):
198
- # 1) pull neighbors
 
 
 
 
 
 
 
 
 
199
  edges = await string_network(receptor, species=species, limit=limit)
200
  neighbors = collect_gene_symbols_from_string(edges, receptor)
201
 
202
- # STRING confidence map
203
  conf: Dict[str, float] = {}
204
- for e in edges:
205
- a, b = e.get("preferredName_A"), e.get("preferredName_B")
206
- score = float(e.get("score", 0) or 0)
207
  if a and a.upper() != receptor.upper():
208
  conf[a] = max(conf.get(a, 0.0), score)
209
  if b and b.upper() != receptor.upper():
210
  conf[b] = max(conf.get(b, 0.0), score)
 
211
 
212
- # region list + synonyms
213
- base_regions = [r.strip() for r in (regions.split(",") if regions else REGION_TERMS_DEFAULT) if r.strip()]
214
- expanded_regions: List[Tuple[str, str]] = []
215
- for base in base_regions:
216
- expanded_regions.append((base, base))
217
- for syn in REGION_SYNONYMS.get(base, []):
218
- expanded_regions.append((base, syn)) # (canonical, synonym)
219
 
220
- # 2) Europe PMC hitCount per (canonical, candidate term)
221
- gene_clause = " OR ".join([receptor] + neighbors[:25]) if neighbors else receptor
222
- tasks = []
223
- queries = []
224
- for canon, term in expanded_regions:
225
- q1 = f'({_quote_if_phrase(term)}) AND ({gene_clause})'
226
- queries.append((canon, term, q1))
227
- tasks.append(eupmc_hitcount(q1))
228
- counts = await asyncio.gather(*tasks)
229
-
230
- # fallback pass for zeros: (region) AND (receptor) only
231
- fallback_tasks = []
232
- fallback_idx = []
233
- for i, ((canon, term, q1), hc) in enumerate(zip(queries, counts)):
234
- if hc == 0:
235
- q2 = f'({_quote_if_phrase(term)}) AND ({receptor})'
236
- fallback_idx.append(i)
237
- fallback_tasks.append(eupmc_hitcount(q2))
238
- if fallback_tasks:
239
- fallback_counts = await asyncio.gather(*fallback_tasks)
240
- for j, idx in enumerate(fallback_idx):
241
- if fallback_counts[j] > 0:
242
- counts[idx] = fallback_counts[j]
243
-
244
- # 3) aggregate by canonical region; weight by mean STRING conf
245
- mean_conf = sum(conf.values()) / max(len(conf), 1) if conf else 0.2
246
- agg: Dict[str, Dict[str, float]] = {}
247
- for (canon, _term, _q), hc in zip(queries, counts):
248
- d = agg.setdefault(canon, {"hits": 0})
249
- d["hits"] += int(hc)
250
 
251
  results = []
252
- for region, d in agg.items():
253
- score = (math.log10(d["hits"] + 1.0)) * mean_conf
254
- results.append({"region": region, "hits": d["hits"], "weighted_score": round(score, 4)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  results.sort(key=lambda x: x["weighted_score"], reverse=True)
257
  return {
258
  "focus": receptor,
259
  "neighbors_considered": neighbors[:25],
260
  "regions_ranked": results,
261
- "notes": "STRING neighbors + EuropePMC co-mentions; synonyms + fallback enabled."
262
  }
263
 
264
- # ----------------- aggregator -----------------
265
- @app.get("/mechanism_graph")
266
- async def mechanism_graph(
267
- receptor: str = Query(..., description="e.g., HTR2A"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  species: int = 9606,
269
- symptom: str = "apathy"
 
 
 
270
  ):
271
- gpcr_entry = f"{receptor.lower()}_human" if not receptor.lower().endswith("_human") else receptor.lower()
272
-
273
- gpcr_task = gpcrdb_protein(entry=gpcr_entry) # safe wrapper above
274
- string_task = string_network(identifiers=receptor, species=species, limit=50)
275
- lit_task = europe_pmc_search(query=f"{receptor} AND {symptom}", pageSize=10)
276
- regions_task = regions_from_string(receptor=receptor, species=species, limit=40, regions=None)
277
-
278
- gpcr_r, string_r, lit_r, regions_r = await asyncio.gather(gpcr_task, string_task, lit_task, regions_task)
279
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  return {
281
- "receptor": receptor,
282
- "gpcrdb": gpcr_r if isinstance(gpcr_r, dict) else {},
283
- "string": string_r if isinstance(string_r, list) else [],
284
- "literature": lit_r if isinstance(lit_r, dict) else {},
285
- "region_scores": regions_r if isinstance(regions_r, dict) else {},
286
- "notes": "Mechanism aggregator with cache + robust region heuristic"
287
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query, Path, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import RedirectResponse, JSONResponse, FileResponse, StreamingResponse
4
+ from typing import Dict, Any, Tuple, Optional, List, Literal
5
+ import httpx, asyncio, time, os, hashlib, json, gzip, math
6
+ from pathlib import Path as _Path
7
+ from datetime import datetime
8
 
9
  APP_NAME = "neuro-mechanism-backend"
10
+ CALLER_ID = "neuro-mech-backend-demo" # appears in STRING logs
11
+ DATA_DIR = _Path("/tmp/neuro_mech_jobs")
12
+ DATA_DIR.mkdir(parents=True, exist_ok=True)
13
 
14
  app = FastAPI(title=APP_NAME)
15
 
16
+ app.add_middleware(
17
+ CORSMiddleware,
18
+ allow_origins=["*"], allow_credentials=True,
19
+ allow_methods=["*"], allow_headers=["*"],
20
+ )
21
+
@app.get("/", include_in_schema=False)
def root():
    """Send requests for the bare root URL to the interactive docs at ``/docs``."""
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect
 
31
  def endpoints():
32
  return JSONResponse({
33
  "GET": [
34
+ "/mechanism_graph_manifest?receptor=HTR2A&symptom=apathy&species=9606",
35
+ "/mechanism_graph/nodes?job_id=<id>&page=1&page_size=200",
36
+ "/mechanism_graph/edges?job_id=<id>&page=1&page_size=200",
37
+ "/mechanism_graph/literature?job_id=<id>&page=1&page_size=50",
38
+ "/mechanism_graph/regions?job_id=<id>&page=1&page_size=50",
39
+ "/download/<job_id>/nodes (gz)",
40
+ "/download/<job_id>/edges (gz)",
41
+ "/download/<job_id>/literature (gz)",
42
+ "/download/<job_id>/regions (gz)",
43
+ "/util/synonyms?term=apathy&kind=phenotype",
44
+ "/heuristics/regions_from_string?receptor=HTR2A&symptom=apathy&limit=40",
45
  "/lit/eupmc?query=HTR2A%20AND%20apathy&pageSize=5",
46
  "/string/network?identifiers=HTR2A&species=9606",
47
  "/gpcrdb/protein?entry=htr2a_human",
 
53
  ]
54
  })
55
 
 
 
 
 
 
 
56
  UA = {"User-Agent": f"{APP_NAME}/1.2 (HF Space)"}
57
 
58
  # ----------------- tiny in-memory TTL cache -----------------
 
75
  async with httpx.AsyncClient(headers=UA, timeout=30) as client:
76
  r = await client.get(url, params=params)
77
  r.raise_for_status()
78
+ # Some third-party APIs return plain text/HTML on error;
79
+ # Fast path: try JSON, else wrap as text.
80
+ try:
81
+ data = r.json()
82
+ except Exception:
83
+ data = {"text": r.text, "status_code": r.status_code}
84
  async with self._lock:
85
  if len(self.store) > self.max_items:
86
  self.store.pop(next(iter(self.store)))
 
89
 
90
  CACHE = TTLCache()
91
 
92
+ # ----------------- polite throttling for STRING ------------------
93
  _last_string_call = 0.0
94
  async def throttle_string():
95
+ """Be nice to STRING; ~1 req/sec as a courtesy."""
 
96
  global _last_string_call
97
  now = time.time()
98
  wait = 1.05 - (now - _last_string_call)
 
100
  await asyncio.sleep(wait)
101
  _last_string_call = time.time()
102
 
103
+ # ----------------- helpers -----------------
async def get_json_cached(url: str, params: Optional[dict], ttl: int):
    """Fetch ``url`` with ``params`` through the module-level ``CACHE``.

    Thin wrapper: forwards the three arguments to ``CACHE.get`` and returns
    whatever that call yields.
    """
    payload = await CACHE.get(url, params, ttl)
    return payload
106
 
def _safe_float(x, default=0.0):
    """Convert ``x`` to a finite float, falling back to ``default``.

    Args:
        x: Any value; typically a JSON-decoded score.
        default: Value returned when ``x`` cannot be converted or converts to
            NaN / +-infinity.

    Returns:
        ``float(x)`` when it is a finite number, otherwise ``default``.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return default
    # NaN/inf would break max()/mean computations downstream and are not
    # representable in strict JSON, so treat them like unparseable input.
    if not math.isfinite(value):
        return default
    return value
112
+
def _hash_params(d: dict) -> str:
    """Return the SHA-1 hex digest of ``d`` serialised as key-sorted JSON.

    Sorting the keys makes the digest independent of dict insertion order.
    """
    canonical = json.dumps(d, sort_keys=True)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
115
+
116
+ # ----------------- base connectors -----------------
@app.get("/lit/eupmc")
async def europe_pmc_search(query: str, pageSize: int = 5, page: int = 1,
                            cursorMark: Optional[str] = None):
    """Proxy a Europe PMC REST search and return the cached JSON response.

    Args:
        query: Europe PMC query string.
        pageSize: Number of results requested per page.
        page: Page number, forwarded as the ``page`` request parameter.
        cursorMark: Optional cursor taken from a previous response's
            ``nextCursorMark``; forwarded only when provided, so existing
            callers send exactly the same request as before.

    Returns:
        Whatever ``get_json_cached`` yields for the request (cached for 600 s
        under the request parameters).
    """
    # Europe PMC REST search (JSON)
    # docs: https://europepmc.org/RestfulWebService ; client vignette: europepmc R pkg
    # NOTE(review): Europe PMC documents cursorMark as its paging mechanism;
    # confirm whether the `page` parameter is still honoured upstream.
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {"query": query, "format": "json", "pageSize": pageSize, "page": page}
    if cursorMark:
        params["cursorMark"] = cursorMark
    return await get_json_cached(url, params, ttl=600)
124
 
125
  @app.get("/lit/pubmed_esearch")
 
@app.get("/gpcrdb/protein")
async def gpcrdb_protein(entry: str):
    """Fetch a GPCRdb protein record by entry name (e.g. ``htr2a_human``).

    Args:
        entry: GPCRdb entry name supplied by the client.

    Returns:
        Whatever ``get_json_cached`` yields for the GPCRdb URL (cached 86400 s).

    Raises:
        HTTPException: 400 when ``entry`` is empty or a dot path segment.
    """
    from urllib.parse import quote  # local import: only this handler needs it

    cleaned = entry.strip()
    # "." / ".." survive percent-encoding and would still be interpreted as
    # path navigation, so reject them together with the empty string.
    if cleaned in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="entry must be a GPCRdb entry name")
    # `entry` is untrusted: encode it so '/', '?', '#' etc. cannot alter the
    # upstream path or inject a query string.
    safe_entry = quote(cleaned, safe="")
    # NOTE(review): GPCRdb's published examples end this path with '/';
    # confirm whether the slash-less form is served without a redirect.
    url = f"https://gpcrdb.org/services/protein/{safe_entry}"
    return await get_json_cached(url, None, ttl=86400)
 
 
 
 
164
 
@app.get("/string/network")
async def string_network(identifiers: str, species: int = 9606, limit: int = 50):
    # STRING JSON network endpoint.
    # Waits on the courtesy throttle first, then performs the cached GET;
    # the result is whatever get_json_cached yields for the request.
    await throttle_string()
    query = {
        "identifiers": identifiers,
        "species": species,
        "caller_identity": CALLER_ID,
        "limit": limit,
    }
    return await get_json_cached("https://string-db.org/api/json/network", query, ttl=3600)
 
 
 
 
 
172
 
173
+ # ----------------- synonym utilities -----------------
174
+ # curated region slang/aliases (additive to OLS)
175
+ CURATED_REGION_SYNONYMS = {
176
+ "prefrontal cortex": ["PFC", "frontal cortex", "dorsolateral prefrontal cortex", "dlPFC",
177
+ "ventromedial prefrontal cortex", "vmPFC", "orbitofrontal cortex", "OFC"],
178
+ "anterior cingulate cortex": ["ACC", "dorsal ACC", "dACC", "rostral ACC", "rACC"],
179
+ "nucleus accumbens": ["NAc", "ventral striatum"],
180
  "ventral tegmental area": ["VTA"],
181
+ "substantia nigra": ["SN", "pars compacta", "SNc"],
182
+ "hippocampus": ["hippocampal formation", "CA1", "CA3", "dentate gyrus"],
183
+ "amygdala": ["basolateral amygdala", "BLA", "central amygdala"]
 
 
 
 
 
184
  }
185
 
async def _ols_synonyms(term: str, ontologies: Optional[List[str]] = None) -> List[str]:
    """Collect synonyms for ``term`` from the top OLS4 search hits.

    Args:
        term: Free-text term to search for.
        ontologies: Optional ontology ids used to restrict the search.

    Returns:
        Sorted, de-duplicated synonym strings from the first five hits; an
        empty list when the response does not have the expected shape.
    """
    # OLS4 search; aggregate synonyms for top hits containing the term
    url = "https://www.ebi.ac.uk/ols4/api/search"
    params = {
        "q": term,
        # NOTE(review): the OLS search docs list a default field set without
        # synonyms, so the field is requested explicitly -- confirm against OLS4.
        "fieldList": "iri,label,short_form,obo_id,ontology_name,ontology_prefix,description,type,synonym",
    }
    if ontologies:
        # Multiple ontologies are passed as one comma-separated value.
        params["ontology"] = ",".join(ontologies)
    data = await get_json_cached(url, params, ttl=86400)

    # Validate the shape explicitly instead of swallowing every exception.
    response = data.get("response") if isinstance(data, dict) else None
    docs = response.get("docs") if isinstance(response, dict) else None
    if not isinstance(docs, list):
        return []

    syns = set()
    for doc in docs[:5]:
        if not isinstance(doc, dict):
            continue
        # Accept both spellings of the field so either response format works.
        for key in ("synonym", "synonyms"):
            values = doc.get(key) or []
            if isinstance(values, str):
                values = [values]
            if isinstance(values, list):
                syns.update(v for v in values if isinstance(v, str))
    return sorted(syns)
205
+
async def _mygene_aliases(symbol: str) -> List[str]:
    """Return the names and aliases MyGene.info reports for a human gene symbol.

    Args:
        symbol: Gene symbol to look up (queried as ``symbol:<symbol>``).

    Returns:
        Sorted, de-duplicated strings gathered from the first hit's
        ``symbol``, ``name``, ``alias``, ``alias_symbol`` and ``other_names``
        fields; an empty list when there is no usable hit.
    """
    # MyGene.info v3; pull aliases/other names for the main focus gene
    url = "https://mygene.info/v3/query"
    params = {"q": f"symbol:{symbol}", "fields": "symbol,name,alias,alias_symbol,other_names", "size": 1, "species": "human"}
    data = await get_json_cached(url, params, ttl=86400)

    # Validate the shape explicitly instead of swallowing every exception.
    hits = data.get("hits") if isinstance(data, dict) else None
    if not isinstance(hits, list) or not hits or not isinstance(hits[0], dict):
        return []

    top = hits[0]
    syns = set()
    for fld in ("symbol", "name", "alias", "alias_symbol", "other_names"):
        value = top.get(fld)
        # A field may hold a single string or a list of strings (a lone alias
        # arrives as a scalar); accept both so single aliases are not dropped.
        if isinstance(value, str):
            syns.add(value)
        elif isinstance(value, list):
            syns.update(x for x in value if isinstance(x, str))
    return sorted(syns)
229
+
@app.get("/util/synonyms")
async def util_synonyms(term: str, kind: Literal["region","gene","phenotype","auto"]="auto"):
    """
    Fetch synonyms for a term.
    region: curated aliases + OLS4 (uberon, hbp, hpo, ncit)
    gene: MyGene.info aliases
    phenotype: OLS4 (hpo, efo, mondo)
    auto: gene if the term is all upper-case, else phenotype; when the
          phenotype lookup adds nothing, fall back to the region lookup.

    Returns {"term", "kind", "synonyms"} where "kind" is the lookup that was
    actually used and "synonyms" is a sorted list (including the term itself)
    of strings no longer than 60 characters.
    """
    async def _region_synonyms() -> set:
        # Curated aliases are keyed by lower-cased canonical region name.
        found = set(CURATED_REGION_SYNONYMS.get(term.lower(), []))
        found.update(await _ols_synonyms(term, ontologies=["uberon","hbp","hpo","ncit"]))
        return found

    auto = kind == "auto"
    k = kind
    if auto:
        k = "gene" if term.isupper() else "phenotype"
    syns = {term}

    if k == "region":
        syns.update(await _region_synonyms())
    elif k == "gene":
        syns.update(await _mygene_aliases(term))
    elif k == "phenotype":
        syns.update(await _ols_synonyms(term, ontologies=["hpo","efo","mondo"]))
        # Documented auto behaviour: phenotype -> region fallback. Only
        # switch kind when the region lookup actually contributes something.
        if auto and syns == {term}:
            region_syns = await _region_synonyms()
            if region_syns - {term}:
                syns.update(region_syns)
                k = "region"

    return {"term": term, "kind": k, "synonyms": sorted({s for s in syns if isinstance(s, str) and len(s) <= 60})}
253
+
254
+ # ----------------- region heuristic (upgraded) -----------------
255
+ REGION_TERMS_DEFAULT = [
256
+ "prefrontal cortex","anterior cingulate cortex","mPFC","ACC","nucleus accumbens","ventral striatum",
257
+ "dorsal striatum","caudate","putamen","amygdala","hippocampus","thalamus","hypothalamus",
258
+ "insula","ventral tegmental area","VTA","substantia nigra","cerebellum"
259
+ ]
260
 
def collect_gene_symbols_from_string(edges: List[dict], focus: str) -> List[str]:
    """Return the distinct partner gene symbols named in STRING ``edges``.

    Args:
        edges: STRING network rows; each row is expected to be a dict with
            ``preferredName_A`` / ``preferredName_B``. ``None``, an error
            payload dict, or non-dict rows are tolerated and ignored.
        focus: Symbol of the query gene; excluded case-insensitively.

    Returns:
        Symbols in first-seen order, so slicing the result (e.g. the first
        25 neighbours) is deterministic across runs.
    """
    # An upstream error payload is a dict, not a list of edge rows.
    if not edges or isinstance(edges, dict):
        return []
    f = focus.upper()
    seen = {}  # dict used as an insertion-ordered set
    for e in edges:
        if not isinstance(e, dict):
            continue
        for k in ("preferredName_A", "preferredName_B"):
            g = e.get(k)
            if g and isinstance(g, str) and g.upper() != f:
                seen.setdefault(g, None)
    return list(seen)
270
 
async def _eupmc_hitcount(q: str) -> int:
    """Return the Europe PMC ``hitCount`` for query ``q``.

    The search is issued with ``pageSize=0`` and cached for 3600 s. Any
    failure while reading ``hitCount`` from the response yields 0.
    """
    # Europe PMC search hitCount (pageSize=0)
    data = await get_json_cached(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        {"query": q, "format": "json", "pageSize": 0},
        ttl=3600,
    )
    try:
        hits = int(data.get("hitCount", 0))
    except Exception:
        hits = 0
    return hits
280
+
@app.get("/heuristics/regions_from_string")
async def regions_from_string(
    receptor: str = Query(..., description="e.g., HTR2A"),
    species: int = 9606,
    limit: int = 40,
    regions: Optional[str] = Query(None, description="comma-separated region terms (optional)"),
    symptom: Optional[str] = Query(None, description="optional phenotype/symptom to weight co-mentions (e.g., apathy)")
):
    """
    Heuristic: rank brain regions by STRING neighbors + Europe PMC co-mentions, with synonyms & tiered fallbacks.
    Tiers (all unquoted for flexible match):
    T1: (region_syns) AND ((receptor_syns) OR neighbors) AND (symptom_syns?) weight 1.0
    T2: (region_syns) AND (receptor_syns OR neighbors) weight 0.6
    T3: (region_syns) AND (receptor_syns) weight 0.5
    T4: (region_syns) AND (symptom_syns) weight 0.3
    Final score = log10(weighted_hits+1) * mean_top_STRING_conf

    Returns a dict with "focus", "neighbors_considered" (first 25 neighbors),
    "regions_ranked" (sorted by "weighted_score", descending) and "notes".
    """
    # 1) STRING neighbors
    edges = await string_network(receptor, species=species, limit=limit)
    # An upstream error payload may be a dict or contain non-dict rows; keep
    # only well-formed edge rows so the loops below cannot crash on them.
    edges = [e for e in edges if isinstance(e, dict)] if isinstance(edges, list) else []
    neighbors = collect_gene_symbols_from_string(edges, receptor)

    # STRING confidences: best score seen per neighbor symbol
    focus = receptor.upper()
    conf: Dict[str, float] = {}
    for e in edges:
        score = _safe_float(e.get("score", 0))
        for key in ("preferredName_A", "preferredName_B"):
            name = e.get(key)
            if name and isinstance(name, str) and name.upper() != focus:
                conf[name] = max(conf.get(name, 0.0), score)
    # 0.2 is the fallback confidence when STRING returned no neighbors.
    mean_conf = sum(conf.values()) / len(conf) if conf else 0.2

    # 2) synonyms
    receptor_syns = await _mygene_aliases(receptor)
    symptom_syns = []
    if symptom:
        s = await util_synonyms(symptom, kind="phenotype")
        symptom_syns = s["synonyms"]

    region_list = [r.strip() for r in (regions.split(",") if regions else REGION_TERMS_DEFAULT) if r.strip()]
    # Build clauses (unquoted OR lists)
    gene_clause = " OR ".join(sorted({receptor} | set(receptor_syns) | set(neighbors[:25])))
    receptor_clause = " OR ".join(sorted(set([receptor] + receptor_syns)))
    symptom_clause = " OR ".join(symptom_syns)

    # Identical query strings (e.g. T1 == T2 when no symptom is given) are
    # fetched once; each tier stores the index of its query in `query_index`.
    query_index: Dict[str, int] = {}
    tier_defs = []
    for region in region_list:
        # region synonyms
        rs = await util_synonyms(region, kind="region")
        region_clause = " OR ".join(rs["synonyms"])

        t2 = f"({region_clause}) AND (({gene_clause}))"
        t1 = f"{t2} AND ({symptom_clause})" if symptom_syns else t2
        t3 = f"({region_clause}) AND ({receptor_clause})"
        tiers = [("t1", 1.0, t1), ("t2", 0.6, t2), ("t3", 0.5, t3)]
        if symptom_syns:
            tiers.append(("t4", 0.3, f"({region_clause}) AND ({symptom_clause})"))

        tier_defs.append((
            region,
            [(name, weight, query_index.setdefault(q, len(query_index))) for name, weight, q in tiers],
        ))

    # gather all counts; dict order == assigned index order
    counts_all = await asyncio.gather(*(_eupmc_hitcount(q) for q in query_index))

    # fold back into regions
    results = []
    for region, tiers in tier_defs:
        weighted = 0.0
        tier_counts = {}
        for name, weight, qi in tiers:
            hc = counts_all[qi]
            tier_counts[name] = hc
            weighted += weight * hc
        score = math.log10(weighted + 1.0) * mean_conf
        results.append({"region": region, "tiers": tier_counts, "weighted_hits": int(round(weighted)),
                        "weighted_score": round(score, 4)})

    results.sort(key=lambda x: x["weighted_score"], reverse=True)
    return {
        "focus": receptor,
        "neighbors_considered": neighbors[:25],
        "regions_ranked": results,
        "notes": "STRING + Europe PMC with synonyms and tiered fallbacks (unquoted)."
    }
372
 
373
+ # ----------------- MANIFEST + PAGED SECTIONS + DOWNLOAD -----------------
def _job_dir(job_id: str) -> _Path:
    """Return (creating it if needed) the directory holding a job's files.

    Args:
        job_id: Job identifier; must be a single path component.

    Returns:
        ``DATA_DIR / job_id``.

    Raises:
        ValueError: if ``job_id`` is empty, ``.``/``..``, or contains a path
            separator -- i.e. anything that would resolve outside a direct
            child of ``DATA_DIR``.
    """
    d = DATA_DIR / job_id
    # job_id can come straight from a request; refuse values such as
    # "../x" or "/etc" so no directory is created outside DATA_DIR.
    if job_id in ("", ".", "..") or d.name != job_id or d.parent != DATA_DIR:
        raise ValueError(f"invalid job_id: {job_id!r}")
    d.mkdir(parents=True, exist_ok=True)
    return d
378
+
def _write_gz_jsonl(path: _Path, items: List[dict]):
    """Write ``items`` to ``path`` as gzip-compressed, UTF-8 NDJSON.

    Each item becomes one JSON line; non-ASCII characters are written as-is.
    """
    with gzip.open(path, "wt", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )
383
+
def _read_gz_page(path: _Path, page: int, page_size: int) -> Tuple[int, List[dict]]:
    """Read one page of records from a gzip-compressed NDJSON file.

    Args:
        path: Location of the ``.jsonl.gz`` file.
        page: 1-based page number.
        page_size: Number of records per page.

    Returns:
        ``(total, items)`` where ``total`` is the number of non-blank lines in
        the file and ``items`` are the decoded records of the requested page.
        A page outside the data (or ``page``/``page_size`` below 1) yields an
        empty ``items`` list.
    """
    start = (page - 1) * page_size
    end = start + page_size
    total = 0
    out = []
    with gzip.open(path, "rt", encoding="utf-8") as gz:
        for line in gz:
            if not line.strip():
                continue
            # Window on the record index (blank lines excluded) so that the
            # pages line up with `total` and no record is skipped.
            if start <= total < end:
                out.append(json.loads(line))
            total += 1
    return total, out
397
+
async def _build_mech_job(params: dict) -> dict:
    """
    Build nodes/edges/literature/regions; write gz NDJSON + meta.

    Args:
        params: Job parameters. ``receptor`` is required; ``species``,
            ``symptom``, ``string_limit``, ``eupmc_page_size`` and
            ``eupmc_max_pages`` are optional.

    Returns:
        The job manifest (``job_id``, ``created``, ``params``, ``counts``,
        ``sections``). If ``meta.json`` already exists for the same
        parameters, its stored content is returned without rebuilding.
    """
    receptor = params["receptor"]
    species = int(params.get("species", 9606))
    symptom = params.get("symptom")
    string_limit = int(params.get("string_limit", 200))
    eupmc_page_size = int(params.get("eupmc_page_size", 100))
    eupmc_max_pages = int(params.get("eupmc_max_pages", 3))

    job_id = _hash_params(params)
    d = _job_dir(job_id)
    meta_path = d / "meta.json"
    # meta.json is written last, so its presence marks a completed job.
    if meta_path.exists():
        return json.loads(meta_path.read_text("utf-8"))

    # 1) STRING edges + nodes
    edges = await string_network(receptor, species=species, limit=string_limit)
    if not isinstance(edges, list):
        # e.g. an upstream error payload (dict) instead of a list of rows
        edges = []
    edge_items = []
    nodes = {receptor}
    for e in edges:
        if not isinstance(e, dict):
            continue
        a = e.get("preferredName_A")
        b = e.get("preferredName_B")
        score = _safe_float(e.get("score", 0))
        # Require string symbols: nodes are sorted and upper-cased below.
        if a and b and isinstance(a, str) and isinstance(b, str):
            edge_items.append({"a": a, "b": b, "score": score})
            nodes.add(a)
            nodes.add(b)
    seed_upper = receptor.upper()
    node_items = [{"symbol": n, "seed": (n.upper() == seed_upper)} for n in sorted(nodes)]

    _write_gz_jsonl(d / "edges.jsonl.gz", edge_items)
    _write_gz_jsonl(d / "nodes.jsonl.gz", node_items)

    # 2) Europe PMC literature for (receptor AND symptom?) else receptor
    lit_items = []
    seen_ids = set()
    base_q = f"{receptor} AND {symptom}" if symptom else receptor
    for page in range(1, eupmc_max_pages + 1):
        res = await europe_pmc_search(base_q, pageSize=eupmc_page_size, page=page)
        result_list = res.get("resultList") if isinstance(res, dict) else None
        hits = (result_list.get("result") if isinstance(result_list, dict) else None) or []
        new_on_page = 0
        for h in hits:
            if not isinstance(h, dict):
                continue
            rec_id = h.get("id")
            if rec_id is not None:
                # Skip records already collected, in case the upstream
                # returns the same page for successive `page` values.
                key = (h.get("source"), rec_id)
                if key in seen_ids:
                    continue
                seen_ids.add(key)
            new_on_page += 1
            lit_items.append({
                "id": rec_id,
                "source": h.get("source"), "title": h.get("title"),
                "pubYear": h.get("pubYear"), "authorString": h.get("authorString"),
                "journalTitle": h.get("journalTitle"), "doi": h.get("doi")
            })
        # stop early if last page, or if the page contributed nothing new
        if len(hits) < eupmc_page_size or new_on_page == 0:
            break
    _write_gz_jsonl(d / "literature.jsonl.gz", lit_items)

    # 3) Regions heuristic (with symptom)
    reg = await regions_from_string(receptor=receptor, species=species, limit=min(100, string_limit), regions=None, symptom=symptom)
    reg_items = list(reg.get("regions_ranked", []))
    _write_gz_jsonl(d / "regions.jsonl.gz", reg_items)

    meta = {
        "job_id": job_id,
        "created": datetime.utcnow().isoformat() + "Z",
        "params": params,
        "counts": {
            "nodes": len(node_items),
            "edges": len(edge_items),
            "literature": len(lit_items),
            "regions": len(reg_items)
        },
        "sections": ["nodes","edges","literature","regions"]
    }
    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    return meta
469
+
@app.get("/mechanism_graph_manifest")
async def mechanism_graph_manifest(
    receptor: str = Query(...),
    species: int = 9606,
    symptom: Optional[str] = None,
    string_limit: int = 200,
    eupmc_page_size: int = 100,
    eupmc_max_pages: int = 3
):
    """
    Build the full mechanism dataset server-side and return a manifest with job_id + counts.
    The actual data is stored as gzipped NDJSON and can be:
    - paged via /mechanism_graph/{section}?job_id=...&page=1&page_size=...
    - or downloaded as a single gz file via /download/{job_id}/{section}
    """
    # Bundle the request arguments (same key order as before) and delegate
    # to the job builder, whose manifest is returned unchanged.
    job_params = dict(
        receptor=receptor,
        species=species,
        symptom=symptom,
        string_limit=string_limit,
        eupmc_page_size=eupmc_page_size,
        eupmc_max_pages=eupmc_max_pages,
    )
    return await _build_mech_job(job_params)
491
+
@app.get("/mechanism_graph/{section}")
async def mechanism_graph_section(
    section: Literal["nodes","edges","literature","regions"] = Path(...),
    job_id: str = Query(...),
    page: int = 1,
    page_size: int = 100
):
    """
    Return a single page from a section (nodes|edges|literature|regions).

    Responds 404 when the job or section does not exist (including a
    malformed job_id) and 400 when page or page_size is below 1.
    """
    # Job ids are SHA-1 hex digests (see _hash_params). Rejecting anything
    # else keeps a crafted job_id such as "../x" from escaping DATA_DIR.
    if len(job_id) != 40 or any(ch not in "0123456789abcdef" for ch in job_id):
        raise HTTPException(status_code=404, detail="unknown job_id")
    if page < 1 or page_size < 1:
        raise HTTPException(status_code=400, detail="page and page_size must be >= 1")

    # Build the path directly: a lookup must not create directories.
    p = DATA_DIR / job_id / f"{section}.jsonl.gz"
    if not p.exists():
        raise HTTPException(status_code=404, detail=f"section {section} not found for job {job_id}")

    total, items = _read_gz_page(p, page=page, page_size=page_size)
    return {
        "job_id": job_id,
        "section": section,
        "page": page, "page_size": page_size,
        "total": total,
        "items": items
    }
515
+
@app.get("/download/{job_id}/{section}")
async def download_section(job_id: str, section: Literal["nodes","edges","literature","regions"]):
    """
    Download the full gzipped NDJSON for a section.

    Responds 404 when the job or section does not exist (including a
    malformed job_id).
    """
    # Job ids are SHA-1 hex digests (see _hash_params). Rejecting anything
    # else keeps a crafted job_id from pointing outside DATA_DIR.
    if len(job_id) != 40 or any(ch not in "0123456789abcdef" for ch in job_id):
        raise HTTPException(status_code=404, detail="unknown job_id")

    # Build the path directly: a download must not create directories.
    p = DATA_DIR / job_id / f"{section}.jsonl.gz"
    if not p.exists():
        raise HTTPException(status_code=404, detail=f"section {section} not found for job {job_id}")
    return FileResponse(
        path=str(p),
        filename=f"{APP_NAME}-{job_id}-{section}.jsonl.gz",
        media_type="application/gzip"
    )