Update app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,24 @@
|
|
| 1 |
-
from fastapi import FastAPI, Query
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
from fastapi.responses import RedirectResponse, JSONResponse
|
| 4 |
-
import
|
| 5 |
-
|
|
|
|
|
|
|
| 6 |
|
| 7 |
APP_NAME = "neuro-mechanism-backend"
|
| 8 |
-
CALLER_ID = "neuro-mech-backend-demo"
|
|
|
|
|
|
|
| 9 |
|
| 10 |
app = FastAPI(title=APP_NAME)
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
@app.get("/", include_in_schema=False)
|
| 13 |
def root():
|
| 14 |
return RedirectResponse(url="/docs")
|
|
@@ -21,8 +31,17 @@ def health():
|
|
| 21 |
def endpoints():
|
| 22 |
return JSONResponse({
|
| 23 |
"GET": [
|
| 24 |
-
"/
|
| 25 |
-
"/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"/lit/eupmc?query=HTR2A%20AND%20apathy&pageSize=5",
|
| 27 |
"/string/network?identifiers=HTR2A&species=9606",
|
| 28 |
"/gpcrdb/protein?entry=htr2a_human",
|
|
@@ -34,12 +53,6 @@ def endpoints():
|
|
| 34 |
]
|
| 35 |
})
|
| 36 |
|
| 37 |
-
app.add_middleware(
|
| 38 |
-
CORSMiddleware,
|
| 39 |
-
allow_origins=["*"], allow_credentials=True,
|
| 40 |
-
allow_methods=["*"], allow_headers=["*"]
|
| 41 |
-
)
|
| 42 |
-
|
| 43 |
UA = {"User-Agent": f"{APP_NAME}/1.2 (HF Space)"}
|
| 44 |
|
| 45 |
# ----------------- tiny in-memory TTL cache -----------------
|
|
@@ -62,7 +75,12 @@ class TTLCache:
|
|
| 62 |
async with httpx.AsyncClient(headers=UA, timeout=30) as client:
|
| 63 |
r = await client.get(url, params=params)
|
| 64 |
r.raise_for_status()
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
async with self._lock:
|
| 67 |
if len(self.store) > self.max_items:
|
| 68 |
self.store.pop(next(iter(self.store)))
|
|
@@ -71,11 +89,10 @@ class TTLCache:
|
|
| 71 |
|
| 72 |
CACHE = TTLCache()
|
| 73 |
|
| 74 |
-
# ----------------- polite throttling for STRING
|
| 75 |
_last_string_call = 0.0
|
| 76 |
async def throttle_string():
|
| 77 |
-
"""
|
| 78 |
-
# See STRING API etiquette.
|
| 79 |
global _last_string_call
|
| 80 |
now = time.time()
|
| 81 |
wait = 1.05 - (now - _last_string_call)
|
|
@@ -83,14 +100,26 @@ async def throttle_string():
|
|
| 83 |
await asyncio.sleep(wait)
|
| 84 |
_last_string_call = time.time()
|
| 85 |
|
|
|
|
| 86 |
async def get_json_cached(url: str, params: Optional[dict], ttl: int):
|
| 87 |
return await CACHE.get(url, params, ttl)
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
@app.get("/lit/eupmc")
|
| 91 |
-
async def europe_pmc_search(query: str, pageSize: int = 5):
|
|
|
|
|
|
|
| 92 |
url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
|
| 93 |
-
params = {"query": query, "format": "json", "pageSize": pageSize}
|
| 94 |
return await get_json_cached(url, params, ttl=600)
|
| 95 |
|
| 96 |
@app.get("/lit/pubmed_esearch")
|
|
@@ -131,157 +160,370 @@ async def uniprot_search(query: str, size: int = 5):
|
|
| 131 |
@app.get("/gpcrdb/protein")
|
| 132 |
async def gpcrdb_protein(entry: str):
|
| 133 |
url = f"https://gpcrdb.org/services/protein/{entry}"
|
| 134 |
-
|
| 135 |
-
return await get_json_cached(url, None, ttl=86400)
|
| 136 |
-
except Exception:
|
| 137 |
-
# never blow up the aggregator
|
| 138 |
-
return {}
|
| 139 |
|
| 140 |
@app.get("/string/network")
|
| 141 |
async def string_network(identifiers: str, species: int = 9606, limit: int = 50):
|
|
|
|
| 142 |
await throttle_string()
|
| 143 |
url = "https://string-db.org/api/json/network"
|
| 144 |
params = {"identifiers": identifiers, "species": species, "caller_identity": CALLER_ID, "limit": limit}
|
| 145 |
-
|
| 146 |
-
return await get_json_cached(url, params, ttl=3600)
|
| 147 |
-
except Exception:
|
| 148 |
-
return []
|
| 149 |
-
|
| 150 |
-
# ----------------- REGION heuristic (improved) -----------------
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
"
|
| 156 |
-
|
|
|
|
|
|
|
| 157 |
"ventral tegmental area": ["VTA"],
|
| 158 |
-
"substantia nigra": ["SN", "SNc"],
|
| 159 |
-
"hippocampus": ["
|
| 160 |
-
"amygdala": []
|
| 161 |
-
"insula": ["insular cortex"],
|
| 162 |
-
"thalamus": [],
|
| 163 |
-
"hypothalamus": [],
|
| 164 |
-
"dorsal striatum": ["caudate", "putamen"],
|
| 165 |
-
"cerebellum": []
|
| 166 |
}
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
url =
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
def collect_gene_symbols_from_string(edges: List[dict], focus: str) -> List[str]:
|
| 182 |
genes = set()
|
| 183 |
f = focus.upper()
|
| 184 |
-
for e in edges:
|
| 185 |
-
for k in ("preferredName_A",
|
| 186 |
g = e.get(k)
|
| 187 |
-
if g and g.upper() != f:
|
| 188 |
genes.add(g)
|
| 189 |
return list(genes)
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
@app.get("/heuristics/regions_from_string")
|
| 192 |
async def regions_from_string(
|
| 193 |
receptor: str = Query(..., description="e.g., HTR2A"),
|
| 194 |
species: int = 9606,
|
| 195 |
limit: int = 40,
|
| 196 |
-
regions: Optional[str] = Query(None, description="comma-separated
|
|
|
|
| 197 |
):
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
edges = await string_network(receptor, species=species, limit=limit)
|
| 200 |
neighbors = collect_gene_symbols_from_string(edges, receptor)
|
| 201 |
|
| 202 |
-
# STRING
|
| 203 |
conf: Dict[str, float] = {}
|
| 204 |
-
for e in edges:
|
| 205 |
-
a, b = e.get("preferredName_A"), e.get("preferredName_B")
|
| 206 |
-
score = float(e.get("score", 0) or 0)
|
| 207 |
if a and a.upper() != receptor.upper():
|
| 208 |
conf[a] = max(conf.get(a, 0.0), score)
|
| 209 |
if b and b.upper() != receptor.upper():
|
| 210 |
conf[b] = max(conf.get(b, 0.0), score)
|
|
|
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
expanded_regions.append((base, syn)) # (canonical, synonym)
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
queries = []
|
| 224 |
-
for canon, term in expanded_regions:
|
| 225 |
-
q1 = f'({_quote_if_phrase(term)}) AND ({gene_clause})'
|
| 226 |
-
queries.append((canon, term, q1))
|
| 227 |
-
tasks.append(eupmc_hitcount(q1))
|
| 228 |
-
counts = await asyncio.gather(*tasks)
|
| 229 |
-
|
| 230 |
-
# fallback pass for zeros: (region) AND (receptor) only
|
| 231 |
-
fallback_tasks = []
|
| 232 |
-
fallback_idx = []
|
| 233 |
-
for i, ((canon, term, q1), hc) in enumerate(zip(queries, counts)):
|
| 234 |
-
if hc == 0:
|
| 235 |
-
q2 = f'({_quote_if_phrase(term)}) AND ({receptor})'
|
| 236 |
-
fallback_idx.append(i)
|
| 237 |
-
fallback_tasks.append(eupmc_hitcount(q2))
|
| 238 |
-
if fallback_tasks:
|
| 239 |
-
fallback_counts = await asyncio.gather(*fallback_tasks)
|
| 240 |
-
for j, idx in enumerate(fallback_idx):
|
| 241 |
-
if fallback_counts[j] > 0:
|
| 242 |
-
counts[idx] = fallback_counts[j]
|
| 243 |
-
|
| 244 |
-
# 3) aggregate by canonical region; weight by mean STRING conf
|
| 245 |
-
mean_conf = sum(conf.values()) / max(len(conf), 1) if conf else 0.2
|
| 246 |
-
agg: Dict[str, Dict[str, float]] = {}
|
| 247 |
-
for (canon, _term, _q), hc in zip(queries, counts):
|
| 248 |
-
d = agg.setdefault(canon, {"hits": 0})
|
| 249 |
-
d["hits"] += int(hc)
|
| 250 |
|
| 251 |
results = []
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
results.sort(key=lambda x: x["weighted_score"], reverse=True)
|
| 257 |
return {
|
| 258 |
"focus": receptor,
|
| 259 |
"neighbors_considered": neighbors[:25],
|
| 260 |
"regions_ranked": results,
|
| 261 |
-
"notes": "STRING
|
| 262 |
}
|
| 263 |
|
| 264 |
-
# -----------------
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
species: int = 9606,
|
| 269 |
-
symptom: str =
|
|
|
|
|
|
|
|
|
|
| 270 |
):
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
return {
|
| 281 |
-
"
|
| 282 |
-
"
|
| 283 |
-
"
|
| 284 |
-
"
|
| 285 |
-
"
|
| 286 |
-
"notes": "Mechanism aggregator with cache + robust region heuristic"
|
| 287 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Query, Path, HTTPException
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from fastapi.responses import RedirectResponse, JSONResponse, FileResponse, StreamingResponse
|
| 4 |
+
from typing import Dict, Any, Tuple, Optional, List, Literal
|
| 5 |
+
import httpx, asyncio, time, os, hashlib, json, gzip, math
|
| 6 |
+
from pathlib import Path as _Path
|
| 7 |
+
from datetime import datetime
|
| 8 |
|
| 9 |
APP_NAME = "neuro-mechanism-backend"
|
| 10 |
+
CALLER_ID = "neuro-mech-backend-demo" # appears in STRING logs
|
| 11 |
+
DATA_DIR = _Path("/tmp/neuro_mech_jobs")
|
| 12 |
+
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 13 |
|
| 14 |
app = FastAPI(title=APP_NAME)
|
| 15 |
|
| 16 |
+
# CORS: every origin, method and header is allowed.
# Credentials are disabled because wildcard origins must not be combined with
# allow_credentials=True (FastAPI/Starlette CORSMiddleware documentation).
# NOTE(review): no endpoint visible in this file reads cookies or auth headers;
# if credentialed cross-origin calls are ever needed, list explicit origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 21 |
+
|
| 22 |
@app.get("/", include_in_schema=False)
def root():
    """Redirect requests for the bare root URL to ``/docs``."""
    docs_url = "/docs"
    return RedirectResponse(url=docs_url)
|
|
|
|
| 31 |
def endpoints():
|
| 32 |
return JSONResponse({
|
| 33 |
"GET": [
|
| 34 |
+
"/mechanism_graph_manifest?receptor=HTR2A&symptom=apathy&species=9606",
|
| 35 |
+
"/mechanism_graph/nodes?job_id=<id>&page=1&page_size=200",
|
| 36 |
+
"/mechanism_graph/edges?job_id=<id>&page=1&page_size=200",
|
| 37 |
+
"/mechanism_graph/literature?job_id=<id>&page=1&page_size=50",
|
| 38 |
+
"/mechanism_graph/regions?job_id=<id>&page=1&page_size=50",
|
| 39 |
+
"/download/<job_id>/nodes (gz)",
|
| 40 |
+
"/download/<job_id>/edges (gz)",
|
| 41 |
+
"/download/<job_id>/literature (gz)",
|
| 42 |
+
"/download/<job_id>/regions (gz)",
|
| 43 |
+
"/util/synonyms?term=apathy&kind=phenotype",
|
| 44 |
+
"/heuristics/regions_from_string?receptor=HTR2A&symptom=apathy&limit=40",
|
| 45 |
"/lit/eupmc?query=HTR2A%20AND%20apathy&pageSize=5",
|
| 46 |
"/string/network?identifiers=HTR2A&species=9606",
|
| 47 |
"/gpcrdb/protein?entry=htr2a_human",
|
|
|
|
| 53 |
]
|
| 54 |
})
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
UA = {"User-Agent": f"{APP_NAME}/1.2 (HF Space)"}
|
| 57 |
|
| 58 |
# ----------------- tiny in-memory TTL cache -----------------
|
|
|
|
| 75 |
async with httpx.AsyncClient(headers=UA, timeout=30) as client:
|
| 76 |
r = await client.get(url, params=params)
|
| 77 |
r.raise_for_status()
|
| 78 |
+
# Some third-party APIs return plain text/HTML on error;
|
| 79 |
+
# Fast path: try JSON, else wrap as text.
|
| 80 |
+
try:
|
| 81 |
+
data = r.json()
|
| 82 |
+
except Exception:
|
| 83 |
+
data = {"text": r.text, "status_code": r.status_code}
|
| 84 |
async with self._lock:
|
| 85 |
if len(self.store) > self.max_items:
|
| 86 |
self.store.pop(next(iter(self.store)))
|
|
|
|
| 89 |
|
| 90 |
CACHE = TTLCache()
|
| 91 |
|
| 92 |
+
# ----------------- polite throttling for STRING ------------------
|
| 93 |
_last_string_call = 0.0
|
| 94 |
async def throttle_string():
|
| 95 |
+
"""Be nice to STRING; ~1 req/sec as a courtesy."""
|
|
|
|
| 96 |
global _last_string_call
|
| 97 |
now = time.time()
|
| 98 |
wait = 1.05 - (now - _last_string_call)
|
|
|
|
| 100 |
await asyncio.sleep(wait)
|
| 101 |
_last_string_call = time.time()
|
| 102 |
|
| 103 |
+
# ----------------- helpers -----------------
|
| 104 |
async def get_json_cached(url: str, params: Optional[dict], ttl: int):
|
| 105 |
return await CACHE.get(url, params, ttl)
|
| 106 |
|
| 107 |
+
def _safe_float(x, default=0.0):
    """Convert ``x`` with ``float()``; return ``default`` if the conversion raises."""
    try:
        converted = float(x)
    except Exception:
        converted = default
    return converted
|
| 112 |
+
|
| 113 |
+
def _hash_params(d: dict) -> str:
    """Return the SHA-1 hex digest of ``d`` serialised as key-sorted JSON."""
    canonical = json.dumps(d, sort_keys=True)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
|
| 115 |
+
|
| 116 |
+
# ----------------- base connectors -----------------
|
| 117 |
@app.get("/lit/eupmc")
async def europe_pmc_search(query: str, pageSize: int = 5, page: int = 1):
    """Forward a search to the Europe PMC REST service and return its JSON.

    ``query``, ``pageSize`` and ``page`` are passed through as request
    parameters together with ``format=json``; the response is fetched via
    ``get_json_cached`` with ``ttl=600``.
    """
    # docs: https://europepmc.org/RestfulWebService ; client vignette: europepmc R pkg
    request_params = dict(query=query, format="json", pageSize=pageSize, page=page)
    return await get_json_cached(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        request_params,
        ttl=600,
    )
|
| 124 |
|
| 125 |
@app.get("/lit/pubmed_esearch")
|
|
|
|
| 160 |
@app.get("/gpcrdb/protein")
async def gpcrdb_protein(entry: str):
    """Fetch the GPCRdb protein record for ``entry`` (e.g. ``htr2a_human``).

    ``entry`` comes straight from the query string, so it is percent-encoded
    before being placed in the URL path; characters such as ``/``, ``?`` or
    ``#`` can therefore not redirect the request to a different GPCRdb
    resource. The response is fetched via ``get_json_cached`` with
    ``ttl=86400``.
    """
    from urllib.parse import quote  # local import: only this connector needs it

    # safe="" also escapes "/" so the entry always stays a single path segment.
    url = f"https://gpcrdb.org/services/protein/{quote(entry, safe='')}"
    return await get_json_cached(url, None, ttl=86400)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
@app.get("/string/network")
async def string_network(identifiers: str, species: int = 9606, limit: int = 50):
    """Query the STRING JSON network endpoint for ``identifiers``.

    ``throttle_string()`` is awaited before every call; the request is then
    issued through ``get_json_cached`` with ``ttl=3600``. ``CALLER_ID`` is
    sent as ``caller_identity``.
    """
    # STRING JSON network endpoint
    await throttle_string()
    request_params = {
        "identifiers": identifiers,
        "species": species,
        "caller_identity": CALLER_ID,
        "limit": limit,
    }
    return await get_json_cached(
        "https://string-db.org/api/json/network", request_params, ttl=3600
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
# ----------------- synonym utilities -----------------
|
| 174 |
+
# curated region slang/aliases (additive to OLS)
|
| 175 |
+
CURATED_REGION_SYNONYMS = {
|
| 176 |
+
"prefrontal cortex": ["PFC", "frontal cortex", "dorsolateral prefrontal cortex", "dlPFC",
|
| 177 |
+
"ventromedial prefrontal cortex", "vmPFC", "orbitofrontal cortex", "OFC"],
|
| 178 |
+
"anterior cingulate cortex": ["ACC", "dorsal ACC", "dACC", "rostral ACC", "rACC"],
|
| 179 |
+
"nucleus accumbens": ["NAc", "ventral striatum"],
|
| 180 |
"ventral tegmental area": ["VTA"],
|
| 181 |
+
"substantia nigra": ["SN", "pars compacta", "SNc"],
|
| 182 |
+
"hippocampus": ["hippocampal formation", "CA1", "CA3", "dentate gyrus"],
|
| 183 |
+
"amygdala": ["basolateral amygdala", "BLA", "central amygdala"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
}
|
| 185 |
|
| 186 |
+
async def _ols_synonyms(term: str, ontologies: Optional[List[str]] = None) -> List[str]:
    """Collect synonyms for ``term`` from the first five OLS4 search hits.

    Args:
        term: Free-text search term sent as ``q``.
        ontologies: Optional ontology ids; sent as one comma-separated
            ``ontology`` parameter.

    Returns:
        The distinct string synonyms found (unordered). An empty list when
        the payload does not have the expected ``response.docs`` shape.
    """
    url = "https://www.ebi.ac.uk/ols4/api/search"
    params = {"q": term}
    if ontologies:
        # NOTE(review): filters are joined into a single comma-separated
        # value; confirm against the OLS4 search documentation.
        params["ontology"] = ",".join(ontologies)
    data = await get_json_cached(url, params, ttl=86400)

    # Validate the payload shape explicitly instead of swallowing every
    # exception: the cache may hand back a non-JSON fallback dict or a list.
    response = data.get("response") if isinstance(data, dict) else None
    docs = response.get("docs") if isinstance(response, dict) else None
    if not isinstance(docs, list):
        return []

    # NOTE(review): this reads a "synonyms" key from each hit; verify that the
    # OLS4 search response actually carries that field for the default request.
    syns = set()
    for doc in docs[:5]:
        if not isinstance(doc, dict):
            continue
        raw = doc.get("synonyms")
        if isinstance(raw, str):
            # A scalar value is one synonym, not a sequence of characters.
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            continue
        syns.update(s for s in raw if isinstance(s, str))
    return list(syns)
|
| 205 |
+
|
| 206 |
+
async def _mygene_aliases(symbol: str) -> List[str]:
    """Return names and aliases of ``symbol`` from the top MyGene.info hit.

    Queries MyGene.info v3 for ``symbol:<symbol>`` (human, one hit) and reads
    the ``symbol``, ``name``, ``alias``, ``alias_symbol`` and ``other_names``
    fields of the first hit.

    Returns:
        The distinct string values found (unordered). An empty list when the
        payload has no usable hit.
    """
    url = "https://mygene.info/v3/query"
    params = {"q": f"symbol:{symbol}", "fields": "symbol,name,alias,alias_symbol,other_names", "size": 1, "species": "human"}
    data = await get_json_cached(url, params, ttl=86400)

    # Explicit shape checks replace the previous catch-all ``except``.
    hits = data.get("hits") if isinstance(data, dict) else None
    if not isinstance(hits, list) or not hits or not isinstance(hits[0], dict):
        return []

    top = hits[0]
    syns = set()
    for fld in ("symbol", "name", "alias", "alias_symbol", "other_names"):
        value = top.get(fld)
        if isinstance(value, str):
            # A field can arrive as a bare string when it holds a single
            # value; keep it instead of dropping it.
            syns.add(value)
        elif isinstance(value, list):
            syns.update(x for x in value if isinstance(x, str))
    return list(syns)
|
| 229 |
+
|
| 230 |
+
@app.get("/util/synonyms")
async def util_synonyms(term: str, kind: Literal["region","gene","phenotype","auto"]="auto"):
    """
    Fetch synonyms for a term.
    region: curated aliases (looked up by lower-cased term) + OLS4 (uberon, hbp, hpo, ncit)
    gene: MyGene.info names/aliases
    phenotype: OLS4 (hpo, efo, mondo)
    auto: treated as gene when ``term.isupper()`` is true, otherwise as phenotype.
    The term itself is always included; entries longer than 60 characters are dropped.
    """
    resolved = kind
    if resolved == "auto":
        resolved = "gene" if term.isupper() else "phenotype"

    collected = {term}
    if resolved == "region":
        collected.update(CURATED_REGION_SYNONYMS.get(term.lower(), []))
        collected.update(await _ols_synonyms(term, ontologies=["uberon","hbp","hpo","ncit"]))
    elif resolved == "gene":
        collected.update(await _mygene_aliases(term))
    elif resolved == "phenotype":
        collected.update(await _ols_synonyms(term, ontologies=["hpo","efo","mondo"]))

    # Keep only reasonably short strings and return them in sorted order.
    kept = sorted(s for s in collected if isinstance(s, str) and len(s) <= 60)
    return {"term": term, "kind": resolved, "synonyms": kept}
|
| 253 |
+
|
| 254 |
+
# ----------------- region heuristic (upgraded) -----------------
|
| 255 |
+
REGION_TERMS_DEFAULT = [
|
| 256 |
+
"prefrontal cortex","anterior cingulate cortex","mPFC","ACC","nucleus accumbens","ventral striatum",
|
| 257 |
+
"dorsal striatum","caudate","putamen","amygdala","hippocampus","thalamus","hypothalamus",
|
| 258 |
+
"insula","ventral tegmental area","VTA","substantia nigra","cerebellum"
|
| 259 |
+
]
|
| 260 |
|
| 261 |
def collect_gene_symbols_from_string(edges: List[dict], focus: str) -> List[str]:
    """Return the distinct partner gene symbols found in STRING edges.

    Args:
        edges: Edge records carrying ``preferredName_A`` / ``preferredName_B``.
            Anything that is not a list/tuple (for example ``None`` or a
            non-JSON fallback dict) is treated as "no edges"; non-dict items
            are skipped.
        focus: Symbol to exclude, compared case-insensitively.

    Returns:
        Unique symbols in first-seen order, so the result is reproducible
        across processes (a plain ``set`` would not be).
    """
    if not isinstance(edges, (list, tuple)):
        return []

    focus_upper = focus.upper()
    seen = {}  # dict keys preserve insertion order and de-duplicate
    for edge in edges:
        if not isinstance(edge, dict):
            continue
        for key in ("preferredName_A", "preferredName_B"):
            name = edge.get(key)
            if name and isinstance(name, str) and name.upper() != focus_upper:
                seen.setdefault(name, None)
    return list(seen)
|
| 270 |
|
| 271 |
+
async def _eupmc_hitcount(q: str) -> int:
    """Return the ``hitCount`` Europe PMC reports for query ``q`` (0 if unreadable)."""
    # Only the count is read, so the request asks for pageSize=0.
    # NOTE(review): confirm that Europe PMC accepts pageSize=0.
    payload = await get_json_cached(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        {"query": q, "format": "json", "pageSize": 0},
        ttl=3600,
    )
    try:
        hit_count = int(payload.get("hitCount", 0))
    except Exception:
        hit_count = 0
    return hit_count
|
| 280 |
+
|
| 281 |
@app.get("/heuristics/regions_from_string")
async def regions_from_string(
    receptor: str = Query(..., description="e.g., HTR2A"),
    species: int = 9606,
    limit: int = 40,
    regions: Optional[str] = Query(None, description="comma-separated region terms (optional)"),
    symptom: Optional[str] = Query(None, description="optional phenotype/symptom to weight co-mentions (e.g., apathy)")
):
    """
    Heuristic: rank brain regions by STRING neighbors + Europe PMC co-mentions, with synonyms & tiered fallbacks.
    Tiers (all unquoted for flexible match):
      T1: (region_syns) AND ((receptor_syns) OR neighbors) AND (symptom_syns?)   weight 1.0
      T2: (region_syns) AND (receptor_syns OR neighbors)                         weight 0.6
      T3: (region_syns) AND (receptor_syns)                                      weight 0.5
      T4: (region_syns) AND (symptom_syns)                                       weight 0.3
    Final score = log10(weighted_hits+1) * mean STRING confidence of the neighbors
    (0.2 when STRING returned no usable neighbors).
    At most 25 neighbors are used, chosen by highest STRING score.
    """
    # 1) STRING neighbors. The cached fetch can return a non-list payload
    #    (e.g. a {"text": ...} fallback), so keep only dict edge records.
    raw_edges = await string_network(receptor, species=species, limit=limit)
    edges = [e for e in raw_edges if isinstance(e, dict)] if isinstance(raw_edges, list) else []
    neighbors = collect_gene_symbols_from_string(edges, receptor)

    # STRING confidences: best score seen per neighbor symbol
    focus = receptor.upper()
    conf: Dict[str, float] = {}
    for e in edges:
        score = _safe_float(e.get("score", 0))
        for key in ("preferredName_A", "preferredName_B"):
            name = e.get(key)
            if name and isinstance(name, str) and name.upper() != focus:
                conf[name] = max(conf.get(name, 0.0), score)
    mean_conf = sum(conf.values()) / len(conf) if conf else 0.2

    # Pick the 25 neighbors deterministically (highest confidence first, name
    # as tie-break) rather than slicing an arbitrarily ordered list.
    top_neighbors = sorted(neighbors, key=lambda g: (-conf.get(g, 0.0), g))[:25]

    # 2) synonyms
    receptor_syns = await _mygene_aliases(receptor)
    symptom_syns = []
    if symptom:
        s = await util_synonyms(symptom, kind="phenotype")
        symptom_syns = s["synonyms"]

    region_list = [r.strip() for r in (regions.split(",") if regions else REGION_TERMS_DEFAULT) if r.strip()]

    # Build clauses (unquoted OR lists)
    gene_clause = " OR ".join(sorted({receptor} | set(receptor_syns) | set(top_neighbors)))
    receptor_clause = " OR ".join(sorted({receptor} | set(receptor_syns)))
    symptom_clause = " OR ".join(symptom_syns)

    # Region synonym lookups are independent of each other; run them together.
    region_lookups = await asyncio.gather(
        *(util_synonyms(region, kind="region") for region in region_list)
    )

    tasks = []
    tier_defs = []
    for region, rs in zip(region_list, region_lookups):
        # Fall back to the raw term so the clause can never be empty.
        region_syns = rs["synonyms"] or [region]
        region_clause = " OR ".join(region_syns)

        # tiers (T1 collapses to the T2 query when no symptom is given)
        if symptom and symptom_syns:
            t1 = f"({region_clause}) AND (({gene_clause})) AND ({symptom_clause})"
        else:
            t1 = f"({region_clause}) AND (({gene_clause}))"
        t2 = f"({region_clause}) AND (({gene_clause}))"
        t3 = f"({region_clause}) AND ({receptor_clause})"
        t4 = f"({region_clause}) AND ({symptom_clause})" if symptom_syns else None

        tiers = [("t1", 1.0, t1), ("t2", 0.6, t2), ("t3", 0.5, t3)]
        if t4:
            tiers.append(("t4", 0.3, t4))

        # schedule hitCount calls
        tier_defs.append((region, tiers))
        tasks.extend(_eupmc_hitcount(q) for _, _, q in tiers)

    # gather all counts in-order, then fold them back into their regions
    counts = iter(await asyncio.gather(*tasks))
    results = []
    for region, tiers in tier_defs:
        weighted = 0.0
        tier_counts = {}
        for name, weight, _q in tiers:
            hc = next(counts)
            tier_counts[name] = hc
            weighted += weight * hc
        score = math.log10(weighted + 1.0) * mean_conf
        results.append({"region": region, "tiers": tier_counts, "weighted_hits": int(round(weighted)),
                        "weighted_score": round(score, 4)})

    results.sort(key=lambda x: x["weighted_score"], reverse=True)
    return {
        "focus": receptor,
        "neighbors_considered": top_neighbors,
        "regions_ranked": results,
        "notes": "STRING + Europe PMC with synonyms and tiered fallbacks (unquoted)."
    }
|
| 372 |
|
| 373 |
+
# ----------------- MANIFEST + PAGED SECTIONS + DOWNLOAD -----------------
|
| 374 |
+
def _job_dir(job_id: str) -> _Path:
    """Return (creating it if needed) the directory holding a job's files.

    ``job_id`` arrives from request parameters on the paging and download
    endpoints, so it is validated before being joined onto ``DATA_DIR``. Only
    the 40-character lowercase hex form produced by ``_hash_params`` is
    accepted, which rules out path separators and ``..`` segments.

    Raises:
        HTTPException: 400 when ``job_id`` is not a 40-character hex digest.
    """
    is_digest = (
        isinstance(job_id, str)
        and len(job_id) == 40
        and all(ch in "0123456789abcdef" for ch in job_id)
    )
    if not is_digest:
        raise HTTPException(status_code=400, detail="invalid job_id")
    d = DATA_DIR / job_id
    d.mkdir(parents=True, exist_ok=True)
    return d
|
| 378 |
+
|
| 379 |
+
def _write_gz_jsonl(path: _Path, items: List[dict]):
    """Write ``items`` to ``path`` as gzip-compressed UTF-8 JSON Lines (one object per line)."""
    with gzip.open(path, "wt", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )
|
| 383 |
+
|
| 384 |
+
def _read_gz_page(path: _Path, page: int, page_size: int) -> Tuple[int, List[dict]]:
    """Read one page of records from a gzip-compressed JSON Lines file.

    Args:
        path: File to read (gzip, UTF-8 text, one JSON object per line).
        page: 1-based page number.
        page_size: Number of records per page.

    Returns:
        ``(total, items)`` where ``total`` is the number of non-blank lines
        in the whole file and ``items`` are the decoded records of the
        requested page. A window outside the data yields an empty list.
    """
    start = (page - 1) * page_size
    end = start + page_size
    total = 0
    out = []
    with gzip.open(path, "rt", encoding="utf-8") as gz:
        for line in gz:
            if not line.strip():
                continue
            # Index by record number, not raw line number, so blank lines can
            # neither shift the page window nor disagree with ``total``.
            if start <= total < end:
                out.append(json.loads(line))
            total += 1
    return total, out
|
| 397 |
+
|
| 398 |
+
async def _build_mech_job(params: dict) -> dict:
    """
    Build nodes/edges/literature/regions; write gz NDJSON + meta.

    The job id is the hash of ``params``. If ``meta.json`` already exists in
    the job directory it is returned as-is and nothing is rebuilt. Otherwise
    the four section files are written first and ``meta.json`` last.

    Args:
        params: Must contain ``receptor``; may contain ``species``,
            ``symptom``, ``string_limit``, ``eupmc_page_size`` and
            ``eupmc_max_pages``.

    Returns:
        The manifest dict (``job_id``, ``created``, ``params``, ``counts``,
        ``sections``).
    """
    receptor = params["receptor"]
    species = int(params.get("species", 9606))
    symptom = params.get("symptom")
    string_limit = int(params.get("string_limit", 200))
    eupmc_page_size = int(params.get("eupmc_page_size", 100))
    eupmc_max_pages = int(params.get("eupmc_max_pages", 3))

    job_id = _hash_params(params)
    d = _job_dir(job_id)
    meta_path = d / "meta.json"
    if meta_path.exists():
        return json.loads(meta_path.read_text("utf-8"))

    # 1) STRING edges + nodes. The cached fetch may return a non-list payload
    #    (e.g. a {"text": ...} fallback); treat that as "no edges".
    raw_edges = await string_network(receptor, species=species, limit=string_limit)
    edge_items = []
    nodes = {receptor}
    for e in raw_edges if isinstance(raw_edges, list) else []:
        if not isinstance(e, dict):
            continue
        a = e.get("preferredName_A")
        b = e.get("preferredName_B")
        # Both ends must be non-empty strings: they are sorted and
        # upper-cased below.
        if isinstance(a, str) and isinstance(b, str) and a and b:
            edge_items.append({"a": a, "b": b, "score": _safe_float(e.get("score", 0))})
            nodes.update((a, b))
    node_items = [{"symbol": n, "seed": (n.upper() == receptor.upper())} for n in sorted(nodes)]

    _write_gz_jsonl(d / "edges.jsonl.gz", edge_items)
    _write_gz_jsonl(d / "nodes.jsonl.gz", node_items)

    # 2) Europe PMC literature for (receptor AND symptom?) else receptor
    lit_items = []
    seen_ids = set()
    base_q = f"{receptor} AND {symptom}" if symptom else receptor
    for page in range(1, eupmc_max_pages + 1):
        res = await europe_pmc_search(base_q, pageSize=eupmc_page_size, page=page)
        result_list = res.get("resultList") if isinstance(res, dict) else None
        hits = (result_list.get("result") if isinstance(result_list, dict) else None) or []
        added = 0
        for h in hits:
            if not isinstance(h, dict):
                continue
            # NOTE(review): de-duplicate on (source, id) in case the service
            # returns the same records for successive ``page`` values; confirm
            # how Europe PMC paginates (it documents a cursorMark parameter).
            key = (h.get("source"), h.get("id"))
            if key[1] is not None:
                if key in seen_ids:
                    continue
                seen_ids.add(key)
            lit_items.append({
                "id": h.get("id"),
                "source": h.get("source"), "title": h.get("title"),
                "pubYear": h.get("pubYear"), "authorString": h.get("authorString"),
                "journalTitle": h.get("journalTitle"), "doi": h.get("doi")
            })
            added += 1
        # stop early on the last page, or when a page brought nothing new
        if len(hits) < eupmc_page_size or added == 0:
            break
    _write_gz_jsonl(d / "literature.jsonl.gz", lit_items)

    # 3) Regions heuristic (with symptom)
    reg = await regions_from_string(receptor=receptor, species=species, limit=min(100, string_limit), regions=None, symptom=symptom)
    reg_items = list(reg.get("regions_ranked", []))
    _write_gz_jsonl(d / "regions.jsonl.gz", reg_items)

    meta = {
        "job_id": job_id,
        "created": datetime.utcnow().isoformat() + "Z",
        "params": params,
        "counts": {
            "nodes": len(node_items),
            "edges": len(edge_items),
            "literature": len(lit_items),
            "regions": len(reg_items)
        },
        "sections": ["nodes", "edges", "literature", "regions"]
    }
    # Written last: its presence is what marks the job as complete.
    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    return meta
|
| 469 |
+
|
| 470 |
+
@app.get("/mechanism_graph_manifest")
async def mechanism_graph_manifest(
    receptor: str = Query(...),
    species: int = 9606,
    symptom: Optional[str] = None,
    string_limit: int = 200,
    eupmc_page_size: int = 100,
    eupmc_max_pages: int = 3
):
    """
    Build the full mechanism dataset on the server and return its manifest (job_id + counts).
    The data itself is stored as gzipped NDJSON and can be:
      - paged via /mechanism_graph/{section}?job_id=...&page=1&page_size=...
      - or downloaded as a single gz file via /download/{job_id}/{section}
    """
    # The six request parameters, in this order, define the job.
    job_params = dict(
        receptor=receptor,
        species=species,
        symptom=symptom,
        string_limit=string_limit,
        eupmc_page_size=eupmc_page_size,
        eupmc_max_pages=eupmc_max_pages,
    )
    return await _build_mech_job(job_params)
|
| 491 |
+
|
| 492 |
+
@app.get("/mechanism_graph/{section}")
async def mechanism_graph_section(
    section: Literal["nodes","edges","literature","regions"] = Path(...),
    job_id: str = Query(...),
    page: int = 1,
    page_size: int = 100
):
    """
    Return a single page from a section (nodes|edges|literature|regions).

    Raises 400 when ``page`` or ``page_size`` is below 1, and 404 when the
    job has no file for the requested section.
    """
    # Zero or negative values would produce a meaningless record window;
    # reject them instead of silently returning an empty or odd page.
    if page < 1 or page_size < 1:
        raise HTTPException(status_code=400, detail="page and page_size must be >= 1")

    d = _job_dir(job_id)
    p = d / f"{section}.jsonl.gz"
    if not p.exists():
        raise HTTPException(status_code=404, detail=f"section {section} not found for job {job_id}")

    total, items = _read_gz_page(p, page=page, page_size=page_size)
    return {
        "job_id": job_id,
        "section": section,
        "page": page, "page_size": page_size,
        "total": total,
        "items": items
    }
|
| 515 |
+
|
| 516 |
+
@app.get("/download/{job_id}/{section}")
async def download_section(job_id: str, section: Literal["nodes","edges","literature","regions"]):
    """
    Serve one section of a job as its complete gzipped NDJSON file.
    Responds with 404 when the job has no file for that section.
    """
    archive = _job_dir(job_id) / f"{section}.jsonl.gz"
    if archive.exists():
        return FileResponse(
            path=str(archive),
            filename=f"{APP_NAME}-{job_id}-{section}.jsonl.gz",
            media_type="application/gzip"
        )
    raise HTTPException(status_code=404, detail=f"section {section} not found for job {job_id}")
|