hchevva committed on
Commit
05faca4
·
verified ·
1 Parent(s): e73b687

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +295 -199
app.py CHANGED
@@ -1,228 +1,254 @@
1
- import os
2
  import json
 
3
  import time
4
- import asyncio
5
- from dataclasses import dataclass
6
- from typing import Any, Dict, Tuple
7
 
8
  import gradio as gr
 
9
 
10
- from core.http import client
11
  from core.rate_limit import check_and_increment_global_ai_cap
12
- from core.validate import is_cas
13
  from core.pdf_report import build_pdf
 
14
 
15
- from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema, cdc
16
- from core.sources.ai_summary import generate_ai_summary
17
-
 
 
18
 
19
  # -----------------------------
20
- # Settings
21
- # -----------------------------
22
- @dataclass
23
- class Settings:
24
- openai_model: str = os.getenv("OPENAI_MODEL", "gpt-4o")
25
- max_ai_summaries_per_day: int = int(os.getenv("MAX_AI_SUMMARIES_PER_DAY", "100"))
26
- cache_ttl_seconds: int = int(os.getenv("CACHE_TTL_SECONDS", "86400"))
27
-
28
-
29
- settings = Settings()
30
-
31
- # -----------------------------
32
- # Simple in-memory caches
33
  # -----------------------------
34
  SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
35
  AI_CACHE: Dict[str, str] = {}
36
 
37
- # -----------------------------
38
- # Utilities
39
- # -----------------------------
40
- def _pretty(obj: Any) -> str:
41
  try:
42
- return json.dumps(obj, indent=2, ensure_ascii=False)
43
  except Exception:
44
  return str(obj)
45
 
46
 
47
- def _truncate_text(s: str, max_chars: int) -> str:
48
- if not s:
49
- return ""
50
- if len(s) <= max_chars:
51
- return s
52
- return s[:max_chars] + "\n\n[TRUNCATED]\n"
53
 
54
 
55
  # -----------------------------
56
- # Renderers (Markdown blocks)
57
  # -----------------------------
58
- def render_overview(data: dict) -> str:
59
- q = data.get("query", "")
60
- cas = data.get("cas_used", "")
61
- return f"**Query:** `{q}`\n\n**CAS used:** `{cas}`"
62
-
63
-
64
- def render_pubchem_summary(pub: dict) -> str:
65
- if not pub or not pub.get("ok"):
66
- return f"PubChem unavailable: {pub.get('error') if isinstance(pub, dict) else 'unknown'}"
67
-
68
- cid = pub.get("cid", "")
69
- resolved_cas = pub.get("resolved_cas", "")
70
- iupac = pub.get("iupac_name") or pub.get("title") or "-"
71
- mf = pub.get("molecular_formula") or "-"
72
- mw = pub.get("molecular_weight") or "-"
73
- smiles = pub.get("canonical_smiles") or "-"
74
- struct_url = pub.get("structure_png") or ""
75
 
 
 
 
76
  lines = [
77
- f"**CID:** `{cid}`",
78
- f"**Resolved CAS (from synonyms):** `{resolved_cas}`" if resolved_cas else "",
79
- f"**IUPAC/Title:** {iupac}",
80
- "",
81
- f"**Molecular Formula:** `{mf}`",
82
- f"**Molecular Weight:** `{mw}`",
83
- f"**Canonical SMILES:** `{smiles}`",
84
- "",
85
  ]
86
 
87
- if struct_url:
88
- lines += [
89
- "**Structure**",
90
- f"![structure]({struct_url})",
91
- "",
92
- ]
 
 
 
93
 
94
- pc_url = pub.get("url") or ""
95
- if pc_url:
96
- lines.append(f"[Open PubChem]({pc_url})")
97
 
98
- return "\n".join([x for x in lines if x != ""])
 
 
 
99
 
 
 
 
100
 
101
- def render_ctx_summary(ctx: dict) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  if not ctx or not ctx.get("ok"):
103
- return ctx.get("error") or "CTX unavailable."
104
- # Full fields already in ctx dict; render a compact header + note
105
- dtxsid = ctx.get("dtxsid") or ""
 
 
 
 
 
 
 
106
  lines = []
107
  if dtxsid:
108
  lines.append(f"**DTXSID:** `{dtxsid}`")
109
- hazard = ctx.get("ghs_hazard_statements") or ""
110
- echa = ctx.get("echa_cl_summary") or ""
111
- if hazard:
112
- lines.append(f"\n**GHS Hazard Statements:**\n\n{hazard}")
113
- if echa:
114
- lines.append(f"\n**ECHA C&L Summary:**\n\n{echa}")
115
-
116
- dash = ctx.get("dashboard_search_url") or ""
117
  if dash:
118
- lines.append(f"\n[Open CompTox Dashboard search]({dash})")
119
- return "\n".join(lines) if lines else "No DTXSID found for this query."
120
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- def render_ntp_summary(ntp_obj: dict) -> str:
123
- if not ntp_obj or not ntp_obj.get("ok"):
124
- return ntp_obj.get("error") or "NTP unavailable."
125
- hits = ntp_obj.get("hits") or []
126
- if not hits:
127
- return "No NTP Technical Reports found for this CAS."
128
- lines = []
129
- for h in hits:
130
- tr = h.get("tr") or h.get("title") or "NTP Technical Report"
131
- url = h.get("url") or ""
132
- pdf = h.get("pdf") or ""
133
- # Always show PDF link when available
134
- if pdf:
135
- lines.append(f"- **{tr}** — [Report page]({url}) • [PDF]({pdf})" if url else f"- **{tr}** — [PDF]({pdf})")
136
- else:
137
- lines.append(f"- **{tr}** — [Report page]({url})" if url else f"- **{tr}**")
138
  return "\n".join(lines)
139
 
140
 
141
- def render_cdc_summary(cdc_obj: dict) -> str:
142
- if not cdc_obj or not cdc_obj.get("ok"):
143
- return cdc_obj.get("error") or "CDC toxprofiles unavailable."
 
144
 
145
- matches = cdc_obj.get("matches") or []
146
- if not matches:
147
- return "No toxprofile is available for the chemical."
148
 
149
  lines = []
150
- for m in matches:
151
- name = m.get("name") or "ToxProfile"
152
- cas = m.get("cas") or ""
153
- url = m.get("url") or ""
154
  if url:
155
- lines.append(f"- **{name}** (CAS: {cas}) — [CDC ToxProfile]({url})")
156
  else:
157
- lines.append(f"- **{name}** (CAS: {cas})")
158
  return "\n".join(lines)
159
 
160
 
161
- def render_iarc_block(obj: dict) -> str:
162
- if not obj or not obj.get("ok"):
163
  return "IARC link unavailable."
164
- url = obj.get("url") or ""
165
- if not url:
166
- return "IARC link unavailable."
167
- return f"[Open IARC Monographs search]({url})"
168
-
169
-
170
- def render_scholar_block(obj: dict) -> str:
171
- if not obj or not obj.get("ok"):
172
- return "Scholar link unavailable."
173
- url = obj.get("url") or ""
174
- return f"[Open Google Scholar search]({url})" if url else "Scholar link unavailable."
175
-
176
-
177
- def render_fema_block(obj: dict) -> str:
178
- if not obj or not obj.get("ok"):
179
- return "FEMA link unavailable."
180
- url = obj.get("url") or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  return f"[Open FEMA / Fragrance Materials Safety Resource search]({url})" if url else "FEMA link unavailable."
182
 
183
 
184
- # -----------------------------
185
- # Prompt builder (keep small)
186
- # -----------------------------
187
- def build_prompt(data: dict) -> str:
188
- cas = data.get("cas_used") or data.get("query") or "unknown"
189
-
190
- pub = data.get("pubchem", {})
191
- ctx = data.get("ctx_genetox", {})
192
- ntp_obj = data.get("ntp_technical_reports", {})
193
- cdc_obj = data.get("cdc_toxprofiles", {})
194
-
195
- prompt = f"""You are a toxicology assistant. Summarize weight-of-evidence for mutagenicity/genotoxicity.
196
-
197
- Chemical CAS: {cas}
198
-
199
- PUBCHEM (selected fields):
200
- {_pretty({k: pub.get(k) for k in ['cid','resolved_cas','iupac_name','title','molecular_formula','molecular_weight','canonical_smiles','url']})}
201
-
202
- CTX (selected blocks only):
203
- {_pretty({k: ctx.get(k) for k in ['dtxsid','ghs_hazard_statements','echa_cl_summary','genetox_records']})}
204
-
205
- NTP Technical Reports (hits):
206
- {_pretty(ntp_obj.get('hits') if isinstance(ntp_obj, dict) else ntp_obj)}
207
-
208
- CDC ToxProfiles (matches):
209
- {_pretty(cdc_obj.get('matches') if isinstance(cdc_obj, dict) else cdc_obj)}
210
-
211
- Write a concise, structured summary:
212
- - Identity & key links
213
- - Genetox signals (Ames, micronucleus, chromosomal aberrations, etc.)
214
- - Any conflicts/inconsistencies
215
- - Overall conclusion (low/medium/high concern)
216
- - What data is missing
217
- """
218
-
219
- return _truncate_text(prompt, max_chars=16000)
220
 
221
 
222
  # -----------------------------
223
- # Search pipeline
224
  # -----------------------------
225
- async def run_search(query: str) -> dict:
 
226
  q = (query or "").strip()
227
  if not q:
228
  raise gr.Error("Enter a CAS number (preferred) or chemical name.")
@@ -232,65 +258,130 @@ async def run_search(query: str) -> dict:
232
  return SEARCH_CACHE[cache_key]
233
 
234
  async with client() as http:
 
235
  pub = await pubchem.pubchem_by_query(q, http)
236
 
237
  cas = q
238
- if not is_cas(cas):
239
  cas = pub.get("resolved_cas") or q
240
 
241
- ctx_task = (
242
- ctx_src.fetch_ctx_genetox(cas, http)
243
- if is_cas(cas)
244
- else asyncio.sleep(0, result={"ok": False, "error": "CTX requires CAS (CAS-first)."})
245
- )
246
  ntp_task = ntp.search_technical_reports(cas, http, limit=8)
247
 
248
  ctx_res, ntp_res = await asyncio.gather(ctx_task, ntp_task)
249
 
250
- # CDC (offline/local index): try resolved CAS first, then name fallback inside cdc.search()
251
- cdc_res = cdc.search(q, cas=cas if is_cas(cas) else None, limit=8)
252
-
253
- out = {
254
  "query": q,
255
  "cas_used": cas,
256
  "pubchem": pub,
257
  "ctx_genetox": ctx_res,
258
  "ntp_technical_reports": ntp_res,
259
- "cdc_toxprofiles": cdc_res,
260
  "iarc_monographs": iarc.bookshelf_link(cas),
261
  "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
262
- "fema": fema.fema_link(cas if is_cas(cas) else q),
263
  }
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  SEARCH_CACHE[cache_key] = out
266
  return out
267
 
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  def do_search(query: str):
270
  data = asyncio.run(run_search(query))
 
271
  overview_md_text = render_overview(data)
272
  pubchem_md_text = render_pubchem_summary(data.get("pubchem", {}))
273
  ctx_md_text = render_ctx_summary(data.get("ctx_genetox", {}))
274
  ntp_md_text = render_ntp_summary(data.get("ntp_technical_reports", {}))
275
- cdc_md_text = render_cdc_summary(data.get("cdc_toxprofiles", {}))
276
  iarc_md_text = render_iarc_block(data.get("iarc_monographs", {}))
277
  scholar_md_text = render_scholar_block(data.get("google_scholar", {}))
278
  fema_md_text = render_fema_block(data.get("fema", {}))
279
 
280
- raw_pubchem_json = _pretty(data.get("pubchem", {}))
281
- raw_ctx_json = _pretty(data.get("ctx_genetox", {}))
282
- raw_ntp_json = _pretty(data.get("ntp_technical_reports", {}))
283
- raw_iarc_json = _pretty(data.get("iarc_monographs", {}))
284
- raw_scholar_json = _pretty(data.get("google_scholar", {}))
285
- raw_fema_json = _pretty(data.get("fema", {}))
 
 
 
 
286
 
 
 
287
  return (
288
- data,
289
  overview_md_text,
290
  pubchem_md_text,
 
291
  ctx_md_text,
292
  ntp_md_text,
293
- cdc_md_text,
294
  iarc_md_text,
295
  scholar_md_text,
296
  fema_md_text,
@@ -300,13 +391,14 @@ def do_search(query: str):
300
  raw_iarc_json,
301
  raw_scholar_json,
302
  raw_fema_json,
303
- "", # ai_out blank after search
304
  )
305
 
306
 
307
  def generate_ai(data: dict):
308
  if not data:
309
  raise gr.Error("Run a search first.")
 
310
  cas = data.get("cas_used") or data.get("query") or ""
311
  cache_key = f"ai::{cas}"
312
  if cache_key in AI_CACHE:
@@ -316,6 +408,8 @@ def generate_ai(data: dict):
316
  if not allowed:
317
  return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."
318
 
 
 
319
  resp = generate_ai_summary(build_prompt(data))
320
  if not resp.get("ok"):
321
  return f"**AI summary unavailable:** {resp.get('error')}"
@@ -328,19 +422,17 @@ def generate_ai(data: dict):
328
  def download_report(data: dict, ai_text: str):
329
  if not data:
330
  raise gr.Error("Run a search first.")
 
331
  cas = data.get("cas_used") or data.get("query") or "unknown"
332
  pdf_path, json_path = build_pdf(cas, evidence=data, ai_summary=ai_text if ai_text else None)
333
  return pdf_path, json_path
334
 
335
 
336
  # -----------------------------
337
- # UI (light, production-like)
338
  # -----------------------------
339
- LIGHT_CSS = """
340
- .gradio-container { background: white !important; }
341
- """
342
 
343
- with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
344
  gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
345
  gr.Markdown(
346
  f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
@@ -363,6 +455,7 @@ with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
363
  with gr.Accordion("PubChem (summary)", open=False):
364
  pubchem_md = gr.Markdown()
365
 
 
366
  with gr.Accordion("CDC ToxProfiles", open=False):
367
  cdc_md = gr.Markdown()
368
 
@@ -406,9 +499,9 @@ with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
406
  state,
407
  overview_md,
408
  pubchem_md,
 
409
  ctx_md,
410
  ntp_md,
411
- cdc_md,
412
  iarc_md,
413
  scholar_md,
414
  fema_md,
@@ -429,9 +522,9 @@ with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
429
  state,
430
  overview_md,
431
  pubchem_md,
 
432
  ctx_md,
433
  ntp_md,
434
- cdc_md,
435
  iarc_md,
436
  scholar_md,
437
  fema_md,
@@ -449,5 +542,8 @@ with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
449
  pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])
450
 
451
 
 
 
 
452
  if __name__ == "__main__":
453
- demo.queue().launch()
 
1
+ import asyncio
2
  import json
3
+ import os
4
  import time
5
+ from typing import Any, Dict, Optional
 
 
6
 
7
  import gradio as gr
8
+ import httpx
9
 
10
+ from core.config import settings
11
  from core.rate_limit import check_and_increment_global_ai_cap
 
12
  from core.pdf_report import build_pdf
13
+ from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema
14
 
15
+ # Optional: CDC module may exist in your repo (user added).
16
+ try:
17
+ from core.sources import cdc
18
+ except Exception:
19
+ cdc = None # type: ignore
20
 
21
  # -----------------------------
22
+ # Caches (simple in-memory)
 
 
 
 
 
 
 
 
 
 
 
 
23
  # -----------------------------
24
  SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
25
  AI_CACHE: Dict[str, str] = {}
26
 
27
+
28
def json_pretty(obj: Any) -> str:
    """Serialize *obj* as indented, human-readable JSON text.

    Values the JSON encoder cannot handle natively are stringified through
    ``default=str``. If serialization still fails, the plain ``str(obj)``
    representation is returned instead of raising.
    """
    dump_options = {"indent": 2, "ensure_ascii": False, "default": str}
    try:
        rendered = json.dumps(obj, **dump_options)
    except Exception:
        # Best-effort display helper: never let a rendering problem propagate.
        rendered = str(obj)
    return rendered
33
 
34
 
35
def client() -> httpx.AsyncClient:
    """Create a new async HTTP client that sends this demo's User-Agent header."""
    default_headers = {"user-agent": "toxrai-hf-demo"}
    return httpx.AsyncClient(headers=default_headers)
 
 
 
 
37
 
38
 
39
  # -----------------------------
40
+ # Rendering helpers (Markdown)
41
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
def render_overview(data: Dict[str, Any]) -> str:
    """Render the overview header for a search result as Markdown.

    Always shows the query and the CAS that was used. The PubChem CID and the
    CompTox DTXSID are appended only when the corresponding source result is
    flagged ``ok`` and carries that identifier.
    """
    query_text = data.get("query") or ""
    cas_text = data.get("cas_used") or ""
    paragraphs = [f"**Query:** `{query_text}`", f"**CAS used:** `{cas_text}`"]

    # Quick identifiers, shown only when the source actually resolved them.
    pubchem_res = data.get("pubchem") or {}
    has_cid = pubchem_res.get("ok") and pubchem_res.get("cid")
    if has_cid:
        paragraphs.append(f"**PubChem CID:** `{pubchem_res.get('cid')}`")

    ctx_res = data.get("ctx_genetox") or {}
    has_dtxsid = ctx_res.get("ok") and ctx_res.get("dtxsid")
    if has_dtxsid:
        paragraphs.append(f"**EPA CompTox DTXSID:** `{ctx_res.get('dtxsid')}`")

    # A blank line between entries makes each one its own Markdown paragraph.
    return "\n\n".join(paragraphs)
60
 
 
 
 
61
 
62
def render_pubchem_summary(pub: Dict[str, Any]) -> str:
    """Render the PubChem result as a Markdown summary.

    Args:
        pub: PubChem lookup result. Fields read here: ``ok``, ``error``,
            ``cid``, ``resolved_cas``, ``props`` (mapping holding
            ``IUPACName``/``iupac_name``, ``MolecularFormula``,
            ``MolecularWeight``, ``CanonicalSMILES``), ``structure_png``,
            ``url`` and ``hazards`` (list of ``{"name", "text"}`` mappings).

    Returns:
        Markdown text ending in a single newline, or a one-line
        "PubChem unavailable: ..." message when the result is missing,
        malformed, or not flagged ``ok``.
    """
    # Check the type first: calling .get on a truthy non-dict would raise.
    if not isinstance(pub, dict) or not pub.get("ok"):
        err = pub.get("error") if isinstance(pub, dict) else None
        # A missing error must not be rendered as the literal text "None".
        return f"PubChem unavailable: {err or 'Unknown PubChem error'}"

    props = pub.get("props")
    if not isinstance(props, dict):
        props = {}

    cid = pub.get("cid")
    cid_str = f"{cid}" if cid not in (None, "") else "-"
    resolved_cas = pub.get("resolved_cas") or "-"
    iupac_name = props.get("IUPACName") or props.get("iupac_name") or "-"
    formula = props.get("MolecularFormula") or "-"
    mw = props.get("MolecularWeight")
    mw_str = f"{mw}" if mw not in (None, "") else "-"
    smiles = props.get("CanonicalSMILES") or "-"

    lines = [
        f"**CID:** `{cid_str}`",
        f"**Resolved CAS (from synonyms):** `{resolved_cas}`",
        f"**IUPAC/Title:** {iupac_name}",
        "",
        f"**Molecular Formula:** `{formula}`",
        f"**Molecular Weight:** `{mw_str}`",
        f"**Canonical SMILES:** `{smiles}`",
    ]

    structure_png = pub.get("structure_png")
    if structure_png:
        lines += ["", "**Structure**", f"![]({structure_png})"]

    url = pub.get("url")
    if url:
        lines += ["", f"[Open PubChem]({url})"]

    # Render hazards as paragraphs (avoids weird wrapping from bullet nesting).
    # Entries that are not mappings or carry no text are skipped.
    hazards = pub.get("hazards")
    hazard_lines = []
    if isinstance(hazards, (list, tuple)):
        for hazard in hazards:
            if not isinstance(hazard, dict):
                continue
            text = hazard.get("text") or ""
            if not text:
                continue
            name = hazard.get("name") or "Hazard"
            hazard_lines += [f"**{name}:** {text}", ""]
    if hazard_lines:
        # Only emit the heading when at least one hazard has content.
        lines += ["", "### Safety / Hazard Information", *hazard_lines]

    return "\n".join(lines).rstrip() + "\n"
111
+
112
+
113
def render_ctx_summary(ctx: Dict[str, Any]) -> str:
    """Render the CTX genetox result as Markdown.

    Args:
        ctx: CTX lookup result. Fields read here: ``ok``, ``error``,
            ``dashboard_search`` (failure path only), ``dtxsid``,
            ``dashboard_url`` and ``summary``.

    Returns:
        On failure, the error text (plus a dashboard search link when one is
        present). On success, the DTXSID, a dashboard link and a fenced JSON
        excerpt of the summary, each included only when available.
    """
    # Check the type first: calling .get on a truthy non-dict would raise.
    if not isinstance(ctx, dict) or not ctx.get("ok"):
        failed = ctx if isinstance(ctx, dict) else {}
        # A missing error must not surface as the literal text "None".
        err = failed.get("error") or "Unknown CTX error"
        search_url = failed.get("dashboard_search")
        if search_url:
            return f"{err}\n\n[Open CompTox Dashboard search]({search_url})"
        return str(err)

    lines = []
    dtxsid = ctx.get("dtxsid")
    if dtxsid:
        lines.append(f"**DTXSID:** `{dtxsid}`")
    dash = ctx.get("dashboard_url")
    if dash:
        lines.append(f"[Open CompTox Dashboard]({dash})")

    # Surface key fields (if present) without dumping huge JSON.
    summary = ctx.get("summary")
    if isinstance(summary, dict):
        # Tokens are matched against the lowercased key, so they must be lowercase.
        interesting_tokens = (
            "genetox",
            "overall",
            "summary",
            "conclusion",
            "call",
            "result",
            "assessment",
        )
        picked = {
            key: value
            for key, value in summary.items()
            # str() keeps non-string keys from crashing the match.
            if any(tok in str(key).lower() for tok in interesting_tokens)
        }
        if not picked:
            # Fallback: the first few keys, whatever they are.
            picked = dict(list(summary.items())[:8])

        txt = json_pretty(picked)
        # Keep it readable in the UI.
        if len(txt) > 6000:
            txt = txt[:6000] + "\n... (truncated)"
        lines += ["", "```json", txt, "```"]

    return "\n".join(lines)
163
 
164
 
165
def render_ntp_summary(ntp_res: Dict[str, Any]) -> str:
    """Render NTP Technical Report hits as a Markdown bullet list.

    Args:
        ntp_res: NTP search result. Fields read here: ``ok``, ``error`` and
            ``items``; each item may carry ``tr``/``num``, ``title`` and
            ``report_page``/``url``.

    Returns:
        One bullet per report, a "no reports" message when nothing usable was
        found, or an "unavailable" message when the result is missing,
        malformed, or not flagged ``ok``.
    """
    # Check the type first: calling .get on a truthy non-dict would raise.
    if not isinstance(ntp_res, dict) or not ntp_res.get("ok"):
        err = ntp_res.get("error") if isinstance(ntp_res, dict) else None
        # A missing error must not be rendered as the literal text "None".
        return f"NTP Technical Reports unavailable: {err or 'Unknown NTP error'}"

    no_reports = "No NTP Technical Reports found for this CAS."  # CAS-filtered
    items = ntp_res.get("items") or []
    if not items:
        return no_reports

    lines = []
    for item in items:
        if not isinstance(item, dict):
            continue
        num = str(item.get("tr") or item.get("num") or "").strip()
        title = item.get("title") or "Report"
        url = item.get("report_page") or item.get("url") or ""

        if not num:
            label = ""  # avoid a dangling "TR-" when the number is unknown
        elif num.upper().startswith("TR"):
            # NOTE(review): "tr" may already include the prefix (e.g. "TR-595");
            # confirm against the ntp source module. Either way, never double it.
            label = num
        else:
            label = f"TR-{num}"

        prefix = f"**{label}** " if label else ""
        body = f"[{title}]({url})" if url else title
        lines.append(f"- {prefix}{body}")

    return "\n".join(lines) if lines else no_reports
184
 
185
 
186
def render_iarc_block(iarc_res: Dict[str, Any]) -> str:
    """Render the IARC Monographs result as Markdown.

    A direct search ``url`` takes precedence. Otherwise each mapping in
    ``results`` becomes a bullet (title, optional link, optional year). Any
    missing or failed result yields the "unavailable" message.
    """
    unavailable = "IARC link unavailable."
    if not iarc_res or not iarc_res.get("ok"):
        return unavailable

    search_url = iarc_res.get("url")
    if search_url:
        return f"[Search IARC Monographs (NCBI Bookshelf)]({search_url})"

    results = iarc_res.get("results") if isinstance(iarc_res, dict) else None
    if not (isinstance(results, list) and results):
        return unavailable

    bullets = []
    for entry in results:
        if not isinstance(entry, dict):
            continue  # ignore malformed entries
        title = entry.get("title") or "IARC Monographs"
        link = entry.get("url")
        year = entry.get("year")
        label = f"[{title}]({link})" if link else f"{title}"
        year_suffix = f" ({year})" if year else ""
        bullets.append(f"- {label}{year_suffix}")

    if not bullets:
        return unavailable
    return "\n".join(bullets)
210
+
211
+
212
def render_scholar_block(sch_res: Dict[str, Any]) -> str:
    """Render the Google Scholar search link, or an "unavailable" note."""
    if sch_res and sch_res.get("ok"):
        url = sch_res.get("url")
        if url:
            return f"[Open Google Scholar search]({url})"
    # Missing result, failed result, or no URL all end up here.
    return "Google Scholar link unavailable."
217
+
218
+
219
def render_fema_block(fema_res: Dict[str, Any]) -> str:
    """Render the FEMA search link as Markdown.

    Args:
        fema_res: FEMA link result. Fields read here: ``ok``, ``error``, ``url``.

    Returns:
        A Markdown link when the result is ``ok`` and has a URL. Otherwise the
        reported error, or "FEMA link unavailable." when no error is given.
    """
    fallback = "FEMA link unavailable."
    # Check the type first: calling .get on a truthy non-dict would raise.
    if not isinstance(fema_res, dict) or not fema_res.get("ok"):
        err = fema_res.get("error") if isinstance(fema_res, dict) else None
        # A failed result without an error used to render the literal "None".
        return str(err) if err else fallback

    url = fema_res.get("url")
    if not url:
        return fallback
    return f"[Open FEMA / Fragrance Materials Safety Resource search]({url})"
225
 
226
 
227
def render_cdc_block(cdc_res: Any) -> str:
    """Render CDC ToxProfile matches as Markdown.

    Accepted shapes:
      * a result envelope ``{"ok": ..., "matches": [...], "error": ...}``,
      * a single profile mapping with ``name`` / ``url``,
      * a list of profile mappings,
      * anything else truthy, which is shown via ``str()``.

    Returns:
        A Markdown link or bullet list, the reported error for a failed
        envelope, or "No CDC ToxProfiles match." when nothing matched.
    """
    no_match = "No CDC ToxProfiles match."
    default_name = "CDC ToxProfile"
    if not cdc_res:
        return no_match

    if isinstance(cdc_res, dict):
        if cdc_res.get("ok") is False:
            # A failed lookup is not a profile; show its error instead.
            return str(cdc_res.get("error") or "CDC ToxProfiles unavailable.")
        if "matches" not in cdc_res:
            # Single-profile mapping.
            url = cdc_res.get("url")
            name = cdc_res.get("name") or default_name
            return f"[{name}]({url})" if url else name
        # Envelope: unwrap and render the matches like a plain list.
        cdc_res = cdc_res.get("matches") or []

    if isinstance(cdc_res, list):
        lines = []
        for entry in cdc_res:
            if not isinstance(entry, dict):
                continue  # ignore malformed entries
            name = entry.get("name") or default_name
            url = entry.get("url")
            lines.append(f"- [{name}]({url})" if url else f"- {name}")
        return "\n".join(lines) if lines else no_match

    return str(cdc_res)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
 
247
  # -----------------------------
248
+ # Search + AI
249
  # -----------------------------
250
+
251
+ async def run_search(query: str) -> Dict[str, Any]:
252
  q = (query or "").strip()
253
  if not q:
254
  raise gr.Error("Enter a CAS number (preferred) or chemical name.")
 
258
  return SEARCH_CACHE[cache_key]
259
 
260
  async with client() as http:
261
+ # PubChem accepts names and CAS. We also use it to resolve CAS via synonyms.
262
  pub = await pubchem.pubchem_by_query(q, http)
263
 
264
  cas = q
265
+ if not pubchem.is_cas(cas):
266
  cas = pub.get("resolved_cas") or q
267
 
268
+ # CTX is CAS-first (but we allow name too; resolver will try both)
269
+ ctx_task = ctx_src.fetch_ctx_genetox(cas, http) if cas else asyncio.sleep(0, result={"ok": False})
 
 
 
270
  ntp_task = ntp.search_technical_reports(cas, http, limit=8)
271
 
272
  ctx_res, ntp_res = await asyncio.gather(ctx_task, ntp_task)
273
 
274
+ out: Dict[str, Any] = {
 
 
 
275
  "query": q,
276
  "cas_used": cas,
277
  "pubchem": pub,
278
  "ctx_genetox": ctx_res,
279
  "ntp_technical_reports": ntp_res,
 
280
  "iarc_monographs": iarc.bookshelf_link(cas),
281
  "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
282
+ "fema": fema.fema_link(cas if pubchem.is_cas(cas) else q),
283
  }
284
 
285
+ # CDC toxprofiles (if module exists)
286
+ if cdc is not None:
287
+ try:
288
+ # Try a few common function names (depending on how you implemented cdc.py)
289
+ if hasattr(cdc, "lookup"):
290
+ out["cdc_toxprofiles"] = cdc.lookup(cas)
291
+ elif hasattr(cdc, "search"):
292
+ out["cdc_toxprofiles"] = cdc.search(cas)
293
+ elif hasattr(cdc, "toxprofile_for"):
294
+ out["cdc_toxprofiles"] = cdc.toxprofile_for(cas)
295
+ else:
296
+ out["cdc_toxprofiles"] = None
297
+ except Exception:
298
+ out["cdc_toxprofiles"] = None
299
+
300
  SEARCH_CACHE[cache_key] = out
301
  return out
302
 
303
 
304
def _prune_for_prompt(obj: Any, max_chars: int) -> str:
    """Serialize *obj* with ``json_pretty`` and cap the text at *max_chars*.

    Text longer than the cap is cut at *max_chars* characters and a
    "... (truncated)" marker line is appended.
    """
    serialized = json_pretty(obj)
    if len(serialized) > max_chars:
        return serialized[:max_chars] + "\n... (truncated)"
    return serialized
309
+
310
+
311
def build_prompt(data: Dict[str, Any]) -> str:
    """Build a prompt that will not exceed model context.

    Key change vs earlier version: DO NOT dump full raw JSON from all sources.
    Only selected fields from each source are included, and the serialized
    evidence is capped at 12000 characters.

    Args:
        data: Search result holding ``query``, ``cas_used``, ``pubchem``,
            ``ctx_genetox``, ``ntp_technical_reports`` and, optionally,
            ``cdc_toxprofiles``.

    Returns:
        The instruction text followed by the evidence JSON.
    """

    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a mapping, else an empty dict."""
        return value if isinstance(value, dict) else {}

    # Normalize every source once so a missing or malformed payload cannot
    # raise while the prompt is being assembled.
    pub = _as_dict(data.get("pubchem"))
    props = _as_dict(pub.get("props"))
    ctx = _as_dict(data.get("ctx_genetox"))
    ntp_res = _as_dict(data.get("ntp_technical_reports"))

    hazards = pub.get("hazards")
    # Only a list can be sliced safely; anything else contributes no hazards.
    hazards = hazards[:10] if isinstance(hazards, list) else []

    prompt_obj = {
        "query": data.get("query"),
        "cas_used": data.get("cas_used"),
        "pubchem": {
            "cid": pub.get("cid"),
            "resolved_cas": pub.get("resolved_cas"),
            "iupac": props.get("IUPACName") or props.get("iupac_name"),
            "formula": props.get("MolecularFormula"),
            "molecular_weight": props.get("MolecularWeight"),
            "canonical_smiles": props.get("CanonicalSMILES"),
            "hazards": hazards,
        },
        "ctx_genetox": {
            "ok": ctx.get("ok"),
            "dtxsid": ctx.get("dtxsid"),
            "summary": ctx.get("summary"),
        },
        "ntp_technical_reports": ntp_res.get("items", []),
        "cdc_toxprofiles": data.get("cdc_toxprofiles"),
    }

    body = _prune_for_prompt(prompt_obj, max_chars=12000)

    return (
        "You are a toxicology regulatory assistant. "
        "Using ONLY the evidence JSON below, write a concise weight-of-evidence summary focused on mutagenicity/genotoxicity. "
        "If evidence is conflicting or absent, say so explicitly. "
        "Cite which source each statement comes from (PubChem hazards, CTX genetox summary, NTP TR titles, CDC ToxProfiles).\n\n"
        "EVIDENCE_JSON:\n"
        + body
    )
352
+
353
+
354
  def do_search(query: str):
355
  data = asyncio.run(run_search(query))
356
+
357
  overview_md_text = render_overview(data)
358
  pubchem_md_text = render_pubchem_summary(data.get("pubchem", {}))
359
  ctx_md_text = render_ctx_summary(data.get("ctx_genetox", {}))
360
  ntp_md_text = render_ntp_summary(data.get("ntp_technical_reports", {}))
 
361
  iarc_md_text = render_iarc_block(data.get("iarc_monographs", {}))
362
  scholar_md_text = render_scholar_block(data.get("google_scholar", {}))
363
  fema_md_text = render_fema_block(data.get("fema", {}))
364
 
365
+ cdc_md_text = ""
366
+ if "cdc_toxprofiles" in data:
367
+ cdc_md_text = render_cdc_block(data.get("cdc_toxprofiles"))
368
+
369
+ raw_pubchem_json = json_pretty(data.get("pubchem", {}))
370
+ raw_ctx_json = json_pretty(data.get("ctx_genetox", {}))
371
+ raw_ntp_json = json_pretty(data.get("ntp_technical_reports", {}))
372
+ raw_iarc_json = json_pretty(data.get("iarc_monographs", {}))
373
+ raw_scholar_json = json_pretty(data.get("google_scholar", {}))
374
+ raw_fema_json = json_pretty(data.get("fema", {}))
375
 
376
+ # IMPORTANT: return order must match `outputs=[...]`
377
+ # If CDC accordion exists, include it right after PubChem.
378
  return (
379
+ data, # state
380
  overview_md_text,
381
  pubchem_md_text,
382
+ cdc_md_text,
383
  ctx_md_text,
384
  ntp_md_text,
 
385
  iarc_md_text,
386
  scholar_md_text,
387
  fema_md_text,
 
391
  raw_iarc_json,
392
  raw_scholar_json,
393
  raw_fema_json,
394
+ "", # ai_out (blank after search)
395
  )
396
 
397
 
398
  def generate_ai(data: dict):
399
  if not data:
400
  raise gr.Error("Run a search first.")
401
+
402
  cas = data.get("cas_used") or data.get("query") or ""
403
  cache_key = f"ai::{cas}"
404
  if cache_key in AI_CACHE:
 
408
  if not allowed:
409
  return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."
410
 
411
+ from core.sources.ai_summary import generate_ai_summary # local import avoids cold-start issues
412
+
413
  resp = generate_ai_summary(build_prompt(data))
414
  if not resp.get("ok"):
415
  return f"**AI summary unavailable:** {resp.get('error')}"
 
422
def download_report(data: dict, ai_text: str):
    """Build the downloadable report files for the current search result.

    Raises ``gr.Error`` when no search data is available. Otherwise calls
    ``build_pdf`` with the CAS label, the evidence data and the AI summary
    (``None`` when the summary text is empty), and returns the resulting
    ``(pdf_path, json_path)`` pair.
    """
    if not data:
        raise gr.Error("Run a search first.")

    # Prefer the resolved CAS, then the raw query, then a placeholder label.
    report_label = data.get("cas_used") or data.get("query") or "unknown"
    summary_text = ai_text or None
    pdf_path, json_path = build_pdf(report_label, evidence=data, ai_summary=summary_text)
    return pdf_path, json_path
429
 
430
 
431
  # -----------------------------
432
+ # UI
433
  # -----------------------------
 
 
 
434
 
435
+ with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
436
  gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
437
  gr.Markdown(
438
  f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
 
455
  with gr.Accordion("PubChem (summary)", open=False):
456
  pubchem_md = gr.Markdown()
457
 
458
+ # CDC accordion (optional)
459
  with gr.Accordion("CDC ToxProfiles", open=False):
460
  cdc_md = gr.Markdown()
461
 
 
499
  state,
500
  overview_md,
501
  pubchem_md,
502
+ cdc_md,
503
  ctx_md,
504
  ntp_md,
 
505
  iarc_md,
506
  scholar_md,
507
  fema_md,
 
522
  state,
523
  overview_md,
524
  pubchem_md,
525
+ cdc_md,
526
  ctx_md,
527
  ntp_md,
 
528
  iarc_md,
529
  scholar_md,
530
  fema_md,
 
542
  pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])
543
 
544
 
545
+ demo.queue(default_concurrency_limit=6)
546
+ app = demo
547
+
548
  if __name__ == "__main__":
549
+ demo.launch()