hchevva committed on
Commit
3d936e9
·
verified ·
1 Parent(s): c3e44ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -269
app.py CHANGED
@@ -1,235 +1,228 @@
1
- import asyncio
2
- import json
3
  import os
 
4
  import time
5
- from typing import Any, Dict, Optional
 
 
6
 
7
  import gradio as gr
8
- import httpx
9
 
10
- from core.config import settings
11
  from core.rate_limit import check_and_increment_global_ai_cap
 
12
  from core.pdf_report import build_pdf
13
- from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema
14
 
15
- # Optional: CDC module may exist in your repo (user added).
16
- try:
17
- from core.sources import cdc
18
- except Exception:
19
- cdc = None # type: ignore
 
 
 
 
 
 
 
 
 
 
20
 
21
  # -----------------------------
22
- # Caches (simple in-memory)
23
  # -----------------------------
24
  SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
25
  AI_CACHE: Dict[str, str] = {}
26
 
27
-
28
- def json_pretty(obj: Any) -> str:
 
 
29
  try:
30
- return json.dumps(obj, indent=2, ensure_ascii=False, default=str)
31
  except Exception:
32
  return str(obj)
33
 
34
 
35
- def client() -> httpx.AsyncClient:
36
- return httpx.AsyncClient(headers={"user-agent": "toxrai-hf-demo"})
 
 
 
 
37
 
38
 
39
  # -----------------------------
40
- # Rendering helpers (Markdown)
41
  # -----------------------------
 
 
 
 
42
 
43
- def render_overview(data: Dict[str, Any]) -> str:
44
- q = data.get("query") or ""
45
- cas = data.get("cas_used") or ""
46
- lines = [
47
- f"**Query:** `{q}`",
48
- f"**CAS used:** `{cas}`",
49
- ]
50
-
51
- # Add quick IDs when available
52
- pub = data.get("pubchem") or {}
53
- if pub.get("ok") and pub.get("cid"):
54
- lines.append(f"**PubChem CID:** `{pub.get('cid')}`")
55
- ctx = data.get("ctx_genetox") or {}
56
- if ctx.get("ok") and ctx.get("dtxsid"):
57
- lines.append(f"**EPA CompTox DTXSID:** `{ctx.get('dtxsid')}`")
58
 
59
- return "\n\n".join(lines)
 
 
60
 
 
 
 
 
 
 
 
61
 
62
- def render_pubchem_summary(pub: Dict[str, Any]) -> str:
63
- if not pub or not pub.get("ok"):
64
- err = pub.get("error") if isinstance(pub, dict) else "Unknown PubChem error"
65
- return f"PubChem unavailable: {err}"
 
 
 
 
 
 
66
 
67
- cid = pub.get("cid")
68
- resolved_cas = pub.get("resolved_cas") or "-"
69
- props = pub.get("props") or {}
 
 
 
70
 
71
- iupac_name = props.get("IUPACName") or props.get("iupac_name") or "-"
72
- formula = props.get("MolecularFormula") or "-"
73
- mw = props.get("MolecularWeight")
74
- mw_str = f"{mw}" if mw not in (None, "") else "-"
75
- smiles = props.get("CanonicalSMILES") or "-"
76
 
77
- lines = []
78
- lines.append(f"**CID:** `{cid}`")
79
- lines.append(f"**Resolved CAS (from synonyms):** `{resolved_cas}`")
80
- lines.append(f"**IUPAC/Title:** {iupac_name}")
81
- lines.append("")
82
- lines.append(f"**Molecular Formula:** `{formula}`")
83
- lines.append(f"**Molecular Weight:** `{mw_str}`")
84
- lines.append(f"**Canonical SMILES:** `{smiles}`")
85
-
86
- structure_png = pub.get("structure_png")
87
- if structure_png:
88
- lines.append("")
89
- lines.append("**Structure**")
90
- lines.append(f"![]({structure_png})")
91
-
92
- url = pub.get("url")
93
- if url:
94
- lines.append("")
95
- lines.append(f"[Open PubChem]({url})")
96
-
97
- hazards = pub.get("hazards") or []
98
- if hazards:
99
- lines.append("")
100
- lines.append("### Safety / Hazard Information")
101
- # Render as paragraphs (avoids weird wrapping from bullet nesting)
102
- for h in hazards:
103
- name = (h or {}).get("name") or "Hazard"
104
- text = (h or {}).get("text") or ""
105
- if not text:
106
- continue
107
- lines.append(f"**{name}:** {text}")
108
- lines.append("")
109
-
110
- return "\n".join(lines).rstrip() + "\n"
111
-
112
-
113
- def render_ctx_summary(ctx: Dict[str, Any]) -> str:
114
- if not ctx or not ctx.get("ok"):
115
- search_url = ctx.get("dashboard_search") if isinstance(ctx, dict) else None
116
- err = ctx.get("error") if isinstance(ctx, dict) else "Unknown CTX error"
117
- if search_url:
118
- return f"{err}\n\n[Open CompTox Dashboard search]({search_url})"
119
- return str(err)
120
 
121
- dtxsid = ctx.get("dtxsid")
122
- dash = ctx.get("dashboard_url")
123
- summary = ctx.get("summary")
124
 
 
 
 
 
 
125
  lines = []
126
  if dtxsid:
127
  lines.append(f"**DTXSID:** `{dtxsid}`")
 
 
 
 
 
 
 
 
128
  if dash:
129
- lines.append(f"[Open CompTox Dashboard]({dash})")
130
-
131
- # Try to surface key fields (if present) without dumping huge JSON
132
- if isinstance(summary, dict):
133
- interesting_keys = [
134
- "geneTox",
135
- "genetox",
136
- "overall",
137
- "summary",
138
- "conclusion",
139
- "call",
140
- "result",
141
- "assessment",
142
- ]
143
- picked = {}
144
- for k in summary.keys():
145
- lk = k.lower()
146
- if any(tok in lk for tok in interesting_keys):
147
- picked[k] = summary[k]
148
- if not picked:
149
- # fallback: first few keys
150
- for k in list(summary.keys())[:8]:
151
- picked[k] = summary[k]
152
-
153
- lines.append("")
154
- lines.append("```json")
155
- txt = json_pretty(picked)
156
- # Keep it readable in UI
157
- if len(txt) > 6000:
158
- txt = txt[:6000] + "\n... (truncated)"
159
- lines.append(txt)
160
- lines.append("```")
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  return "\n".join(lines)
163
 
164
 
165
- def render_ntp_summary(ntp_res: Dict[str, Any]) -> str:
166
- if not ntp_res or not ntp_res.get("ok"):
167
- err = ntp_res.get("error") if isinstance(ntp_res, dict) else "Unknown NTP error"
168
- return f"NTP Technical Reports unavailable: {err}"
169
 
170
- items = ntp_res.get("items") or []
171
- if not items:
172
- return "No NTP Technical Reports found for this CAS." # CAS-filtered
173
 
174
  lines = []
175
- for it in items:
176
- num = it.get("tr") or it.get("num") or ""
177
- title = it.get("title") or "Report"
178
- url = it.get("report_page") or it.get("url") or ""
179
  if url:
180
- lines.append(f"- **TR-{num}** [{title}]({url})")
181
  else:
182
- lines.append(f"- **TR-{num}** {title}")
183
  return "\n".join(lines)
184
 
185
 
186
- def render_iarc_block(iarc_res: Dict[str, Any]) -> str:
187
- if not iarc_res or not iarc_res.get("ok"):
188
  return "IARC link unavailable."
189
- url = iarc_res.get("url")
190
- return f"[Search IARC Monographs (NCBI Bookshelf)]({url})" if url else "IARC link unavailable."
 
 
191
 
192
 
193
- def render_scholar_block(sch_res: Dict[str, Any]) -> str:
194
- if not sch_res or not sch_res.get("ok"):
195
- return "Google Scholar link unavailable."
196
- url = sch_res.get("url")
197
- return f"[Open Google Scholar search]({url})" if url else "Google Scholar link unavailable."
198
 
199
 
200
- def render_fema_block(fema_res: Dict[str, Any]) -> str:
201
- if not fema_res or not fema_res.get("ok"):
202
- err = fema_res.get("error") if isinstance(fema_res, dict) else "FEMA link unavailable."
203
- return str(err)
204
- url = fema_res.get("url")
205
  return f"[Open FEMA / Fragrance Materials Safety Resource search]({url})" if url else "FEMA link unavailable."
206
 
207
 
208
- def render_cdc_block(cdc_res: Any) -> str:
209
- if not cdc_res:
210
- return "No CDC ToxProfiles match."
211
- # Accept either dict or list
212
- if isinstance(cdc_res, dict):
213
- url = cdc_res.get("url")
214
- name = cdc_res.get("name") or "CDC ToxProfile"
215
- return f"[{name}]({url})" if url else name
216
- if isinstance(cdc_res, list):
217
- lines = []
218
- for it in cdc_res:
219
- if not isinstance(it, dict):
220
- continue
221
- name = it.get("name") or "CDC ToxProfile"
222
- url = it.get("url")
223
- lines.append(f"- [{name}]({url})" if url else f"- {name}")
224
- return "\n".join(lines) if lines else "No CDC ToxProfiles match."
225
- return str(cdc_res)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
 
228
  # -----------------------------
229
- # Search + AI
230
  # -----------------------------
231
-
232
- async def run_search(query: str) -> Dict[str, Any]:
233
  q = (query or "").strip()
234
  if not q:
235
  raise gr.Error("Enter a CAS number (preferred) or chemical name.")
@@ -239,130 +232,65 @@ async def run_search(query: str) -> Dict[str, Any]:
239
  return SEARCH_CACHE[cache_key]
240
 
241
  async with client() as http:
242
- # PubChem accepts names and CAS. We also use it to resolve CAS via synonyms.
243
  pub = await pubchem.pubchem_by_query(q, http)
244
 
245
  cas = q
246
- if not pubchem.is_cas(cas):
247
  cas = pub.get("resolved_cas") or q
248
 
249
- # CTX is CAS-first (but we allow name too; resolver will try both)
250
- ctx_task = ctx_src.fetch_ctx_genetox(cas, http) if cas else asyncio.sleep(0, result={"ok": False})
 
 
 
251
  ntp_task = ntp.search_technical_reports(cas, http, limit=8)
252
 
253
  ctx_res, ntp_res = await asyncio.gather(ctx_task, ntp_task)
254
 
255
- out: Dict[str, Any] = {
 
 
 
256
  "query": q,
257
  "cas_used": cas,
258
  "pubchem": pub,
259
  "ctx_genetox": ctx_res,
260
  "ntp_technical_reports": ntp_res,
 
261
  "iarc_monographs": iarc.bookshelf_link(cas),
262
  "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
263
- "fema": fema.fema_link(cas if pubchem.is_cas(cas) else q),
264
  }
265
 
266
- # CDC toxprofiles (if module exists)
267
- if cdc is not None:
268
- try:
269
- # Try a few common function names (depending on how you implemented cdc.py)
270
- if hasattr(cdc, "lookup"):
271
- out["cdc_toxprofiles"] = cdc.lookup(cas)
272
- elif hasattr(cdc, "search"):
273
- out["cdc_toxprofiles"] = cdc.search(cas)
274
- elif hasattr(cdc, "toxprofile_for"):
275
- out["cdc_toxprofiles"] = cdc.toxprofile_for(cas)
276
- else:
277
- out["cdc_toxprofiles"] = None
278
- except Exception:
279
- out["cdc_toxprofiles"] = None
280
-
281
  SEARCH_CACHE[cache_key] = out
282
  return out
283
 
284
 
285
- def _prune_for_prompt(obj: Any, max_chars: int) -> str:
286
- txt = json_pretty(obj)
287
- if len(txt) <= max_chars:
288
- return txt
289
- return txt[:max_chars] + "\n... (truncated)"
290
-
291
-
292
- def build_prompt(data: Dict[str, Any]) -> str:
293
- """Build a prompt that will not exceed model context.
294
-
295
- Key change vs earlier version: DO NOT dump full raw JSON from all sources.
296
- """
297
-
298
- pub = data.get("pubchem") or {}
299
- props = (pub.get("props") or {}) if isinstance(pub, dict) else {}
300
- hazards = (pub.get("hazards") or []) if isinstance(pub, dict) else []
301
-
302
- prompt_obj = {
303
- "query": data.get("query"),
304
- "cas_used": data.get("cas_used"),
305
- "pubchem": {
306
- "cid": pub.get("cid"),
307
- "resolved_cas": pub.get("resolved_cas"),
308
- "iupac": props.get("IUPACName") or props.get("iupac_name"),
309
- "formula": props.get("MolecularFormula"),
310
- "molecular_weight": props.get("MolecularWeight"),
311
- "canonical_smiles": props.get("CanonicalSMILES"),
312
- "hazards": hazards[:10],
313
- },
314
- "ctx_genetox": {
315
- "ok": (data.get("ctx_genetox") or {}).get("ok"),
316
- "dtxsid": (data.get("ctx_genetox") or {}).get("dtxsid"),
317
- "summary": (data.get("ctx_genetox") or {}).get("summary"),
318
- },
319
- "ntp_technical_reports": (data.get("ntp_technical_reports") or {}).get("items", []),
320
- "cdc_toxprofiles": data.get("cdc_toxprofiles"),
321
- }
322
-
323
- body = _prune_for_prompt(prompt_obj, max_chars=12000)
324
-
325
- return (
326
- "You are a toxicology regulatory assistant. "
327
- "Using ONLY the evidence JSON below, write a concise weight-of-evidence summary focused on mutagenicity/genotoxicity. "
328
- "If evidence is conflicting or absent, say so explicitly. "
329
- "Cite which source each statement comes from (PubChem hazards, CTX genetox summary, NTP TR titles, CDC ToxProfiles).\n\n"
330
- "EVIDENCE_JSON:\n"
331
- + body
332
- )
333
-
334
-
335
  def do_search(query: str):
336
  data = asyncio.run(run_search(query))
337
-
338
  overview_md_text = render_overview(data)
339
  pubchem_md_text = render_pubchem_summary(data.get("pubchem", {}))
340
  ctx_md_text = render_ctx_summary(data.get("ctx_genetox", {}))
341
  ntp_md_text = render_ntp_summary(data.get("ntp_technical_reports", {}))
 
342
  iarc_md_text = render_iarc_block(data.get("iarc_monographs", {}))
343
  scholar_md_text = render_scholar_block(data.get("google_scholar", {}))
344
  fema_md_text = render_fema_block(data.get("fema", {}))
345
 
346
- cdc_md_text = ""
347
- if "cdc_toxprofiles" in data:
348
- cdc_md_text = render_cdc_block(data.get("cdc_toxprofiles"))
349
-
350
- raw_pubchem_json = json_pretty(data.get("pubchem", {}))
351
- raw_ctx_json = json_pretty(data.get("ctx_genetox", {}))
352
- raw_ntp_json = json_pretty(data.get("ntp_technical_reports", {}))
353
- raw_iarc_json = json_pretty(data.get("iarc_monographs", {}))
354
- raw_scholar_json = json_pretty(data.get("google_scholar", {}))
355
- raw_fema_json = json_pretty(data.get("fema", {}))
356
 
357
- # IMPORTANT: return order must match `outputs=[...]`
358
- # If CDC accordion exists, include it right after PubChem.
359
  return (
360
- data, # state
361
  overview_md_text,
362
  pubchem_md_text,
363
- cdc_md_text,
364
  ctx_md_text,
365
  ntp_md_text,
 
366
  iarc_md_text,
367
  scholar_md_text,
368
  fema_md_text,
@@ -372,14 +300,13 @@ def do_search(query: str):
372
  raw_iarc_json,
373
  raw_scholar_json,
374
  raw_fema_json,
375
- "", # ai_out (blank after search)
376
  )
377
 
378
 
379
  def generate_ai(data: dict):
380
  if not data:
381
  raise gr.Error("Run a search first.")
382
-
383
  cas = data.get("cas_used") or data.get("query") or ""
384
  cache_key = f"ai::{cas}"
385
  if cache_key in AI_CACHE:
@@ -389,8 +316,6 @@ def generate_ai(data: dict):
389
  if not allowed:
390
  return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."
391
 
392
- from core.sources.ai_summary import generate_ai_summary # local import avoids cold-start issues
393
-
394
  resp = generate_ai_summary(build_prompt(data))
395
  if not resp.get("ok"):
396
  return f"**AI summary unavailable:** {resp.get('error')}"
@@ -403,17 +328,19 @@ def generate_ai(data: dict):
403
  def download_report(data: dict, ai_text: str):
404
  if not data:
405
  raise gr.Error("Run a search first.")
406
-
407
  cas = data.get("cas_used") or data.get("query") or "unknown"
408
  pdf_path, json_path = build_pdf(cas, evidence=data, ai_summary=ai_text if ai_text else None)
409
  return pdf_path, json_path
410
 
411
 
412
  # -----------------------------
413
- # UI
414
  # -----------------------------
 
 
 
415
 
416
- with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
417
  gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
418
  gr.Markdown(
419
  f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
@@ -436,7 +363,6 @@ with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
436
  with gr.Accordion("PubChem (summary)", open=False):
437
  pubchem_md = gr.Markdown()
438
 
439
- # CDC accordion (optional)
440
  with gr.Accordion("CDC ToxProfiles", open=False):
441
  cdc_md = gr.Markdown()
442
 
@@ -480,9 +406,9 @@ with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
480
  state,
481
  overview_md,
482
  pubchem_md,
483
- cdc_md,
484
  ctx_md,
485
  ntp_md,
 
486
  iarc_md,
487
  scholar_md,
488
  fema_md,
@@ -503,9 +429,9 @@ with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
503
  state,
504
  overview_md,
505
  pubchem_md,
506
- cdc_md,
507
  ctx_md,
508
  ntp_md,
 
509
  iarc_md,
510
  scholar_md,
511
  fema_md,
@@ -523,4 +449,5 @@ with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
523
  pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])
524
 
525
 
526
- demo.queue(default_concurrency_limit=6).launch()
 
 
 
 
1
  import os
2
+ import json
3
  import time
4
+ import asyncio
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, Tuple
7
 
8
  import gradio as gr
 
9
 
10
+ from core.http import client
11
  from core.rate_limit import check_and_increment_global_ai_cap
12
+ from core.validate import is_cas
13
  from core.pdf_report import build_pdf
 
14
 
15
+ from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema, cdc
16
+ from core.sources.ai_summary import generate_ai_summary
17
+
18
+
19
+ # -----------------------------
20
+ # Settings
21
+ # -----------------------------
22
+ @dataclass
23
+ class Settings:
24
+ openai_model: str = os.getenv("OPENAI_MODEL", "gpt-4o")
25
+ max_ai_summaries_per_day: int = int(os.getenv("MAX_AI_SUMMARIES_PER_DAY", "100"))
26
+ cache_ttl_seconds: int = int(os.getenv("CACHE_TTL_SECONDS", "86400"))
27
+
28
+
29
+ settings = Settings()
30
 
31
  # -----------------------------
32
+ # Simple in-memory caches
33
  # -----------------------------
34
  SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
35
  AI_CACHE: Dict[str, str] = {}
36
 
37
+ # -----------------------------
38
+ # Utilities
39
+ # -----------------------------
40
+ def _pretty(obj: Any) -> str:
41
  try:
42
+ return json.dumps(obj, indent=2, ensure_ascii=False)
43
  except Exception:
44
  return str(obj)
45
 
46
 
47
+ def _truncate_text(s: str, max_chars: int) -> str:
48
+ if not s:
49
+ return ""
50
+ if len(s) <= max_chars:
51
+ return s
52
+ return s[:max_chars] + "\n\n[TRUNCATED]\n"
53
 
54
 
55
  # -----------------------------
56
+ # Renderers (Markdown blocks)
57
  # -----------------------------
58
def render_overview(data: dict) -> str:
    """Render the overview Markdown: the query and the CAS value used.

    Reads the ``query`` and ``cas_used`` keys of *data* (empty string when
    a key is absent) and shows each as inline code in its own paragraph.
    """
    query_text = data.get("query", "")
    cas_text = data.get("cas_used", "")
    paragraphs = (
        f"**Query:** `{query_text}`",
        f"**CAS used:** `{cas_text}`",
    )
    return "\n\n".join(paragraphs)
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
def render_pubchem_summary(pub: dict) -> str:
    """Render the PubChem result dict as a Markdown summary.

    Args:
        pub: PubChem lookup result. Keys read here: ``ok``, ``error``,
            ``cid``, ``resolved_cas``, ``iupac_name``/``title``,
            ``molecular_formula``, ``molecular_weight``,
            ``canonical_smiles``, ``structure_png`` and ``url``.

    Returns:
        Markdown with one paragraph per field, or a
        ``PubChem unavailable: ...`` message when *pub* is not a dict or
        its ``ok`` flag is falsy.
    """
    if not isinstance(pub, dict) or not pub.get("ok"):
        reason = pub.get("error") if isinstance(pub, dict) else None
        return f"PubChem unavailable: {reason or 'unknown'}"

    cid = pub.get("cid", "")
    resolved_cas = pub.get("resolved_cas", "")
    iupac = pub.get("iupac_name") or pub.get("title") or "-"
    mf = pub.get("molecular_formula") or "-"
    mw = pub.get("molecular_weight") or "-"
    smiles = pub.get("canonical_smiles") or "-"
    struct_url = pub.get("structure_png") or ""
    pc_url = pub.get("url") or ""

    paragraphs = [f"**CID:** `{cid}`"]
    if resolved_cas:
        # Only shown when a CAS was actually resolved.
        paragraphs.append(f"**Resolved CAS (from synonyms):** `{resolved_cas}`")
    paragraphs += [
        f"**IUPAC/Title:** {iupac}",
        f"**Molecular Formula:** `{mf}`",
        f"**Molecular Weight:** `{mw}`",
        f"**Canonical SMILES:** `{smiles}`",
    ]
    if struct_url:
        paragraphs += ["**Structure**", f"![structure]({struct_url})"]
    if pc_url:
        paragraphs.append(f"[Open PubChem]({pc_url})")

    # Blank lines between entries: Markdown merges consecutive lines into a
    # single paragraph, which would run all fields together.
    return "\n\n".join(paragraphs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
 
 
 
100
 
101
def render_ctx_summary(ctx: dict) -> str:
    """Render the CTX result dict as a compact Markdown block.

    Args:
        ctx: CTX lookup result. Keys read here: ``ok``, ``error``,
            ``dtxsid``, ``ghs_hazard_statements``, ``echa_cl_summary`` and
            ``dashboard_search_url``.

    Returns:
        Markdown listing whichever of those fields are present; the
        result's ``error`` (or ``"CTX unavailable."``) when *ctx* is not a
        dict or its ``ok`` flag is falsy; a "No DTXSID found" note when the
        result is ok but none of the rendered fields is set.
    """
    if not isinstance(ctx, dict) or not ctx.get("ok"):
        # Non-dict input (e.g. None) carries no error text to show.
        error = ctx.get("error") if isinstance(ctx, dict) else None
        return error or "CTX unavailable."

    lines = []
    dtxsid = ctx.get("dtxsid") or ""
    if dtxsid:
        lines.append(f"**DTXSID:** `{dtxsid}`")

    hazard = ctx.get("ghs_hazard_statements") or ""
    if hazard:
        # The leading newline yields a blank line before the heading once
        # the entries are joined below.
        lines.append(f"\n**GHS Hazard Statements:**\n\n{hazard}")

    echa = ctx.get("echa_cl_summary") or ""
    if echa:
        lines.append(f"\n**ECHA C&L Summary:**\n\n{echa}")

    dash = ctx.get("dashboard_search_url") or ""
    if dash:
        lines.append(f"\n[Open CompTox Dashboard search]({dash})")

    return "\n".join(lines) if lines else "No DTXSID found for this query."
120
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
def render_ntp_summary(ntp_obj: dict) -> str:
    """Render NTP Technical Report hits as a Markdown bullet list.

    Args:
        ntp_obj: NTP search result. Keys read here: ``ok``, ``error`` and
            ``hits``; each hit is read for ``tr``/``title``, ``url`` and
            ``pdf``.

    Returns:
        One bullet per hit with its report-page and PDF links when
        present; the result's ``error`` (or ``"NTP unavailable."``) when
        *ntp_obj* is not a dict or its ``ok`` flag is falsy; a "no reports
        found" message when there is nothing to list.
    """
    if not isinstance(ntp_obj, dict) or not ntp_obj.get("ok"):
        error = ntp_obj.get("error") if isinstance(ntp_obj, dict) else None
        return error or "NTP unavailable."

    not_found = "No NTP Technical Reports found for this CAS."
    hits = ntp_obj.get("hits") or []
    if not hits:
        return not_found

    lines = []
    for hit in hits:
        if not isinstance(hit, dict):
            continue  # ignore malformed entries rather than crash the render
        label = hit.get("tr") or hit.get("title") or "NTP Technical Report"
        url = hit.get("url") or ""
        pdf = hit.get("pdf") or ""
        links = []
        if url:
            links.append(f"[Report page]({url})")
        if pdf:
            # Always show the PDF link when available.
            links.append(f"[PDF]({pdf})")
        suffix = " — " + " • ".join(links) if links else ""
        lines.append(f"- **{label}**{suffix}")

    return "\n".join(lines) if lines else not_found
139
 
140
 
141
def render_cdc_summary(cdc_obj: dict) -> str:
    """Render CDC ToxProfile matches as a Markdown bullet list.

    Args:
        cdc_obj: CDC search result. Keys read here: ``ok``, ``error`` and
            ``matches``; each match is read for ``name``, ``cas`` and
            ``url``.

    Returns:
        One bullet per match; the result's ``error`` (or a generic
        "unavailable" message) when *cdc_obj* is not a dict or its ``ok``
        flag is falsy; a "no toxprofile" message when nothing matched.
    """
    if not isinstance(cdc_obj, dict) or not cdc_obj.get("ok"):
        error = cdc_obj.get("error") if isinstance(cdc_obj, dict) else None
        return error or "CDC toxprofiles unavailable."

    no_match = "No toxprofile is available for the chemical."
    matches = cdc_obj.get("matches") or []
    if not matches:
        return no_match

    lines = []
    for match in matches:
        if not isinstance(match, dict):
            continue  # ignore malformed entries rather than crash the render
        name = match.get("name") or "ToxProfile"
        cas = match.get("cas") or ""
        url = match.get("url") or ""
        line = f"- **{name}**"
        if cas:
            # Omit the parenthetical entirely instead of printing "(CAS: )".
            line += f" (CAS: {cas})"
        if url:
            line += f" — [CDC ToxProfile]({url})"
        lines.append(line)

    return "\n".join(lines) if lines else no_match
159
 
160
 
161
def render_iarc_block(obj: dict) -> str:
    """Render the IARC Monographs search link as Markdown.

    Returns the link when *obj* is truthy, has a truthy ``ok`` flag and a
    non-empty ``url``; otherwise an "unavailable" message.
    """
    unavailable = "IARC link unavailable."
    # The URL is only consulted once the ok flag has been confirmed.
    link = (obj.get("url") or "") if obj and obj.get("ok") else ""
    return f"[Open IARC Monographs search]({link})" if link else unavailable
168
 
169
 
170
def render_scholar_block(obj: dict) -> str:
    """Render the Google Scholar search link as Markdown.

    Returns the link when *obj* is truthy, has a truthy ``ok`` flag and a
    non-empty ``url``; otherwise an "unavailable" message.
    """
    if obj and obj.get("ok"):
        target = obj.get("url") or ""
        if target:
            return f"[Open Google Scholar search]({target})"
    return "Scholar link unavailable."
175
 
176
 
177
def render_fema_block(obj: dict) -> str:
    """Render the FEMA / fragrance-materials search link as Markdown.

    Returns the link when *obj* is truthy, has a truthy ``ok`` flag and a
    non-empty ``url``; otherwise an "unavailable" message.
    """
    fallback = "FEMA link unavailable."
    if not (obj and obj.get("ok")):
        return fallback
    href = obj.get("url") or ""
    if not href:
        return fallback
    return f"[Open FEMA / Fragrance Materials Safety Resource search]({href})"
182
 
183
 
184
+ # -----------------------------
185
+ # Prompt builder (keep small)
186
+ # -----------------------------
187
def build_prompt(data: dict) -> str:
    """Build the AI-summary prompt from a search result dict.

    The prompt has three parts: a header naming the chemical, an evidence
    section (selected PubChem and CTX fields, NTP hits, CDC matches), and
    the closing task instructions. Only the evidence section is subject to
    truncation, so an oversized payload can never cut off the instructions
    that follow it.

    Args:
        data: Search result. Keys read here: ``cas_used``, ``query``,
            ``pubchem``, ``ctx_genetox``, ``ntp_technical_reports`` and
            ``cdc_toxprofiles``.

    Returns:
        The prompt text. It is held to roughly 16000 characters; when the
        evidence is cut, the truncation marker is added on top of that.
    """
    max_chars = 16000
    cas = data.get("cas_used") or data.get("query") or "unknown"

    def _as_dict(value: Any) -> dict:
        # A section stored as None (or any non-dict) is treated as empty.
        return value if isinstance(value, dict) else {}

    pub = _as_dict(data.get("pubchem"))
    ctx = _as_dict(data.get("ctx_genetox"))
    ntp_obj = data.get("ntp_technical_reports", {})
    cdc_obj = data.get("cdc_toxprofiles", {})

    pub_keys = [
        "cid",
        "resolved_cas",
        "iupac_name",
        "title",
        "molecular_formula",
        "molecular_weight",
        "canonical_smiles",
        "url",
    ]
    ctx_keys = ["dtxsid", "ghs_hazard_statements", "echa_cl_summary", "genetox_records"]

    pub_json = _pretty({key: pub.get(key) for key in pub_keys})
    ctx_json = _pretty({key: ctx.get(key) for key in ctx_keys})
    # Non-dict NTP/CDC payloads are passed through as-is.
    ntp_json = _pretty(ntp_obj.get("hits") if isinstance(ntp_obj, dict) else ntp_obj)
    cdc_json = _pretty(cdc_obj.get("matches") if isinstance(cdc_obj, dict) else cdc_obj)

    header = (
        "You are a toxicology assistant. Summarize weight-of-evidence for mutagenicity/genotoxicity.\n"
        "\n"
        f"Chemical CAS: {cas}\n"
        "\n"
    )
    evidence = (
        "PUBCHEM (selected fields):\n"
        f"{pub_json}\n"
        "\n"
        "CTX (selected blocks only):\n"
        f"{ctx_json}\n"
        "\n"
        "NTP Technical Reports (hits):\n"
        f"{ntp_json}\n"
        "\n"
        "CDC ToxProfiles (matches):\n"
        f"{cdc_json}\n"
    )
    instructions = (
        "\n"
        "Write a concise, structured summary:\n"
        "- Identity & key links\n"
        "- Genetox signals (Ames, micronucleus, chromosomal aberrations, etc.)\n"
        "- Any conflicts/inconsistencies\n"
        "- Overall conclusion (low/medium/high concern)\n"
        "- What data is missing\n"
    )

    # Spend the size budget on the evidence only; header and instructions
    # are always kept whole.
    evidence_budget = max(0, max_chars - len(header) - len(instructions))
    return header + _truncate_text(evidence, evidence_budget) + instructions
220
 
221
 
222
  # -----------------------------
223
+ # Search pipeline
224
  # -----------------------------
225
+ async def run_search(query: str) -> dict:
 
226
  q = (query or "").strip()
227
  if not q:
228
  raise gr.Error("Enter a CAS number (preferred) or chemical name.")
 
232
  return SEARCH_CACHE[cache_key]
233
 
234
  async with client() as http:
 
235
  pub = await pubchem.pubchem_by_query(q, http)
236
 
237
  cas = q
238
+ if not is_cas(cas):
239
  cas = pub.get("resolved_cas") or q
240
 
241
+ ctx_task = (
242
+ ctx_src.fetch_ctx_genetox(cas, http)
243
+ if is_cas(cas)
244
+ else asyncio.sleep(0, result={"ok": False, "error": "CTX requires CAS (CAS-first)."})
245
+ )
246
  ntp_task = ntp.search_technical_reports(cas, http, limit=8)
247
 
248
  ctx_res, ntp_res = await asyncio.gather(ctx_task, ntp_task)
249
 
250
+ # CDC (offline/local index): try resolved CAS first, then name fallback inside cdc.search()
251
+ cdc_res = cdc.search(q, cas=cas if is_cas(cas) else None, limit=8)
252
+
253
+ out = {
254
  "query": q,
255
  "cas_used": cas,
256
  "pubchem": pub,
257
  "ctx_genetox": ctx_res,
258
  "ntp_technical_reports": ntp_res,
259
+ "cdc_toxprofiles": cdc_res,
260
  "iarc_monographs": iarc.bookshelf_link(cas),
261
  "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
262
+ "fema": fema.fema_link(cas if is_cas(cas) else q),
263
  }
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  SEARCH_CACHE[cache_key] = out
266
  return out
267
 
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  def do_search(query: str):
270
  data = asyncio.run(run_search(query))
 
271
  overview_md_text = render_overview(data)
272
  pubchem_md_text = render_pubchem_summary(data.get("pubchem", {}))
273
  ctx_md_text = render_ctx_summary(data.get("ctx_genetox", {}))
274
  ntp_md_text = render_ntp_summary(data.get("ntp_technical_reports", {}))
275
+ cdc_md_text = render_cdc_summary(data.get("cdc_toxprofiles", {}))
276
  iarc_md_text = render_iarc_block(data.get("iarc_monographs", {}))
277
  scholar_md_text = render_scholar_block(data.get("google_scholar", {}))
278
  fema_md_text = render_fema_block(data.get("fema", {}))
279
 
280
+ raw_pubchem_json = _pretty(data.get("pubchem", {}))
281
+ raw_ctx_json = _pretty(data.get("ctx_genetox", {}))
282
+ raw_ntp_json = _pretty(data.get("ntp_technical_reports", {}))
283
+ raw_iarc_json = _pretty(data.get("iarc_monographs", {}))
284
+ raw_scholar_json = _pretty(data.get("google_scholar", {}))
285
+ raw_fema_json = _pretty(data.get("fema", {}))
 
 
 
 
286
 
 
 
287
  return (
288
+ data,
289
  overview_md_text,
290
  pubchem_md_text,
 
291
  ctx_md_text,
292
  ntp_md_text,
293
+ cdc_md_text,
294
  iarc_md_text,
295
  scholar_md_text,
296
  fema_md_text,
 
300
  raw_iarc_json,
301
  raw_scholar_json,
302
  raw_fema_json,
303
+ "", # ai_out blank after search
304
  )
305
 
306
 
307
  def generate_ai(data: dict):
308
  if not data:
309
  raise gr.Error("Run a search first.")
 
310
  cas = data.get("cas_used") or data.get("query") or ""
311
  cache_key = f"ai::{cas}"
312
  if cache_key in AI_CACHE:
 
316
  if not allowed:
317
  return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."
318
 
 
 
319
  resp = generate_ai_summary(build_prompt(data))
320
  if not resp.get("ok"):
321
  return f"**AI summary unavailable:** {resp.get('error')}"
 
328
def download_report(data: dict, ai_text: str):
    """Build the downloadable report files for the current search result.

    Args:
        data: Search result dict; its ``cas_used`` (or ``query``) value
            labels the report and the whole dict is passed as evidence.
        ai_text: AI summary text; an empty value is passed on as ``None``.

    Returns:
        The ``(pdf_path, json_path)`` pair produced by ``build_pdf``.

    Raises:
        gr.Error: If *data* is empty, i.e. no search has been run yet.
    """
    if data:
        label = data.get("cas_used") or data.get("query") or "unknown"
        summary = ai_text or None
        pdf_path, json_path = build_pdf(label, evidence=data, ai_summary=summary)
        return pdf_path, json_path
    raise gr.Error("Run a search first.")
334
 
335
 
336
  # -----------------------------
337
+ # UI (light, production-like)
338
  # -----------------------------
339
+ LIGHT_CSS = """
340
+ .gradio-container { background: white !important; }
341
+ """
342
 
343
+ with gr.Blocks(title="ToxRAI (HF Demo)", css=LIGHT_CSS) as demo:
344
  gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
345
  gr.Markdown(
346
  f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
 
363
  with gr.Accordion("PubChem (summary)", open=False):
364
  pubchem_md = gr.Markdown()
365
 
 
366
  with gr.Accordion("CDC ToxProfiles", open=False):
367
  cdc_md = gr.Markdown()
368
 
 
406
  state,
407
  overview_md,
408
  pubchem_md,
 
409
  ctx_md,
410
  ntp_md,
411
+ cdc_md,
412
  iarc_md,
413
  scholar_md,
414
  fema_md,
 
429
  state,
430
  overview_md,
431
  pubchem_md,
 
432
  ctx_md,
433
  ntp_md,
434
+ cdc_md,
435
  iarc_md,
436
  scholar_md,
437
  fema_md,
 
449
  pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])
450
 
451
 
452
+ if __name__ == "__main__":
453
+ demo.queue().launch()