"""ToxRAI Gradio demo: look up a chemical across several sources and summarize it.

The module wires a Gradio UI to a set of source adapters under ``core.sources``
(PubChem, EPA CompTox/CTX, NTP, IARC, Google Scholar, FEMA and, optionally,
CDC), renders each source's result as Markdown, and can build an AI summary and
a PDF/JSON evidence packet from the collected data.
"""

import asyncio
import json
import os
import time
from typing import Any, Dict, Optional

import gradio as gr
import httpx

from core.config import settings
from core.rate_limit import check_and_increment_global_ai_cap
from core.pdf_report import build_pdf
from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema

# Optional: CDC module may exist in your repo (user added).
try:
    from core.sources import cdc
except Exception:
    cdc = None  # type: ignore

# -----------------------------
# Caches (simple in-memory)
# -----------------------------
SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
AI_CACHE: Dict[str, str] = {}

# time.monotonic() stamp of when each cache key was stored. Keys carry a
# "search::" / "ai::" prefix, so both caches can share this one table.
_CACHE_STAMPS: Dict[str, float] = {}


def _cache_ttl_seconds() -> float:
    """Return the configured cache TTL in seconds, or 0.0 when unusable.

    A missing, non-numeric or non-positive ``settings.cache_ttl_seconds`` is
    treated as "entries never expire", which is how the caches behaved before
    expiry was enforced.
    """
    try:
        return float(getattr(settings, "cache_ttl_seconds", 0) or 0)
    except (TypeError, ValueError):
        return 0.0


def _cache_get(cache: Dict[str, Any], key: str) -> Any:
    """Return the cached value for *key*, or ``None`` if absent or expired."""
    value = cache.get(key)
    if value is None:
        return None
    ttl = _cache_ttl_seconds()
    stamp = _CACHE_STAMPS.get(key)
    if ttl > 0 and stamp is not None and time.monotonic() - stamp > ttl:
        # Expired: evict so the caller recomputes and re-stores the entry.
        cache.pop(key, None)
        _CACHE_STAMPS.pop(key, None)
        return None
    return value


def _cache_put(cache: Dict[str, Any], key: str, value: Any) -> None:
    """Store *value* under *key* and record when it was stored."""
    cache[key] = value
    _CACHE_STAMPS[key] = time.monotonic()


def json_pretty(obj: Any) -> str:
    """Serialize *obj* as indented JSON, falling back to ``str(obj)`` on failure."""
    try:
        return json.dumps(obj, indent=2, ensure_ascii=False, default=str)
    except Exception:
        return str(obj)


def client() -> httpx.AsyncClient:
    """Create a new async HTTP client carrying the demo's user-agent header."""
    return httpx.AsyncClient(headers={"user-agent": "toxrai-hf-demo"})


# -----------------------------
# Rendering helpers (Markdown)
# -----------------------------
def render_overview(data: Dict[str, Any]) -> str:
    """Render the query, the CAS used, and any resolved identifiers as Markdown."""
    q = data.get("query") or ""
    cas = data.get("cas_used") or ""
    lines = [
        f"**Query:** `{q}`",
        f"**CAS used:** `{cas}`",
    ]

    # Add quick IDs when available
    pub = data.get("pubchem")
    if isinstance(pub, dict) and pub.get("ok") and pub.get("cid"):
        lines.append(f"**PubChem CID:** `{pub.get('cid')}`")

    ctx = data.get("ctx_genetox")
    if isinstance(ctx, dict) and ctx.get("ok") and ctx.get("dtxsid"):
        lines.append(f"**EPA CompTox DTXSID:** `{ctx.get('dtxsid')}`")

    return "\n\n".join(lines)


def render_pubchem_summary(pub: Dict[str, Any]) -> str:
    """Render the PubChem result (identifiers, properties, hazards) as Markdown.

    Returns a one-line "unavailable" message when *pub* is missing, not a dict,
    or not flagged ``ok``.
    """
    if not isinstance(pub, dict) or not pub.get("ok"):
        # Fall back to a fixed message so a missing "error" key never renders
        # as the literal text "None".
        err = (pub.get("error") if isinstance(pub, dict) else None) or "Unknown PubChem error"
        return f"PubChem unavailable: {err}"

    cid = pub.get("cid")
    resolved_cas = pub.get("resolved_cas") or "-"
    props = pub.get("props") or {}
    iupac_name = props.get("IUPACName") or props.get("iupac_name") or "-"
    formula = props.get("MolecularFormula") or "-"
    mw = props.get("MolecularWeight")
    mw_str = f"{mw}" if mw not in (None, "") else "-"
    smiles = props.get("CanonicalSMILES") or "-"

    lines = [
        f"**CID:** `{cid}`",
        f"**Resolved CAS (from synonyms):** `{resolved_cas}`",
        f"**IUPAC/Title:** {iupac_name}",
        "",
        f"**Molecular Formula:** `{formula}`",
        f"**Molecular Weight:** `{mw_str}`",
        f"**Canonical SMILES:** `{smiles}`",
    ]

    structure_png = pub.get("structure_png")
    if structure_png:
        lines.append("")
        lines.append("**Structure**")
        lines.append(f"![]({structure_png})")

    url = pub.get("url")
    if url:
        lines.append("")
        lines.append(f"[Open PubChem]({url})")

    hazards = pub.get("hazards") or []
    if hazards:
        lines.append("")
        lines.append("### Safety / Hazard Information")
        # Render as paragraphs (avoids weird wrapping from bullet nesting)
        for h in hazards:
            if not isinstance(h, dict):
                # Skip malformed entries instead of failing the whole panel.
                continue
            name = h.get("name") or "Hazard"
            text = h.get("text") or ""
            if not text:
                continue
            lines.append(f"**{name}:** {text}")
            lines.append("")

    return "\n".join(lines).rstrip() + "\n"


def render_ctx_summary(ctx: Dict[str, Any]) -> str:
    """Render the CompTox (CTX) genetox result as Markdown.

    On failure the error text is returned, followed by a dashboard search link
    when the result provides one. On success the DTXSID, dashboard link and a
    size-limited JSON excerpt of the summary are shown.
    """
    if not isinstance(ctx, dict) or not ctx.get("ok"):
        search_url = ctx.get("dashboard_search") if isinstance(ctx, dict) else None
        # Results such as {"ok": False} carry no "error"; never show "None".
        err = (ctx.get("error") if isinstance(ctx, dict) else None) or "Unknown CTX error"
        if search_url:
            return f"{err}\n\n[Open CompTox Dashboard search]({search_url})"
        return str(err)

    dtxsid = ctx.get("dtxsid")
    dash = ctx.get("dashboard_url")
    summary = ctx.get("summary")

    lines = []
    if dtxsid:
        lines.append(f"**DTXSID:** `{dtxsid}`")
    if dash:
        lines.append(f"[Open CompTox Dashboard]({dash})")

    # Try to surface key fields (if present) without dumping huge JSON
    if isinstance(summary, dict):
        # Tokens are lowercase because they are matched against lowercased keys.
        interesting_tokens = (
            "genetox",
            "overall",
            "summary",
            "conclusion",
            "call",
            "result",
            "assessment",
        )
        picked = {
            k: v
            for k, v in summary.items()
            if any(tok in str(k).lower() for tok in interesting_tokens)
        }
        if not picked:
            # fallback: first few keys
            picked = {k: summary[k] for k in list(summary)[:8]}

        lines.append("")
        lines.append("```json")
        txt = json_pretty(picked)
        # Keep it readable in UI
        if len(txt) > 6000:
            txt = txt[:6000] + "\n... (truncated)"
        lines.append(txt)
        lines.append("```")

    return "\n".join(lines)


def render_ntp_summary(ntp_res: Dict[str, Any]) -> str:
    """Render NTP Technical Report hits as a Markdown bullet list."""
    if not isinstance(ntp_res, dict) or not ntp_res.get("ok"):
        err = (ntp_res.get("error") if isinstance(ntp_res, dict) else None) or "Unknown NTP error"
        return f"NTP Technical Reports unavailable: {err}"

    items = ntp_res.get("items") or []
    if not items:
        return "No NTP Technical Reports found for this CAS."

    # CAS-filtered
    lines = []
    for it in items:
        if not isinstance(it, dict):
            # Same tolerance for malformed entries as the IARC/CDC renderers.
            continue
        num = it.get("tr") or it.get("num") or ""
        title = it.get("title") or "Report"
        url = it.get("report_page") or it.get("url") or ""
        # Only show the "TR-<n>" label when a report number is actually known.
        label = f"**TR-{num}** " if num else ""
        if url:
            lines.append(f"- {label}[{title}]({url})")
        else:
            lines.append(f"- {label}{title}")

    if not lines:
        return "No NTP Technical Reports found for this CAS."
    return "\n".join(lines)


def render_iarc_block(iarc_res: Dict[str, Any]) -> str:
    """Render the IARC result: a single search link, else a list of results."""
    if not isinstance(iarc_res, dict) or not iarc_res.get("ok"):
        return "IARC link unavailable."

    url = iarc_res.get("url")
    if url:
        return f"[Search IARC Monographs (NCBI Bookshelf)]({url})"

    results = iarc_res.get("results")
    if not isinstance(results, list) or not results:
        return "IARC link unavailable."

    lines = []
    for it in results:
        if not isinstance(it, dict):
            continue
        title = it.get("title") or "IARC Monographs"
        link = it.get("url")
        year = it.get("year")
        suffix = f" ({year})" if year else ""
        if link:
            lines.append(f"- [{title}]({link}){suffix}")
        else:
            lines.append(f"- {title}{suffix}")
    return "\n".join(lines) if lines else "IARC link unavailable."


def render_scholar_block(sch_res: Dict[str, Any]) -> str:
    """Render the Google Scholar search link, or an "unavailable" message."""
    if not isinstance(sch_res, dict) or not sch_res.get("ok"):
        return "Google Scholar link unavailable."
    url = sch_res.get("url")
    return f"[Open Google Scholar search]({url})" if url else "Google Scholar link unavailable."


def render_fema_block(fema_res: Dict[str, Any]) -> str:
    """Render the available FEMA search links as a Markdown bullet list."""
    if not isinstance(fema_res, dict) or not fema_res.get("ok"):
        # A missing "error" key falls back to the generic message, not "None".
        err = (fema_res.get("error") if isinstance(fema_res, dict) else None) or "FEMA link unavailable."
        return str(err)

    cas_url = fema_res.get("cas_url")
    name_url = fema_res.get("name_url")
    combo_url = fema_res.get("combo_url")
    alt = fema_res.get("alt_url")
    search_api = fema_res.get("search_api_url")

    if not any((cas_url, name_url, combo_url, alt, search_api)):
        return "FEMA link unavailable."

    lines = ["A FEMA risk assessment for this chemical is available:"]
    if cas_url:
        lines.append(f"- [Search by CAS]({cas_url})")
    if name_url:
        lines.append(f"- [Search by Chemical Name]({name_url})")
    if combo_url:
        lines.append(f"- [Search by CAS + Name]({combo_url})")
    if search_api:
        lines.append(f"- [Generic FEMA search (alt)]({search_api})")
    if alt:
        lines.append(f"- [Generic FEMA search]({alt})")
    return "\n".join(lines)


def render_cdc_block(cdc_res: Any) -> str:
    """Render a CDC ToxProfile result given as a dict, a list of dicts, or other."""
    if not cdc_res:
        return "No CDC ToxProfiles match."

    # Accept either dict or list
    if isinstance(cdc_res, dict):
        url = cdc_res.get("url")
        name = cdc_res.get("name") or "CDC ToxProfile"
        return f"[{name}]({url})" if url else name

    if isinstance(cdc_res, list):
        lines = []
        for it in cdc_res:
            if not isinstance(it, dict):
                continue
            name = it.get("name") or "CDC ToxProfile"
            url = it.get("url")
            lines.append(f"- [{name}]({url})" if url else f"- {name}")
        return "\n".join(lines) if lines else "No CDC ToxProfiles match."

    return str(cdc_res)


# -----------------------------
# Search + AI
# -----------------------------
async def _lookup_cdc(cas: str) -> Any:
    """Call the first CDC lookup function the optional ``cdc`` module exposes.

    Best effort: any exception from the CDC module yields ``None``.
    """
    # Try a few common function names (depending on how you implemented cdc.py)
    for fn_name in ("lookup", "search", "toxprofile_for"):
        if not hasattr(cdc, fn_name):
            continue
        try:
            result = getattr(cdc, fn_name)(cas)
            # The helper may be sync or async; await it only if it gave us a
            # coroutine, so an un-awaited coroutine is never stored.
            if asyncio.iscoroutine(result):
                result = await result
            return result
        except Exception:
            return None
    return None


async def run_search(query: str) -> Dict[str, Any]:
    """Query every source for *query* and return the combined evidence dict.

    Results are cached per lowercased query and expire according to
    ``settings.cache_ttl_seconds``.

    Raises:
        gr.Error: if *query* is empty or whitespace only.
    """
    q = (query or "").strip()
    if not q:
        raise gr.Error("Enter a CAS number (preferred) or chemical name.")

    cache_key = f"search::{q.lower()}"
    cached = _cache_get(SEARCH_CACHE, cache_key)
    if cached is not None:
        return cached

    async with client() as http:
        # PubChem accepts names and CAS. We also use it to resolve CAS via synonyms.
        pub = await pubchem.pubchem_by_query(q, http)
        # Read fields through a dict view so an unexpected return type cannot crash.
        pub_fields = pub if isinstance(pub, dict) else {}

        cas = q
        if not pubchem.is_cas(cas):
            cas = pub_fields.get("resolved_cas") or q

        # CTX is CAS-first (but we allow name too; resolver will try both)
        ctx_query = pub_fields.get("dtxsid") or q
        ctx_task = (
            ctx_src.fetch_ctx_genetox(ctx_query, http)
            if ctx_query
            else asyncio.sleep(0, result={"ok": False})
        )
        ntp_task = ntp.search_technical_reports(cas, http, limit=8)

        ctx_res, ntp_res = await asyncio.gather(ctx_task, ntp_task)

    out: Dict[str, Any] = {
        "query": q,
        "cas_used": cas,
        "pubchem": pub,
        "ctx_genetox": ctx_res,
        "ntp_technical_reports": ntp_res,
        "iarc_monographs": iarc.bookshelf_link(cas),
        "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
        "fema": fema.fema_link(cas if pubchem.is_cas(cas) else "", q),
    }

    # CDC toxprofiles (if module exists)
    if cdc is not None:
        out["cdc_toxprofiles"] = await _lookup_cdc(cas)

    _cache_put(SEARCH_CACHE, cache_key, out)
    return out


def _prune_for_prompt(obj: Any, max_chars: int) -> str:
    """Pretty-print *obj* and cut it to *max_chars*, marking any truncation."""
    txt = json_pretty(obj)
    if len(txt) <= max_chars:
        return txt
    return txt[:max_chars] + "\n... (truncated)"


def build_prompt(data: Dict[str, Any]) -> str:
    """Build a prompt that will not exceed model context.

    Key change vs earlier version: DO NOT dump full raw JSON from all sources.
    Only selected fields are included and the JSON body is capped at 12000
    characters.
    """
    # Normalize each source once so every lookup below is safe.
    pub = data.get("pubchem")
    if not isinstance(pub, dict):
        pub = {}
    ctx = data.get("ctx_genetox")
    if not isinstance(ctx, dict):
        ctx = {}
    ntp_res = data.get("ntp_technical_reports")
    if not isinstance(ntp_res, dict):
        ntp_res = {}

    props = pub.get("props") or {}
    hazards = pub.get("hazards") or []

    prompt_obj = {
        "query": data.get("query"),
        "cas_used": data.get("cas_used"),
        "pubchem": {
            "cid": pub.get("cid"),
            "resolved_cas": pub.get("resolved_cas"),
            "iupac": props.get("IUPACName") or props.get("iupac_name"),
            "formula": props.get("MolecularFormula"),
            "molecular_weight": props.get("MolecularWeight"),
            "canonical_smiles": props.get("CanonicalSMILES"),
            "hazards": hazards[:10],
        },
        "ctx_genetox": {
            "ok": ctx.get("ok"),
            "dtxsid": ctx.get("dtxsid"),
            "summary": ctx.get("summary"),
        },
        "ntp_technical_reports": ntp_res.get("items", []),
        "cdc_toxprofiles": data.get("cdc_toxprofiles"),
    }

    body = _prune_for_prompt(prompt_obj, max_chars=12000)

    return (
        "You are a toxicology regulatory assistant. "
        "Using ONLY the evidence JSON below, write a concise weight-of-evidence summary focused on mutagenicity/genotoxicity. "
        "If evidence is conflicting or absent, say so explicitly. "
        "Cite which source each statement comes from (PubChem hazards, CTX genetox summary, NTP TR titles, CDC ToxProfiles).\n\n"
        "EVIDENCE_JSON:\n"
        + body
    )


def do_search(query: str):
    """Run a search and return one value per UI output component.

    The tuple order must match the ``outputs`` list wired to the search events.
    """
    data = asyncio.run(run_search(query))

    overview_md_text = render_overview(data)
    pubchem_md_text = render_pubchem_summary(data.get("pubchem", {}))
    ctx_md_text = render_ctx_summary(data.get("ctx_genetox", {}))
    ntp_md_text = render_ntp_summary(data.get("ntp_technical_reports", {}))
    iarc_md_text = render_iarc_block(data.get("iarc_monographs", {}))
    scholar_md_text = render_scholar_block(data.get("google_scholar", {}))
    fema_md_text = render_fema_block(data.get("fema", {}))

    # The key is only present when the optional CDC module was importable.
    cdc_md_text = ""
    if "cdc_toxprofiles" in data:
        cdc_md_text = render_cdc_block(data.get("cdc_toxprofiles"))

    raw_pubchem_json = json_pretty(data.get("pubchem", {}))
    raw_ctx_json = json_pretty(data.get("ctx_genetox", {}))
    raw_ntp_json = json_pretty(data.get("ntp_technical_reports", {}))
    raw_iarc_json = json_pretty(data.get("iarc_monographs", {}))
    raw_scholar_json = json_pretty(data.get("google_scholar", {}))
    raw_fema_json = json_pretty(data.get("fema", {}))

    # IMPORTANT: return order must match `outputs=[...]`
    # If CDC accordion exists, include it right after PubChem.
    return (
        data,  # state
        overview_md_text,
        pubchem_md_text,
        cdc_md_text,
        ctx_md_text,
        ntp_md_text,
        iarc_md_text,
        scholar_md_text,
        fema_md_text,
        raw_pubchem_json,
        raw_ctx_json,
        raw_ntp_json,
        raw_iarc_json,
        raw_scholar_json,
        raw_fema_json,
        "",  # ai_out (blank after search)
    )


def generate_ai(data: dict):
    """Return an AI summary for the searched chemical, using the cache when possible.

    A cached summary is returned without touching the global daily cap. A
    failed generation is reported as text and is not cached.

    Raises:
        gr.Error: if no search has been run yet (*data* is empty).
    """
    if not data:
        raise gr.Error("Run a search first.")

    cas = data.get("cas_used") or data.get("query") or ""
    cache_key = f"ai::{cas}"
    cached = _cache_get(AI_CACHE, cache_key)
    if cached is not None:
        return cached

    allowed, info = check_and_increment_global_ai_cap()
    if not allowed:
        return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."

    from core.sources.ai_summary import generate_ai_summary  # local import avoids cold-start issues

    resp = generate_ai_summary(build_prompt(data))
    if not resp.get("ok"):
        return f"**AI summary unavailable:** {resp.get('error')}"

    text = resp.get("text") or ""
    _cache_put(AI_CACHE, cache_key, text)
    return text


def download_report(data: dict, ai_text: str):
    """Build the PDF and JSON evidence files and return their paths.

    Raises:
        gr.Error: if no search has been run yet (*data* is empty).
    """
    if not data:
        raise gr.Error("Run a search first.")
    cas = data.get("cas_used") or data.get("query") or "unknown"
    pdf_path, json_path = build_pdf(cas, evidence=data, ai_summary=ai_text if ai_text else None)
    return pdf_path, json_path


# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
    gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
    gr.Markdown(
        f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
    )

    with gr.Tabs():
        with gr.Tab("Search"):
            state = gr.State(None)

            with gr.Row():
                query_in = gr.Textbox(
                    label="CAS (preferred) or Chemical name",
                    placeholder="e.g., 80-05-7 or bisphenol A",
                    scale=4,
                )
                search_btn = gr.Button("Search", variant="primary", scale=1)

            overview_md = gr.Markdown()

            with gr.Accordion("PubChem (summary)", open=False):
                pubchem_md = gr.Markdown()

            # CDC accordion (optional)
            with gr.Accordion("CDC ToxProfiles", open=False):
                cdc_md = gr.Markdown()

            with gr.Accordion("EPA CompTox (CTX) — Genetox (full fields)", open=False):
                ctx_md = gr.Markdown()

            with gr.Accordion("NTP Technical Reports", open=False):
                ntp_md = gr.Markdown()

            with gr.Accordion("IARC Monographs", open=False):
                iarc_md = gr.Markdown()

            with gr.Accordion("Google Scholar", open=False):
                scholar_md = gr.Markdown()

            with gr.Accordion("FEMA Risk Assessment", open=False):
                fema_md = gr.Markdown()

            with gr.Accordion("Raw outputs (all sources)", open=False):
                raw_pubchem = gr.Code(label="PubChem (raw)", language="json")
                raw_ctx = gr.Code(label="CTX Genetox (raw)", language="json")
                raw_ntp = gr.Code(label="NTP TR (raw)", language="json")
                raw_iarc = gr.Code(label="IARC (raw)", language="json")
                raw_scholar = gr.Code(label="Scholar link (raw)", language="json")
                raw_fema = gr.Code(label="FEMA (raw)", language="json")

            with gr.Row():
                ai_btn = gr.Button("Generate AI Summary (GPT-4o)", variant="secondary")
                pdf_btn = gr.Button("Build PDF + JSON")

            ai_out = gr.Markdown()

            with gr.Row():
                pdf_file = gr.File(label="Download PDF")
                json_file = gr.File(label="Download JSON evidence packet")

            # One list for both the button click and the textbox submit, so the
            # two bindings cannot drift from do_search's return order.
            search_outputs = [
                state,
                overview_md,
                pubchem_md,
                cdc_md,
                ctx_md,
                ntp_md,
                iarc_md,
                scholar_md,
                fema_md,
                raw_pubchem,
                raw_ctx,
                raw_ntp,
                raw_iarc,
                raw_scholar,
                raw_fema,
                ai_out,
            ]

            search_btn.click(fn=do_search, inputs=[query_in], outputs=search_outputs)
            query_in.submit(fn=do_search, inputs=[query_in], outputs=search_outputs)

            ai_btn.click(fn=generate_ai, inputs=[state], outputs=[ai_out])
            pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])

demo.queue(default_concurrency_limit=6)

app = demo

if __name__ == "__main__":
    demo.launch()