|
|
import asyncio |
|
|
import json |
|
|
import os |
|
|
import time |
|
|
from typing import Any, Dict, Optional |
|
|
|
|
|
import gradio as gr |
|
|
import httpx |
|
|
|
|
|
from core.config import settings |
|
|
from core.rate_limit import check_and_increment_global_ai_cap |
|
|
from core.pdf_report import build_pdf |
|
|
from core.sources import pubchem, ntp, ctx as ctx_src, iarc, scholar, fema |
|
|
|
|
|
|
|
|
try: |
|
|
from core.sources import cdc |
|
|
except Exception: |
|
|
cdc = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# In-memory cache of search results. run_search stores each result dict here
# under a "search::<lowercased query>" key.
SEARCH_CACHE: Dict[str, Dict[str, Any]] = {}
# In-memory cache of AI summary text. generate_ai stores entries here under an
# "ai::<CAS or query>" key.
AI_CACHE: Dict[str, str] = {}
|
|
|
|
|
|
|
|
def json_pretty(obj: Any) -> str:
    """Serialize ``obj`` as indented JSON text for display.

    Values the JSON encoder does not handle natively are converted with
    ``str`` and non-ASCII characters are written unescaped. If serialization
    still raises, ``str(obj)`` is returned instead.
    """
    dump_options = {"indent": 2, "ensure_ascii": False, "default": str}
    try:
        rendered = json.dumps(obj, **dump_options)
    except Exception:
        # Best-effort formatter: fall back to the plain string form.
        rendered = str(obj)
    return rendered
|
|
|
|
|
|
|
|
def client() -> httpx.AsyncClient:
    """Create a new ``httpx.AsyncClient`` that sends a fixed ``user-agent`` header."""
    default_headers = {"user-agent": "toxrai-hf-demo"}
    return httpx.AsyncClient(headers=default_headers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def render_overview(data: Dict[str, Any]) -> str:
    """Build the Markdown overview for a search result.

    Always lists the query and the CAS used. The PubChem CID and the EPA
    CompTox DTXSID are added only when the corresponding section reports
    ``ok`` and carries that identifier.

    Args:
        data: Search result dict; the keys read are ``query``, ``cas_used``,
            ``pubchem`` and ``ctx_genetox``.

    Returns:
        Markdown paragraphs separated by blank lines.
    """
    query_text = data.get("query") or ""
    cas_text = data.get("cas_used") or ""
    parts = [f"**Query:** `{query_text}`", f"**CAS used:** `{cas_text}`"]

    pubchem_info = data.get("pubchem") or {}
    pubchem_cid = pubchem_info.get("cid") if pubchem_info.get("ok") else None
    if pubchem_cid:
        parts.append(f"**PubChem CID:** `{pubchem_cid}`")

    ctx_info = data.get("ctx_genetox") or {}
    ctx_dtxsid = ctx_info.get("dtxsid") if ctx_info.get("ok") else None
    if ctx_dtxsid:
        parts.append(f"**EPA CompTox DTXSID:** `{ctx_dtxsid}`")

    return "\n\n".join(parts)
|
|
|
|
|
|
|
|
def render_pubchem_summary(pub: Dict[str, Any]) -> str:
    """Render the PubChem section of a search result as Markdown.

    Args:
        pub: PubChem result dict. The keys read are ``ok``, ``error``,
            ``cid``, ``resolved_cas``, ``props``, ``structure_png``, ``url``
            and ``hazards``.

    Returns:
        An "unavailable" message when ``pub`` is missing, not a dict, or not
        ``ok``. Otherwise a Markdown block with identifiers, basic properties,
        the structure image, the PubChem link and any hazard entries that
        carry text, ending in a single newline.
    """
    if not isinstance(pub, dict) or not pub.get("ok"):
        # Fall back to a generic message so the UI never shows "None".
        err = (pub.get("error") if isinstance(pub, dict) else None) or "Unknown PubChem error"
        return f"PubChem unavailable: {err}"

    cid = pub.get("cid")
    resolved_cas = pub.get("resolved_cas") or "-"
    props = pub.get("props")
    if not isinstance(props, dict):
        props = {}

    iupac_name = props.get("IUPACName") or props.get("iupac_name") or "-"
    formula = props.get("MolecularFormula") or "-"
    mw = props.get("MolecularWeight")
    mw_str = f"{mw}" if mw not in (None, "") else "-"
    smiles = props.get("CanonicalSMILES") or "-"

    lines = [
        f"**CID:** `{cid}`",
        f"**Resolved CAS (from synonyms):** `{resolved_cas}`",
        f"**IUPAC/Title:** {iupac_name}",
        "",
        f"**Molecular Formula:** `{formula}`",
        f"**Molecular Weight:** `{mw_str}`",
        f"**Canonical SMILES:** `{smiles}`",
    ]

    structure_png = pub.get("structure_png")
    if structure_png:
        # Emit the image itself; previously only the heading was written and
        # the structure_png value was never used.
        lines += ["", "**Structure**", "", f"![Structure]({structure_png})"]

    url = pub.get("url")
    if url:
        lines += ["", f"[Open PubChem]({url})"]

    hazards = pub.get("hazards") or []
    if hazards:
        lines += ["", "### Safety / Hazard Information"]
        for hazard in hazards:
            if not isinstance(hazard, dict):
                continue  # tolerate None / malformed entries
            text = hazard.get("text") or ""
            if not text:
                continue
            name = hazard.get("name") or "Hazard"
            # The blank line after each entry keeps hazards as separate paragraphs.
            lines += [f"**{name}:** {text}", ""]

    return "\n".join(lines).rstrip() + "\n"
|
|
|
|
|
|
|
|
def render_ctx_summary(ctx: Dict[str, Any]) -> str:
    """Render the EPA CompTox (CTX) genetox section as Markdown.

    Args:
        ctx: CTX result dict. The keys read are ``ok``, ``error``,
            ``dashboard_search``, ``dtxsid``, ``dashboard_url`` and
            ``summary``.

    Returns:
        On failure, the error text (or a generic message), followed by a
        dashboard search link when one is provided. On success, the DTXSID,
        the dashboard link and a fenced JSON excerpt of the summary fields.
    """
    if not isinstance(ctx, dict) or not ctx.get("ok"):
        info = ctx if isinstance(ctx, dict) else {}
        # Fall back to a generic message so the UI never shows "None".
        err = info.get("error") or "Unknown CTX error"
        search_url = info.get("dashboard_search")
        if search_url:
            return f"{err}\n\n[Open CompTox Dashboard search]({search_url})"
        return str(err)

    dtxsid = ctx.get("dtxsid")
    dash = ctx.get("dashboard_url")
    summary = ctx.get("summary")

    lines = []
    if dtxsid:
        lines.append(f"**DTXSID:** `{dtxsid}`")
    if dash:
        lines.append(f"[Open CompTox Dashboard]({dash})")

    if isinstance(summary, dict):
        # Substrings matched against the lowercased key, so tokens must be
        # lowercase themselves.
        interesting_tokens = (
            "genetox",
            "overall",
            "summary",
            "conclusion",
            "call",
            "result",
            "assessment",
        )
        max_fallback_fields = 8
        max_json_chars = 6000

        picked = {
            key: value
            for key, value in summary.items()
            if any(tok in str(key).lower() for tok in interesting_tokens)
        }
        if not picked:
            # Nothing matched: show the first few fields instead of nothing.
            picked = dict(list(summary.items())[:max_fallback_fields])

        txt = json_pretty(picked)
        if len(txt) > max_json_chars:
            txt = txt[:max_json_chars] + "\n... (truncated)"
        lines += ["", "```json", txt, "```"]

    return "\n".join(lines)
|
|
|
|
|
|
|
|
def render_ntp_summary(ntp_res: Dict[str, Any]) -> str:
    """Render NTP Technical Report hits as a Markdown bullet list.

    Args:
        ntp_res: NTP result dict. The keys read are ``ok``, ``error`` and
            ``items``; each item is read for ``tr``/``num``, ``title`` and
            ``report_page``/``url``.

    Returns:
        An "unavailable" message when the lookup failed, a "no reports"
        message when there is nothing to list, otherwise one bullet per
        report (linked when a URL is present).
    """
    if not isinstance(ntp_res, dict) or not ntp_res.get("ok"):
        # Fall back to a generic message so the UI never shows "None".
        err = (ntp_res.get("error") if isinstance(ntp_res, dict) else None) or "Unknown NTP error"
        return f"NTP Technical Reports unavailable: {err}"

    no_reports = "No NTP Technical Reports found for this CAS."
    lines = []
    for item in ntp_res.get("items") or []:
        if not isinstance(item, dict):
            continue  # skip malformed entries, like the IARC/CDC renderers do
        num = item.get("tr") or item.get("num") or ""
        title = item.get("title") or "Report"
        url = item.get("report_page") or item.get("url") or ""
        # Only show the "TR-" label when a report number is actually known.
        label = f"**TR-{num}** " if num else ""
        target = f"[{title}]({url})" if url else title
        lines.append(f"- {label}{target}")

    return "\n".join(lines) if lines else no_reports
|
|
|
|
|
|
|
|
def render_iarc_block(iarc_res: Dict[str, Any]) -> str:
    """Render the IARC Monographs section as Markdown.

    A top-level ``url`` takes precedence and yields a single search link.
    Otherwise each dict entry in ``results`` becomes a bullet, linked when it
    has a ``url`` and suffixed with its ``year`` when present. Anything else
    yields the "unavailable" message.
    """
    unavailable = "IARC link unavailable."
    if not (iarc_res and iarc_res.get("ok")):
        return unavailable

    search_url = iarc_res.get("url")
    if search_url:
        return f"[Search IARC Monographs (NCBI Bookshelf)]({search_url})"

    entries = iarc_res.get("results") if isinstance(iarc_res, dict) else None
    if not isinstance(entries, list) or not entries:
        return unavailable

    def _bullet(entry: Dict[str, Any]) -> str:
        # One Markdown bullet for a single monograph entry.
        heading = entry.get("title") or "IARC Monographs"
        href = entry.get("url")
        published = entry.get("year")
        tail = f" ({published})" if published else ""
        if href:
            return f"- [{heading}]({href}){tail}"
        return f"- {heading}{tail}"

    bullets = [_bullet(entry) for entry in entries if isinstance(entry, dict)]
    if not bullets:
        return unavailable
    return "\n".join(bullets)
|
|
|
|
|
|
|
|
def render_scholar_block(sch_res: Dict[str, Any]) -> str:
    """Render the Google Scholar section: a search link, or an "unavailable" note.

    The link is shown only when ``sch_res`` is ``ok`` and carries a ``url``.
    """
    unavailable = "Google Scholar link unavailable."
    link = sch_res.get("url") if (sch_res and sch_res.get("ok")) else None
    if not link:
        return unavailable
    return f"[Open Google Scholar search]({link})"
|
|
|
|
|
|
|
|
def render_fema_block(fema_res: Dict[str, Any]) -> str:
    """Render the FEMA section as a Markdown list of search links.

    Args:
        fema_res: FEMA result dict. The keys read are ``ok``, ``error``,
            ``cas_url``, ``name_url``, ``combo_url``, ``search_api_url`` and
            ``alt_url``.

    Returns:
        The error text (or "FEMA link unavailable.") when the result is
        missing, not ``ok``, or has no links; otherwise an intro line followed
        by one bullet per available link.
    """
    unavailable = "FEMA link unavailable."
    if not isinstance(fema_res, dict) or not fema_res.get("ok"):
        err = fema_res.get("error") if isinstance(fema_res, dict) else None
        # Use the generic message when no error text exists, rather than
        # showing the string "None".
        return str(err or unavailable)

    # (result key, link label) in display order.
    link_specs = (
        ("cas_url", "Search by CAS"),
        ("name_url", "Search by Chemical Name"),
        ("combo_url", "Search by CAS + Name"),
        ("search_api_url", "Generic FEMA search (alt)"),
        ("alt_url", "Generic FEMA search"),
    )
    bullets = [
        f"- [{label}]({fema_res.get(key)})"
        for key, label in link_specs
        if fema_res.get(key)
    ]
    if not bullets:
        return unavailable

    return "\n".join(["A FEMA risk assessment for this chemical is available:", *bullets])
|
|
|
|
|
|
|
|
def render_cdc_block(cdc_res: Any) -> str:
    """Render CDC ToxProfile results as Markdown.

    Accepts a single dict (rendered as one link or plain name), a list of
    dicts (rendered as bullets; non-dict entries are skipped), or any other
    truthy value (rendered with ``str``). Falsy input and lists with no dict
    entries produce the "no match" message.
    """
    no_match = "No CDC ToxProfiles match."
    if not cdc_res:
        return no_match

    def _entry(profile: dict, prefix: str) -> str:
        # Link when a URL is present, otherwise just the display name.
        title = profile.get("name") or "CDC ToxProfile"
        href = profile.get("url")
        if href:
            return f"{prefix}[{title}]({href})"
        return f"{prefix}{title}"

    if isinstance(cdc_res, dict):
        return _entry(cdc_res, "")

    if isinstance(cdc_res, list):
        bullets = [_entry(item, "- ") for item in cdc_res if isinstance(item, dict)]
        if not bullets:
            return no_match
        return "\n".join(bullets)

    return str(cdc_res)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Monotonic insertion times for SEARCH_CACHE entries, keyed like SEARCH_CACHE.
# Kept separate so cached result dicts stay free of bookkeeping fields.
_SEARCH_CACHE_STAMPS: Dict[str, float] = {}


async def run_search(query: str) -> Dict[str, Any]:
    """Collect evidence for a CAS number or chemical name from all sources.

    PubChem is queried first. Its result supplies the CAS (when the query is
    not itself a CAS) and the DTXSID used for the CTX lookup. CTX and NTP are
    then fetched concurrently, and the IARC, Scholar and FEMA link builders
    are called. When the optional ``cdc`` module was imported, its lookup is
    attempted on a best-effort basis.

    Results are cached in ``SEARCH_CACHE`` by lowercased query. Entries older
    than ``settings.cache_ttl_seconds`` are refetched; a missing or
    non-positive TTL keeps entries indefinitely.

    Args:
        query: CAS number (preferred) or chemical name.

    Returns:
        Dict with keys ``query``, ``cas_used``, ``pubchem``, ``ctx_genetox``,
        ``ntp_technical_reports``, ``iarc_monographs``, ``google_scholar``,
        ``fema`` and, when the CDC module is available, ``cdc_toxprofiles``.

    Raises:
        gr.Error: If ``query`` is empty or whitespace only.
    """
    q = (query or "").strip()
    if not q:
        raise gr.Error("Enter a CAS number (preferred) or chemical name.")

    cache_key = f"search::{q.lower()}"
    # NOTE(review): assumes cache_ttl_seconds is numeric seconds, as the UI
    # banner prints it with an "s" suffix — confirm against core.config.
    ttl = float(getattr(settings, "cache_ttl_seconds", 0) or 0)

    cached = SEARCH_CACHE.get(cache_key)
    if cached is not None:
        stamp = _SEARCH_CACHE_STAMPS.get(cache_key)
        # Entries without a timestamp (inserted by other code) count as fresh.
        if ttl <= 0 or stamp is None or time.monotonic() - stamp < ttl:
            return cached
        SEARCH_CACHE.pop(cache_key, None)
        _SEARCH_CACHE_STAMPS.pop(cache_key, None)

    async with client() as http:
        pub = await pubchem.pubchem_by_query(q, http)
        # Guard once so every later read tolerates a non-dict PubChem result.
        pub_info = pub if isinstance(pub, dict) else {}

        cas = q if pubchem.is_cas(q) else (pub_info.get("resolved_cas") or q)

        # Prefer the DTXSID PubChem resolved; q is non-empty, so there is
        # always something to query CTX with.
        ctx_query = pub_info.get("dtxsid") or q
        ctx_res, ntp_res = await asyncio.gather(
            ctx_src.fetch_ctx_genetox(ctx_query, http),
            ntp.search_technical_reports(cas, http, limit=8),
        )

        out: Dict[str, Any] = {
            "query": q,
            "cas_used": cas,
            "pubchem": pub,
            "ctx_genetox": ctx_res,
            "ntp_technical_reports": ntp_res,
            "iarc_monographs": iarc.bookshelf_link(cas),
            "google_scholar": {"ok": True, "url": scholar.scholar_link(cas)},
            "fema": fema.fema_link(cas if pubchem.is_cas(cas) else "", q),
        }

        if cdc is not None:
            try:
                # The CDC module's entry point is not fixed: use the first of
                # the known names that exists.
                lookup = next(
                    (
                        getattr(cdc, name)
                        for name in ("lookup", "search", "toxprofile_for")
                        if hasattr(cdc, name)
                    ),
                    None,
                )
                result = lookup(cas) if lookup is not None else None
                if asyncio.iscoroutine(result):
                    # Await async implementations rather than storing a
                    # coroutine object in the result.
                    result = await result
                out["cdc_toxprofiles"] = result
            except Exception:
                # Best-effort source: a CDC failure must not fail the search.
                out["cdc_toxprofiles"] = None

    SEARCH_CACHE[cache_key] = out
    _SEARCH_CACHE_STAMPS[cache_key] = time.monotonic()
    return out
|
|
|
|
|
|
|
|
def _prune_for_prompt(obj: Any, max_chars: int) -> str:
    """Return ``obj`` as pretty JSON text, cut to ``max_chars`` characters.

    When the text is cut, a ``... (truncated)`` marker line is appended after
    the first ``max_chars`` characters.
    """
    serialized = json_pretty(obj)
    overflow = len(serialized) > max_chars
    return (serialized[:max_chars] + "\n... (truncated)") if overflow else serialized
|
|
|
|
|
|
|
|
def build_prompt(data: Dict[str, Any], *, max_chars: int = 12000) -> str:
    """Build the AI-summary prompt from a compact subset of the evidence.

    Only selected fields are included rather than the full raw JSON of every
    source: PubChem identifiers, properties and the first ten hazards; the CTX
    status, DTXSID and summary; NTP report items; and CDC ToxProfiles. The
    serialized evidence is truncated to ``max_chars`` characters.

    Args:
        data: Search result dict as returned by ``run_search``.
        max_chars: Character budget for the serialized evidence JSON.

    Returns:
        The instruction text followed by ``EVIDENCE_JSON:`` and the evidence.
    """

    def _section(key: str) -> Dict[str, Any]:
        # Missing or non-dict sections are treated as empty so that the
        # .get() calls below cannot fail.
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    pub = _section("pubchem")
    props = pub.get("props") if isinstance(pub.get("props"), dict) else {}
    hazards = pub.get("hazards") if isinstance(pub.get("hazards"), list) else []
    ctx = _section("ctx_genetox")
    ntp_reports = _section("ntp_technical_reports")

    prompt_obj = {
        "query": data.get("query"),
        "cas_used": data.get("cas_used"),
        "pubchem": {
            "cid": pub.get("cid"),
            "resolved_cas": pub.get("resolved_cas"),
            "iupac": props.get("IUPACName") or props.get("iupac_name"),
            "formula": props.get("MolecularFormula"),
            "molecular_weight": props.get("MolecularWeight"),
            "canonical_smiles": props.get("CanonicalSMILES"),
            "hazards": hazards[:10],
        },
        "ctx_genetox": {
            "ok": ctx.get("ok"),
            "dtxsid": ctx.get("dtxsid"),
            "summary": ctx.get("summary"),
        },
        "ntp_technical_reports": ntp_reports.get("items", []),
        "cdc_toxprofiles": data.get("cdc_toxprofiles"),
    }

    body = _prune_for_prompt(prompt_obj, max_chars=max_chars)

    return (
        "You are a toxicology regulatory assistant. "
        "Using ONLY the evidence JSON below, write a concise weight-of-evidence summary focused on mutagenicity/genotoxicity. "
        "If evidence is conflicting or absent, say so explicitly. "
        "Cite which source each statement comes from (PubChem hazards, CTX genetox summary, NTP TR titles, CDC ToxProfiles).\n\n"
        "EVIDENCE_JSON:\n"
        + body
    )
|
|
|
|
|
|
|
|
def do_search(query: str):
    """Run a search and produce the values for every search output component.

    Returns a 16-tuple: the raw result dict (stored in UI state), eight
    rendered Markdown sections (overview, PubChem, CDC, CTX, NTP, IARC,
    Scholar, FEMA), six raw JSON dumps (PubChem, CTX, NTP, IARC, Scholar,
    FEMA), and an empty string that clears the AI summary panel.
    """
    data = asyncio.run(run_search(query))

    def section(key):
        # Mirrors data.get(key, {}): the default only applies to a missing key.
        return data.get(key, {})

    # The CDC section stays blank when the search produced no CDC entry at all.
    cdc_markdown = (
        render_cdc_block(data.get("cdc_toxprofiles"))
        if "cdc_toxprofiles" in data
        else ""
    )

    rendered = (
        render_overview(data),
        render_pubchem_summary(section("pubchem")),
        cdc_markdown,
        render_ctx_summary(section("ctx_genetox")),
        render_ntp_summary(section("ntp_technical_reports")),
        render_iarc_block(section("iarc_monographs")),
        render_scholar_block(section("google_scholar")),
        render_fema_block(section("fema")),
    )

    raw_keys = (
        "pubchem",
        "ctx_genetox",
        "ntp_technical_reports",
        "iarc_monographs",
        "google_scholar",
        "fema",
    )
    raw_json = tuple(json_pretty(section(key)) for key in raw_keys)

    return (data, *rendered, *raw_json, "")
|
|
|
|
|
|
|
|
def generate_ai(data: dict):
    """Return an AI weight-of-evidence summary for the current search result.

    Summaries are cached in ``AI_CACHE`` per CAS (or per query when no CAS is
    set). On a cache miss the global daily cap is checked and incremented
    before the model is called.

    Args:
        data: Search result dict held in the UI state.

    Returns:
        The summary Markdown, or a message explaining why none is available
        (cap reached, backend error, or empty model response).

    Raises:
        gr.Error: If no search has been run yet.
    """
    if not data:
        raise gr.Error("Run a search first.")

    cas = data.get("cas_used") or data.get("query") or ""
    cache_key = f"ai::{cas}"
    cached = AI_CACHE.get(cache_key)
    if cached:
        return cached

    allowed, info = check_and_increment_global_ai_cap()
    if not allowed:
        return f"AI Summary capacity reached for today (limit {info.get('limit')}). Please try again tomorrow."

    # Imported here rather than at module load, as in the original code.
    from core.sources.ai_summary import generate_ai_summary

    resp = generate_ai_summary(build_prompt(data))
    if not resp.get("ok"):
        return f"**AI summary unavailable:** {resp.get('error')}"

    text = resp.get("text") or ""
    if not text.strip():
        # Do not cache an empty answer: it would be served as a blank summary
        # for this CAS on every later request.
        return "**AI summary unavailable:** the model returned an empty response."

    AI_CACHE[cache_key] = text
    return text
|
|
|
|
|
|
|
|
def download_report(data: dict, ai_text: str):
    """Build the PDF report and JSON evidence packet for the current search.

    The report is named after the CAS used, falling back to the query and
    then to ``"unknown"``. An empty AI summary is passed to ``build_pdf`` as
    ``None``.

    Returns:
        The ``(pdf_path, json_path)`` pair produced by ``build_pdf``.

    Raises:
        gr.Error: If no search has been run yet.
    """
    if not data:
        raise gr.Error("Run a search first.")

    report_id = data.get("cas_used") or data.get("query") or "unknown"
    summary = ai_text or None
    pdf_path, json_path = build_pdf(report_id, evidence=data, ai_summary=summary)
    return pdf_path, json_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UI layout: one "Search" tab with a query row, one collapsible section per
# source, raw JSON views, and the AI-summary / report actions.
with gr.Blocks(title="ToxRAI (HF Demo)") as demo:
    gr.Markdown("# 🧪 ToxRAI — Demo (CAS-first)")
    gr.Markdown(
        f"Public demo • AI summaries/day global cap: **{settings.max_ai_summaries_per_day}** • Cache TTL: **{settings.cache_ttl_seconds}s**"
    )

    with gr.Tabs():
        with gr.Tab("Search"):
            # Holds the latest search result dict for the AI and PDF actions.
            state = gr.State(None)

            with gr.Row():
                query_in = gr.Textbox(
                    label="CAS (preferred) or Chemical name",
                    placeholder="e.g., 80-05-7 or bisphenol A",
                    scale=4,
                )
                search_btn = gr.Button("Search", variant="primary", scale=1)

            overview_md = gr.Markdown()

            with gr.Accordion("PubChem (summary)", open=False):
                pubchem_md = gr.Markdown()

            with gr.Accordion("CDC ToxProfiles", open=False):
                cdc_md = gr.Markdown()

            with gr.Accordion("EPA CompTox (CTX) — Genetox (full fields)", open=False):
                ctx_md = gr.Markdown()

            with gr.Accordion("NTP Technical Reports", open=False):
                ntp_md = gr.Markdown()

            with gr.Accordion("IARC Monographs", open=False):
                iarc_md = gr.Markdown()

            with gr.Accordion("Google Scholar", open=False):
                scholar_md = gr.Markdown()

            with gr.Accordion("FEMA Risk Assessment", open=False):
                fema_md = gr.Markdown()

            with gr.Accordion("Raw outputs (all sources)", open=False):
                raw_pubchem = gr.Code(label="PubChem (raw)", language="json")
                raw_ctx = gr.Code(label="CTX Genetox (raw)", language="json")
                raw_ntp = gr.Code(label="NTP TR (raw)", language="json")
                raw_iarc = gr.Code(label="IARC (raw)", language="json")
                raw_scholar = gr.Code(label="Scholar link (raw)", language="json")
                raw_fema = gr.Code(label="FEMA (raw)", language="json")

            with gr.Row():
                ai_btn = gr.Button("Generate AI Summary (GPT-4o)", variant="secondary")
                pdf_btn = gr.Button("Build PDF + JSON")

            ai_out = gr.Markdown()

            with gr.Row():
                pdf_file = gr.File(label="Download PDF")
                json_file = gr.File(label="Download JSON evidence packet")

            # Must match the order of the tuple returned by do_search. Defined
            # once so the button and Enter-key handlers cannot drift apart.
            search_outputs = [
                state,
                overview_md,
                pubchem_md,
                cdc_md,
                ctx_md,
                ntp_md,
                iarc_md,
                scholar_md,
                fema_md,
                raw_pubchem,
                raw_ctx,
                raw_ntp,
                raw_iarc,
                raw_scholar,
                raw_fema,
                ai_out,
            ]

            # Clicking "Search" and pressing Enter in the textbox run the same search.
            search_btn.click(fn=do_search, inputs=[query_in], outputs=search_outputs)
            query_in.submit(fn=do_search, inputs=[query_in], outputs=search_outputs)

            ai_btn.click(fn=generate_ai, inputs=[state], outputs=[ai_out])
            pdf_btn.click(fn=download_report, inputs=[state, ai_out], outputs=[pdf_file, json_file])
|
|
|
|
|
|
|
|
# Enable Gradio's request queue, passing 6 as the default concurrency limit.
demo.queue(default_concurrency_limit=6)
# Expose the Blocks instance under the name ``app`` as well — presumably for
# hosts that look up an ``app`` object; confirm against the deployment setup.
app = demo

if __name__ == "__main__":
    # Launch the Gradio server only when this file is run as a script.
    demo.launch()
|
|
|