hchevva committed on
Commit
4bf9d97
·
verified ·
1 Parent(s): 9457c11

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +551 -153
  2. cancer_risk_input_template.csv +1 -0
  3. literature_explorer.py +40 -27
  4. requirements.txt +5 -4
  5. runtime.txt +1 -0
app.py CHANGED
@@ -10,13 +10,137 @@ import numpy as np
10
  import pandas as pd
11
 
12
  from pypdf import PdfReader
13
- from sklearn.feature_extraction.text import TfidfVectorizer
 
 
 
14
 
15
  from openai import OpenAI
16
  from literature_explorer import build_literature_explorer_tab
 
 
 
 
 
17
 
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # =============================
21
  # Defaults
22
  # =============================
@@ -236,6 +360,22 @@ def select_relevant_chunks(
236
  if not texts:
237
  return []
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
240
  X = vectorizer.fit_transform(texts)
241
 
@@ -897,7 +1037,7 @@ def run_extraction(
897
  if not files:
898
  return (
899
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
900
- pd.DataFrame(), None, None, "Upload one or more PDFs.",
901
  gr.update(choices=[], value=None),
902
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
903
  )
@@ -907,7 +1047,7 @@ def run_extraction(
907
  except Exception as e:
908
  return (
909
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
910
- pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
911
  gr.update(choices=[], value=None),
912
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
913
  )
@@ -916,7 +1056,7 @@ def run_extraction(
916
  if not field_props:
917
  return (
918
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
919
- pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
920
  gr.update(choices=[], value=None),
921
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
922
  )
@@ -936,13 +1076,14 @@ def run_extraction(
936
  except Exception as e:
937
  return (
938
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
939
- pd.DataFrame(), None, None, str(e),
940
  gr.update(choices=[], value=None),
941
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
942
  )
943
 
944
  paper_details: List[Dict[str, Any]] = []
945
  output_rows: List[Dict[str, Any]] = []
 
946
 
947
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
948
 
@@ -963,20 +1104,62 @@ def run_extraction(
963
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
964
  "evidence": []
965
  }
 
 
 
 
 
 
 
 
 
966
  else:
967
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
968
-
969
- queries = [
970
  "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
971
  "chemical name CAS number",
972
  ]
973
- for ep in (selected_endpoints or []):
974
- queries.extend(ENDPOINT_QUERY_HINTS.get(ep, []))
975
- for k, ins in field_instr.items():
976
- queries.append(ins if ins else k)
 
 
 
977
 
978
- selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
979
- context = build_context(selected, max_chars=int(max_context_chars))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
980
 
981
  ex = openai_structured_extract(
982
  client=client,
@@ -1060,7 +1243,16 @@ def run_extraction(
1060
  csv_path = tmpdir / "extraction_table.csv"
1061
  json_path = tmpdir / "extraction_details.json"
1062
  df.to_csv(csv_path, index=False)
1063
- json_path.write_text(json.dumps(paper_details, indent=2), encoding="utf-8")
 
 
 
 
 
 
 
 
 
1064
 
1065
  choices = [r.get("record_id") for r in records if r.get("record_id")]
1066
  default = choices[0] if choices else None
@@ -1083,6 +1275,7 @@ def run_extraction(
1083
  overview,
1084
  str(csv_path),
1085
  str(json_path),
 
1086
  status,
1087
  gr.update(choices=choices, value=default),
1088
  records,
@@ -1145,6 +1338,135 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
1145
  return str(path), "Reviewed CSV ready to download."
1146
 
1147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1148
  # =============================
1149
  # Synthesis tab handler
1150
  # =============================
@@ -1155,7 +1477,10 @@ def run_synthesis(api_key, model, extraction_json_file):
1155
  client = get_openai_client(api_key)
1156
  except Exception as e:
1157
  return str(e)
1158
- rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
 
 
 
1159
  return openai_synthesize_across_papers(client, model, rows)
1160
 
1161
 
@@ -1173,11 +1498,24 @@ def set_admin_visibility(is_admin: bool):
1173
  # =============================
1174
  # Gradio UI
1175
  # =============================
1176
- with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1177
- gr.Markdown(
1178
- "# Toxicology PDF → Grounded Extractor\n"
1179
- "Upload PDFs → choose endpoints → Run → review report → export.\n\n"
1180
- "**Note:** Text-based PDFs only (not scanned/image PDFs)."
 
 
 
 
 
 
 
 
 
 
 
 
 
1181
  )
1182
 
1183
  state_records = gr.State([])
@@ -1189,130 +1527,136 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1189
  vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
1190
 
1191
  with gr.Tab("Extract"):
1192
- # --- Run section (simple) ---
1193
- with gr.Group():
1194
- files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1195
-
1196
- with gr.Row():
1197
- api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1198
- model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1199
-
1200
- with gr.Row():
1201
- endpoint_preset = gr.Dropdown(
1202
- label="Endpoint preset",
1203
- choices=list(ENDPOINT_PRESETS.keys()),
1204
- value="Required – Safety Assessor"
1205
- )
1206
- endpoints = gr.Dropdown(
1207
- label="Endpoints to extract (Core included automatically)",
1208
- choices=list(ENDPOINT_MODULES.keys()),
1209
- multiselect=True,
1210
- value=ENDPOINT_PRESETS["Required – Safety Assessor"]
1211
- )
1212
-
1213
- extract_btn = gr.Button("Run Extraction", variant="primary")
1214
- status = gr.Textbox(label="Status", interactive=False)
1215
-
1216
- # --- Report (results-first) ---
1217
- gr.Markdown("## Report")
1218
- summary_card = gr.HTML(render_summary_card("", []))
1219
-
1220
- overview_df = gr.Dataframe(
1221
- label="Batch Overview",
1222
- interactive=False,
1223
- wrap=True,
1224
- show_row_numbers=True
1225
- )
1226
-
1227
- with gr.Row():
1228
- out_csv = gr.File(label="Download: extraction_table.csv")
1229
- out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
1230
-
1231
- record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
1232
-
1233
- with gr.Row():
1234
- review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1235
- save_btn = gr.Button("Save edits")
1236
- export_btn = gr.Button("Export reviewed CSV")
1237
-
1238
- review_status = gr.Textbox(label="Review status", interactive=False)
1239
-
1240
- with gr.Row():
1241
- vertical_view = gr.Dataframe(
1242
- headers=["Field", "Value"],
1243
- interactive=False,
1244
- wrap=True,
1245
- show_row_numbers=False,
1246
- label="Extracted fields (vertical)"
1247
- )
1248
- evidence_md = gr.Markdown()
1249
-
1250
- reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1251
-
1252
- # --- Advanced runtime settings (collapsed) ---
1253
- with gr.Accordion("Advanced runtime settings", open=False):
1254
- with gr.Row():
1255
- max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1256
- chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1257
- max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1258
-
1259
- # --- Admin tools (collapsed) ---
1260
- with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
1261
- admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1262
-
1263
- admin_group = gr.Group(visible=False)
1264
- admin_vocab_group = gr.Group(visible=False)
1265
- admin_fields_group = gr.Group(visible=False)
1266
-
1267
- with admin_group:
1268
- gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
1269
-
1270
- with admin_vocab_group:
1271
- gr.Markdown("### Controlled vocabulary (lists only)")
1272
- vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
1273
- vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
1274
-
1275
- with gr.Row():
1276
- vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
1277
- vocab_add_btn = gr.Button("Add")
1278
- with gr.Row():
1279
- vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
1280
- vocab_remove_btn = gr.Button("Remove")
1281
- vocab_apply_btn = gr.Button("Apply full list to category")
1282
- vocab_reset_btn = gr.Button("Reset vocab to defaults")
1283
-
1284
- vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
1285
- vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
1286
- vocab_status = gr.Textbox(label="Vocab status", interactive=False)
1287
-
1288
- with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
1289
- vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
1290
-
1291
- with admin_fields_group:
1292
- gr.Markdown("### Custom columns (Field Builder)")
1293
- gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
1294
-
1295
- with gr.Row():
1296
- admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
1297
- fields_apply_btn = gr.Button("Apply builder table")
1298
-
1299
- with gr.Row():
1300
- field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
1301
- field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1302
-
1303
- enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
1304
- instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1305
-
1306
- add_update_field_btn = gr.Button("Add/Update field")
1307
-
1308
- fields_df = gr.Dataframe(
1309
- label="Fields (edit and click Apply)",
1310
- headers=["field","type","enum_values","instructions"],
1311
- interactive=True,
1312
- wrap=True
1313
- )
1314
-
1315
- fields_status = gr.Textbox(label="Field builder status", interactive=False)
 
 
 
 
 
 
1316
 
1317
  # --- Wiring ---
1318
  admin_mode.change(
@@ -1336,7 +1680,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1336
  extract_btn.click(
1337
  fn=run_extraction,
1338
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1339
- outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1340
  )
1341
 
1342
  record_pick.change(
@@ -1460,14 +1804,68 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1460
  build_literature_explorer_tab()
1461
 
1462
  with gr.Tab("Cross-paper Synthesis"):
1463
- gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
1464
- api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1465
- model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1466
- extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1467
- synth_btn = gr.Button("Synthesize Across Papers")
1468
- synth_md = gr.Markdown()
1469
- synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1470
 
1471
  if __name__ == "__main__":
1472
  port = int(os.environ.get("PORT", "7860"))
1473
- demo.queue().launch(server_name="0.0.0.0", server_port=port)
 
10
  import pandas as pd
11
 
12
  from pypdf import PdfReader
13
+ try:
14
+ from sklearn.feature_extraction.text import TfidfVectorizer
15
+ except Exception: # pragma: no cover - fallback path for minimal runtime
16
+ TfidfVectorizer = None
17
 
18
  from openai import OpenAI
19
  from literature_explorer import build_literature_explorer_tab
20
+ from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json, write_markdown
21
+ from toxra_core.calculation_client import MCPClientError, run_batch_cancer_risk
22
+ from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS
23
+ from toxra_core.nlp_pipeline import extract_evidence_span, expand_regulatory_queries, hybrid_rank_text_items
24
+ from toxra_core.regulatory_mapper import map_extraction_to_framework
25
 
26
 
27
 
28
+ # =============================
29
+ # UI theme
30
+ # =============================
31
+ APP_CSS = """
32
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@400;500;600;700&display=swap');
33
+
34
+ :root {
35
+ --bg: #f5f7fb;
36
+ --panel: #ffffff;
37
+ --ink: #0f172a;
38
+ --muted: #516079;
39
+ --line: #e2e8f0;
40
+ --accent: #2563eb;
41
+ --accent-2: #0ea5e9;
42
+ --accent-soft: #e6efff;
43
+ --shadow: 0 10px 28px rgba(15, 23, 42, 0.08);
44
+ --radius: 14px;
45
+ }
46
+
47
+ .gradio-container {
48
+ background: var(--bg);
49
+ color: var(--ink);
50
+ font-family: "IBM Plex Sans", ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, "Helvetica Neue", Arial, "Noto Sans", "Apple Color Emoji", "Segoe UI Emoji";
51
+ }
52
+
53
+ .hero {
54
+ background: linear-gradient(180deg, #edf3ff 0%, #f4f8ff 100%);
55
+ color: var(--ink);
56
+ border-radius: 16px;
57
+ padding: 18px 22px;
58
+ box-shadow: var(--shadow);
59
+ border: 1px solid #dbe5f4;
60
+ display: flex;
61
+ align-items: center;
62
+ justify-content: space-between;
63
+ gap: 16px;
64
+ flex-wrap: wrap;
65
+ }
66
+ .hero-left { min-width: 240px; }
67
+ .hero-right { margin-left: auto; }
68
+ .hero-title { font-size: 22px; font-weight: 700; letter-spacing: 0.08em; }
69
+ .hero-sub { margin-top: 4px; font-size: 13px; color: #3b4b63; }
70
+ .hero-pills { margin-top: 10px; display: flex; gap: 8px; flex-wrap: wrap; }
71
+ .hero-pill {
72
+ background: var(--accent-soft);
73
+ color: #1e3a8a;
74
+ border: 1px solid #d6e3f6;
75
+ border-radius: 999px;
76
+ padding: 4px 10px;
77
+ font-size: 11px;
78
+ font-weight: 600;
79
+ }
80
+ .hero-status {
81
+ background: #ffffff;
82
+ color: #334155;
83
+ border: 1px solid #d9e2ef;
84
+ border-radius: 999px;
85
+ padding: 6px 12px;
86
+ font-size: 12px;
87
+ font-weight: 600;
88
+ box-shadow: 0 6px 16px rgba(15, 23, 42, 0.06);
89
+ }
90
+
91
+ .split-row { gap: 18px; }
92
+ .card {
93
+ background: var(--panel);
94
+ border: 1px solid var(--line);
95
+ border-radius: var(--radius);
96
+ padding: 16px;
97
+ box-shadow: var(--shadow);
98
+ }
99
+ .left-rail .card + .card { margin-top: 16px; }
100
+ .right-panel .card { margin-bottom: 14px; }
101
+ .section-title {
102
+ font-size: 12px;
103
+ text-transform: uppercase;
104
+ letter-spacing: 0.14em;
105
+ color: var(--muted);
106
+ margin-bottom: 8px;
107
+ }
108
+
109
+ .gradio-container input,
110
+ .gradio-container textarea,
111
+ .gradio-container select {
112
+ border-radius: 10px !important;
113
+ border-color: var(--line) !important;
114
+ }
115
+
116
+ .gradio-container button.primary {
117
+ background: var(--accent) !important;
118
+ border-color: var(--accent) !important;
119
+ }
120
+ .gradio-container button.primary:hover { background: #1d4ed8 !important; }
121
+
122
+ .gradio-container .tab-nav { gap: 8px; }
123
+ .gradio-container .tab-nav button {
124
+ background: var(--panel);
125
+ border: 1px solid var(--line);
126
+ border-radius: 999px;
127
+ padding: 6px 14px;
128
+ font-size: 12px;
129
+ color: var(--muted);
130
+ }
131
+ .gradio-container .tab-nav button.selected {
132
+ background: var(--accent);
133
+ border-color: var(--accent);
134
+ color: #ffffff;
135
+ }
136
+
137
+ .gradio-container .accordion {
138
+ border: 1px solid var(--line);
139
+ border-radius: var(--radius);
140
+ }
141
+ """
142
+
143
+
144
  # =============================
145
  # Defaults
146
  # =============================
 
360
  if not texts:
361
  return []
362
 
363
+ if TfidfVectorizer is None:
364
+ selected_idx: List[int] = []
365
+ for q in queries:
366
+ q_tokens = set([w for w in re.findall(r"[a-zA-Z0-9\\-]+", (q or "").lower()) if len(w) >= 3])
367
+ scored = []
368
+ for i, t in enumerate(texts):
369
+ tl = t.lower()
370
+ scored.append((sum(1 for tok in q_tokens if tok in tl), i))
371
+ scored.sort(key=lambda x: x[0], reverse=True)
372
+ for _, i in scored[:top_per_query]:
373
+ if i not in selected_idx:
374
+ selected_idx.append(i)
375
+ if not selected_idx:
376
+ selected_idx = list(range(min(len(chunks), max_chunks)))
377
+ return [chunks[i] for i in selected_idx[:max_chunks]]
378
+
379
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
380
  X = vectorizer.fit_transform(texts)
381
 
 
1037
  if not files:
1038
  return (
1039
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
1040
+ pd.DataFrame(), None, None, None, "Upload one or more PDFs.",
1041
  gr.update(choices=[], value=None),
1042
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1043
  )
 
1047
  except Exception as e:
1048
  return (
1049
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
1050
+ pd.DataFrame(), None, None, None, f"Controlled vocab JSON invalid: {e}",
1051
  gr.update(choices=[], value=None),
1052
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1053
  )
 
1056
  if not field_props:
1057
  return (
1058
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
1059
+ pd.DataFrame(), None, None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
1060
  gr.update(choices=[], value=None),
1061
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1062
  )
 
1076
  except Exception as e:
1077
  return (
1078
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
1079
+ pd.DataFrame(), None, None, None, str(e),
1080
  gr.update(choices=[], value=None),
1081
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1082
  )
1083
 
1084
  paper_details: List[Dict[str, Any]] = []
1085
  output_rows: List[Dict[str, Any]] = []
1086
+ nlp_diagnostics: List[Dict[str, Any]] = []
1087
 
1088
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
1089
 
 
1104
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
1105
  "evidence": []
1106
  }
1107
+ nlp_diagnostics.append(
1108
+ {
1109
+ "file": filename,
1110
+ "ranking_method": "unavailable_no_text",
1111
+ "selected_indices": [],
1112
+ "coverage_by_query_family": {},
1113
+ "coverage_score": 0.0,
1114
+ }
1115
+ )
1116
  else:
1117
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
1118
+ base_queries = [
 
1119
  "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
1120
  "chemical name CAS number",
1121
  ]
1122
+ extra_terms = [ins if ins else k for k, ins in field_instr.items()]
1123
+ queries, families = expand_regulatory_queries(
1124
+ base_queries=base_queries,
1125
+ endpoint_modules=selected_endpoints or [],
1126
+ frameworks=["FDA CTP", "EPA"],
1127
+ extra_terms=extra_terms,
1128
+ )
1129
 
1130
+ emb_mat = None
1131
+ qemb = None
1132
+ try:
1133
+ texts = [c.get("text", "") for c in chunks]
1134
+ if texts:
1135
+ emb_mat = embed_texts(client, DEFAULT_EMBEDDING_MODEL, texts)
1136
+ qemb = embed_texts(client, DEFAULT_EMBEDDING_MODEL, [" ".join(queries[:20])])[0]
1137
+ except Exception:
1138
+ emb_mat = None
1139
+ qemb = None
1140
+
1141
+ selected, diag = hybrid_rank_text_items(
1142
+ items=chunks,
1143
+ query=" ".join(queries[:20]),
1144
+ families=families,
1145
+ top_k=12,
1146
+ item_embeddings=emb_mat,
1147
+ query_embedding=qemb,
1148
+ )
1149
+ nlp_diagnostics.append(dict({"file": filename}, **diag))
1150
+ span_blocks: List[str] = []
1151
+ chars = 0
1152
+ for c in selected:
1153
+ span = extract_evidence_span(c.get("text", ""), " ".join(queries[:20]), page=None, n_sentences=5)
1154
+ snippet = span.get("text", "") or c.get("text", "")
1155
+ block = f"[pages {c.get('pages','')}]\n{snippet}\n"
1156
+ if chars + len(block) > int(max_context_chars):
1157
+ break
1158
+ span_blocks.append(block)
1159
+ chars += len(block)
1160
+ context = "\n".join(span_blocks).strip()
1161
+ if not context:
1162
+ context = build_context(selected, max_chars=int(max_context_chars))
1163
 
1164
  ex = openai_structured_extract(
1165
  client=client,
 
1243
  csv_path = tmpdir / "extraction_table.csv"
1244
  json_path = tmpdir / "extraction_details.json"
1245
  df.to_csv(csv_path, index=False)
1246
+ details_payload = {
1247
+ "papers": paper_details,
1248
+ "toxra_extensions": {
1249
+ "nlp_diagnostics": nlp_diagnostics,
1250
+ "regulatory_gap_assessment": {},
1251
+ "risk_calculation_refs": [],
1252
+ },
1253
+ }
1254
+ json_path.write_text(json.dumps(details_payload, indent=2), encoding="utf-8")
1255
+ prefilled_template_path = export_prefilled_cancer_risk_template(records)
1256
 
1257
  choices = [r.get("record_id") for r in records if r.get("record_id")]
1258
  default = choices[0] if choices else None
 
1275
  overview,
1276
  str(csv_path),
1277
  str(json_path),
1278
+ str(prefilled_template_path),
1279
  status,
1280
  gr.update(choices=choices, value=default),
1281
  records,
 
1338
  return str(path), "Reviewed CSV ready to download."
1339
 
1340
 
1341
+ # =============================
1342
+ # New modules: template, mapping, MCP batch
1343
+ # =============================
1344
+ def _load_extraction_payload(file_obj: Any) -> Tuple[Any, List[Dict[str, Any]], Dict[str, Any]]:
1345
+ if file_obj is None:
1346
+ raise ValueError("Upload extraction_details.json first.")
1347
+ payload = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
1348
+ if isinstance(payload, list):
1349
+ return payload, payload, {}
1350
+ if isinstance(payload, dict):
1351
+ papers = payload.get("papers", [])
1352
+ if not isinstance(papers, list):
1353
+ raise ValueError("Invalid extraction_details.json format: papers must be a list.")
1354
+ ext = payload.get("toxra_extensions", {})
1355
+ return payload, papers, (ext if isinstance(ext, dict) else {})
1356
+ raise ValueError("Unsupported extraction_details.json format.")
1357
+
1358
+
1359
+ def export_blank_cancer_risk_template():
1360
+ tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_"))
1361
+ path = tmpdir / "cancer_risk_input_template.csv"
1362
+ pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
1363
+ return str(path), "Blank cancer risk template ready."
1364
+
1365
+
1366
+ def export_prefilled_cancer_risk_template(records: List[Dict[str, Any]]):
1367
+ tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_prefilled_"))
1368
+ path = tmpdir / "cancer_risk_input_template_prefilled.csv"
1369
+ if not records:
1370
+ pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
1371
+ return str(path)
1372
+
1373
+ rows: List[Dict[str, Any]] = []
1374
+ seen = set()
1375
+ for r in records:
1376
+ rid = str(r.get("record_id", "")).strip()
1377
+ if not rid or rid in seen:
1378
+ continue
1379
+ seen.add(rid)
1380
+ route = str(r.get("exposure_route", "")).strip().lower()
1381
+ if route not in {"oral", "inhalation"}:
1382
+ route = ""
1383
+ casn = str(r.get("cas_numbers", "")).split(";")[0].strip()
1384
+ rows.append(
1385
+ {
1386
+ "record_id": rid,
1387
+ "chemical_name": str(r.get("chemical", "")).strip(),
1388
+ "casrn": casn,
1389
+ "route": route,
1390
+ "exposure_value": "",
1391
+ "exposure_unit": "",
1392
+ "body_weight_kg": "",
1393
+ "csf_value": "",
1394
+ "csf_unit": "",
1395
+ "iur_value": "",
1396
+ "air_conc_value": "",
1397
+ "air_conc_unit": "",
1398
+ "source_reference": str(r.get("file", "")).strip(),
1399
+ }
1400
+ )
1401
+
1402
+ df = pd.DataFrame(rows, columns=CANCER_RISK_TEMPLATE_COLUMNS)
1403
+ df.to_csv(path, index=False)
1404
+ return str(path)
1405
+
1406
+
1407
+ def run_regulatory_gap_assessment(extraction_json_file, framework: str, override_notes: str):
1408
+ if extraction_json_file is None:
1409
+ return pd.DataFrame(), "Upload extraction_details.json first.", None, None, "No input file."
1410
+ try:
1411
+ payload, _, _ = _load_extraction_payload(extraction_json_file)
1412
+ matrix_df, report, report_md = map_extraction_to_framework(
1413
+ extraction_payload=payload,
1414
+ framework=framework,
1415
+ catalog_dir="regulatory_catalog",
1416
+ override_notes=override_notes or "",
1417
+ )
1418
+ except Exception as e:
1419
+ return pd.DataFrame(), f"(assessment unavailable: {e})", None, None, str(e)
1420
+
1421
+ run_dir = make_run_dir(base_dir="runs")
1422
+ matrix_path = write_dataframe_csv(run_dir / "regulatory_gap_matrix.csv", matrix_df)
1423
+ report_path = write_json(run_dir / "regulatory_gap_report.json", report)
1424
+ write_markdown(run_dir / "regulatory_gap_report.md", report_md)
1425
+
1426
+ md = "### Regulatory Gap Summary\n" + report_md
1427
+ status = f"✅ Gap assessment complete. Covered={report.get('summary', {}).get('covered', 0)} | Missing={report.get('summary', {}).get('missing', 0)}"
1428
+ return matrix_df, md, str(matrix_path), str(report_path), status
1429
+
1430
+
1431
+ def run_cancer_risk_batch_ui(input_csv_file):
1432
+ if input_csv_file is None:
1433
+ return pd.DataFrame(), None, None, None, "Upload a populated cancer risk input CSV."
1434
+ try:
1435
+ df = pd.read_csv(input_csv_file.name)
1436
+ except Exception as e:
1437
+ return pd.DataFrame(), None, None, None, f"Could not read CSV: {e}"
1438
+
1439
+ missing = [c for c in CANCER_RISK_TEMPLATE_COLUMNS if c not in df.columns]
1440
+ if missing:
1441
+ return pd.DataFrame(), None, None, None, f"Missing required columns: {missing}"
1442
+
1443
+ run_dir = make_run_dir(base_dir="runs")
1444
+ rows = df.fillna("").to_dict("records")
1445
+
1446
+ try:
1447
+ result = run_batch_cancer_risk(rows, run_dir=str(run_dir))
1448
+ except MCPClientError as e:
1449
+ return pd.DataFrame(), None, None, None, f"MCP server unavailable: {e}"
1450
+ except Exception as e:
1451
+ return pd.DataFrame(), None, None, None, f"Calculation failed: {e}"
1452
+
1453
+ result_rows = result.get("rows", []) if isinstance(result.get("rows", []), list) else []
1454
+ out_df = pd.DataFrame(result_rows)
1455
+ result_csv_path = write_dataframe_csv(run_dir / "cancer_risk_results.csv", out_df)
1456
+ write_json(run_dir / "cancer_risk_results.json", result)
1457
+
1458
+ artifacts = result.get("artifacts", {}) if isinstance(result, dict) else {}
1459
+ log_path = artifacts.get("log_jsonl", str(run_dir / "cancer_risk_log.jsonl"))
1460
+ report_path = artifacts.get("report_md", str(run_dir / "cancer_risk_report.md"))
1461
+
1462
+ summ = result.get("summary", {})
1463
+ status = (
1464
+ f"✅ Batch complete. total={summ.get('total_rows', 0)} "
1465
+ f"ok={summ.get('ok_rows', 0)} error={summ.get('error_rows', 0)}"
1466
+ )
1467
+ return out_df, str(result_csv_path), str(log_path), str(report_path), status
1468
+
1469
+
1470
  # =============================
1471
  # Synthesis tab handler
1472
  # =============================
 
1477
  client = get_openai_client(api_key)
1478
  except Exception as e:
1479
  return str(e)
1480
+ payload = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
1481
+ rows = payload.get("papers", payload) if isinstance(payload, dict) else payload
1482
+ if not isinstance(rows, list):
1483
+ return "Invalid extraction JSON format for synthesis."
1484
  return openai_synthesize_across_papers(client, model, rows)
1485
 
1486
 
 
1498
  # =============================
1499
  # Gradio UI
1500
  # =============================
1501
+ with gr.Blocks(title="Toxicology PDF → Grounded Extractor", css=APP_CSS) as demo:
1502
+ gr.HTML(
1503
+ """
1504
+ <div class="hero">
1505
+ <div class="hero-left">
1506
+ <div class="hero-title">TOXRA.AI</div>
1507
+ <div class="hero-sub">Grounded toxicology extraction &amp; literature exploration</div>
1508
+ <div class="hero-pills">
1509
+ <span class="hero-pill">Text-based PDFs only</span>
1510
+ <span class="hero-pill">Results-first reporting</span>
1511
+ <span class="hero-pill">Admin-configurable extraction</span>
1512
+ </div>
1513
+ </div>
1514
+ <div class="hero-right">
1515
+ <span class="hero-status">Production · Beta</span>
1516
+ </div>
1517
+ </div>
1518
+ """
1519
  )
1520
 
1521
  state_records = gr.State([])
 
1527
  vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
1528
 
1529
  with gr.Tab("Extract"):
1530
+ with gr.Row(elem_classes="split-row"):
1531
+ with gr.Column(scale=4, min_width=320, elem_classes="left-rail"):
1532
+ with gr.Group(elem_classes="card"):
1533
+ gr.Markdown("Extract setup", elem_classes="section-title")
1534
+ files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1535
+
1536
+ with gr.Row():
1537
+ api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1538
+ model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1539
+
1540
+ with gr.Row():
1541
+ endpoint_preset = gr.Dropdown(
1542
+ label="Endpoint preset",
1543
+ choices=list(ENDPOINT_PRESETS.keys()),
1544
+ value="Required – Safety Assessor"
1545
+ )
1546
+ endpoints = gr.Dropdown(
1547
+ label="Endpoints to extract (Core included automatically)",
1548
+ choices=list(ENDPOINT_MODULES.keys()),
1549
+ multiselect=True,
1550
+ value=ENDPOINT_PRESETS["Required – Safety Assessor"]
1551
+ )
1552
+
1553
+ extract_btn = gr.Button("Run Extraction", variant="primary")
1554
+ status = gr.Textbox(label="Status", interactive=False)
1555
+
1556
+ with gr.Accordion("Advanced runtime settings", open=False, elem_classes="card"):
1557
+ with gr.Row():
1558
+ max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1559
+ chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1560
+ max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1561
+
1562
+ with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False, elem_classes="card"):
1563
+ admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1564
+
1565
+ admin_group = gr.Group(visible=False)
1566
+ admin_vocab_group = gr.Group(visible=False)
1567
+ admin_fields_group = gr.Group(visible=False)
1568
+
1569
+ with admin_group:
1570
+ gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
1571
+
1572
+ with admin_vocab_group:
1573
+ gr.Markdown("### Controlled vocabulary (lists only)")
1574
+ vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
1575
+ vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
1576
+
1577
+ with gr.Row():
1578
+ vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
1579
+ vocab_add_btn = gr.Button("Add")
1580
+ with gr.Row():
1581
+ vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
1582
+ vocab_remove_btn = gr.Button("Remove")
1583
+ vocab_apply_btn = gr.Button("Apply full list to category")
1584
+ vocab_reset_btn = gr.Button("Reset vocab to defaults")
1585
+
1586
+ vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
1587
+ vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
1588
+ vocab_status = gr.Textbox(label="Vocab status", interactive=False)
1589
+
1590
+ with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
1591
+ vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
1592
+
1593
+ with admin_fields_group:
1594
+ gr.Markdown("### Custom columns (Field Builder)")
1595
+ gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
1596
+
1597
+ with gr.Row():
1598
+ admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
1599
+ fields_apply_btn = gr.Button("Apply builder table")
1600
+
1601
+ with gr.Row():
1602
+ field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
1603
+ field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1604
+
1605
+ enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
1606
+ instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1607
+
1608
+ add_update_field_btn = gr.Button("Add/Update field")
1609
+
1610
+ fields_df = gr.Dataframe(
1611
+ label="Fields (edit and click Apply)",
1612
+ headers=["field","type","enum_values","instructions"],
1613
+ interactive=True,
1614
+ wrap=True
1615
+ )
1616
+
1617
+ fields_status = gr.Textbox(label="Field builder status", interactive=False)
1618
+
1619
+ with gr.Column(scale=7, min_width=480, elem_classes="right-panel"):
1620
+ with gr.Tabs(elem_classes="report-tabs"):
1621
+ with gr.Tab("Overview"):
1622
+ with gr.Group(elem_classes="card"):
1623
+ gr.Markdown("Report overview", elem_classes="section-title")
1624
+ summary_card = gr.HTML(render_summary_card("", []))
1625
+ with gr.Group(elem_classes="card"):
1626
+ overview_df = gr.Dataframe(
1627
+ label="Batch Overview",
1628
+ interactive=False,
1629
+ wrap=True,
1630
+ show_row_numbers=True
1631
+ )
1632
+
1633
+ with gr.Tab("Record"):
1634
+ with gr.Group(elem_classes="card"):
1635
+ record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
1636
+ with gr.Row():
1637
+ review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1638
+ save_btn = gr.Button("Save edits")
1639
+ export_btn = gr.Button("Export reviewed CSV")
1640
+ review_status = gr.Textbox(label="Review status", interactive=False)
1641
+ with gr.Group(elem_classes="card"):
1642
+ vertical_view = gr.Dataframe(
1643
+ headers=["Field", "Value"],
1644
+ interactive=False,
1645
+ wrap=True,
1646
+ show_row_numbers=False,
1647
+ label="Extracted fields (vertical)"
1648
+ )
1649
+
1650
+ with gr.Tab("Evidence"):
1651
+ with gr.Group(elem_classes="card"):
1652
+ evidence_md = gr.Markdown()
1653
+
1654
+ with gr.Tab("Exports"):
1655
+ with gr.Group(elem_classes="card"):
1656
+ out_csv = gr.File(label="Download: extraction_table.csv")
1657
+ out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
1658
+ risk_template_prefilled = gr.File(label="Download: cancer_risk_input_template_prefilled.csv (record_id linked)")
1659
+ reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1660
 
1661
  # --- Wiring ---
1662
  admin_mode.change(
 
1680
  extract_btn.click(
1681
  fn=run_extraction,
1682
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1683
+ outputs=[summary_card, overview_df, out_csv, out_json, risk_template_prefilled, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1684
  )
1685
 
1686
  record_pick.change(
 
1804
  build_literature_explorer_tab()
1805
 
1806
  with gr.Tab("Cross-paper Synthesis"):
1807
+ with gr.Group(elem_classes="card"):
1808
+ gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
1809
+ api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1810
+ model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1811
+ extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1812
+ synth_btn = gr.Button("Synthesize Across Papers")
1813
+ synth_md = gr.Markdown()
1814
+ synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
1815
+
1816
+ with gr.Tab("Regulatory Gap Assessment"):
1817
+ with gr.Group(elem_classes="card"):
1818
+ gr.Markdown(
1819
+ "Run clause-level mapping against regulatory catalogs. "
1820
+ "Use `extraction_details.json` from Extract tab."
1821
+ )
1822
+ with gr.Row():
1823
+ reg_extraction_json = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1824
+ reg_framework = gr.Dropdown(label="Framework profile", choices=["FDA CTP", "EPA"], value="FDA CTP")
1825
+ reg_override_notes = gr.Textbox(
1826
+ label="Override notes (optional)",
1827
+ lines=2,
1828
+ placeholder="Context to include in gap prompts."
1829
+ )
1830
+ reg_run_btn = gr.Button("Run Regulatory Gap Assessment", variant="primary")
1831
+ reg_status = gr.Textbox(label="Status", interactive=False)
1832
+ reg_summary_md = gr.Markdown()
1833
+ reg_matrix_df = gr.Dataframe(label="Clause-level gap matrix", interactive=False, wrap=True)
1834
+ reg_matrix_file = gr.File(label="Download: regulatory_gap_matrix.csv")
1835
+ reg_report_file = gr.File(label="Download: regulatory_gap_report.json")
1836
+
1837
+ reg_run_btn.click(
1838
+ fn=run_regulatory_gap_assessment,
1839
+ inputs=[reg_extraction_json, reg_framework, reg_override_notes],
1840
+ outputs=[reg_matrix_df, reg_summary_md, reg_matrix_file, reg_report_file, reg_status]
1841
+ )
1842
+
1843
+ with gr.Tab("Cancer Risk Calculator"):
1844
+ with gr.Group(elem_classes="card"):
1845
+ gr.Markdown(
1846
+ "Deterministic FDA/EPA cancer risk calculations routed through a dedicated local MCP server. "
1847
+ "Use `record_id` values from extraction outputs for traceability."
1848
+ )
1849
+ with gr.Row():
1850
+ template_btn = gr.Button("Download Blank CSV Template")
1851
+ template_file = gr.File(label="Download: cancer_risk_input_template.csv")
1852
+ template_status = gr.Textbox(label="Template status", interactive=False)
1853
+ template_btn.click(fn=export_blank_cancer_risk_template, inputs=None, outputs=[template_file, template_status])
1854
+
1855
+ risk_input_csv = gr.File(label="Upload populated cancer risk input CSV", file_types=[".csv"], file_count="single")
1856
+ risk_run_btn = gr.Button("Run Cancer Risk Batch", variant="primary")
1857
+ risk_status = gr.Textbox(label="Status", interactive=False)
1858
+ risk_results_df = gr.Dataframe(label="Cancer risk results", interactive=False, wrap=True)
1859
+ risk_results_csv = gr.File(label="Download: cancer_risk_results.csv")
1860
+ risk_log_file = gr.File(label="Download: cancer_risk_log.jsonl")
1861
+ risk_report_file = gr.File(label="Download: cancer_risk_report.md")
1862
+
1863
+ risk_run_btn.click(
1864
+ fn=run_cancer_risk_batch_ui,
1865
+ inputs=[risk_input_csv],
1866
+ outputs=[risk_results_df, risk_results_csv, risk_log_file, risk_report_file, risk_status]
1867
+ )
1868
 
1869
  if __name__ == "__main__":
1870
  port = int(os.environ.get("PORT", "7860"))
1871
+ demo.queue().launch(server_name="0.0.0.0", server_port=port)
cancer_risk_input_template.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ record_id,chemical_name,casrn,route,exposure_value,exposure_unit,body_weight_kg,csf_value,csf_unit,iur_value,air_conc_value,air_conc_unit,source_reference
literature_explorer.py CHANGED
@@ -9,6 +9,11 @@ import numpy as np
9
  import pandas as pd
10
  from pypdf import PdfReader
11
  from openai import OpenAI
 
 
 
 
 
12
 
13
 
14
  # =============================
@@ -454,39 +459,41 @@ def search(
454
  if not filtered_idx:
455
  return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
456
 
457
- ranked: List[Tuple[float, Dict[str, Any]]] = []
458
-
459
- # embeddings path
460
  if idx.get("has_embeddings") and idx.get("embeddings") is not None:
461
  try:
462
  client = get_client(api_key)
463
  qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
464
- mat = idx["embeddings"][filtered_idx, :]
465
- scores = mat @ qemb
466
- order = np.argsort(scores)[::-1][:max(1, int(top_k))]
467
- for j in order:
468
- page_i = filtered_idx[int(j)]
469
- ranked.append((float(scores[int(j)]), pages[page_i]))
470
  except Exception:
471
- ranked = []
472
-
473
- # fallback ranking
474
- if not ranked:
475
- qwords = set([w for w in re.findall(r"[a-zA-Z0-9\-]+", query.lower()) if len(w) >= 3])
476
- tmp = []
477
- for i in filtered_idx:
478
- t = (pages[i].get("text") or "").lower()
479
- hits = sum(1 for w in qwords if w in t)
480
- tmp.append((hits, pages[i]))
481
- tmp.sort(key=lambda x: x[0], reverse=True)
482
- ranked = [(float(h), r) for h, r in tmp[:max(1, int(top_k))]]
 
 
 
 
 
 
483
 
484
  rows = []
485
  evidence = []
486
- for score, r in ranked:
487
  pid = r["paper_id"]
488
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
489
- ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
 
490
  ctx_wrapped = hard_wrap(ctx, width=110)
491
 
492
  preview = ctx.strip()
@@ -495,7 +502,7 @@ def search(
495
  rows.append({
496
  "file": r.get("file",""),
497
  "page": r.get("page",""),
498
- "score": round(score, 4),
499
  "organ": org,
500
  "endpoints": "; ".join(r.get("endpoints") or []),
501
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
@@ -530,6 +537,12 @@ def search(
530
  except Exception as e:
531
  mini_summary = f"(mini-summary unavailable: {e})"
532
 
 
 
 
 
 
 
533
  mini_md = "### Grounded mini-summary\n" + mini_summary
534
  return results_df, mini_md, evidence_md
535
 
@@ -551,8 +564,8 @@ def on_select_result(df: pd.DataFrame, idx: dict, query: str, evt: gr.SelectData
551
  meta = f"**{citation}**"
552
  return meta, citation, "(page text not found)", ""
553
 
554
- ctx = expanded_context(rec.get("text",""), query, n_sentences=5)
555
- ctx = hard_wrap(ctx, width=110)
556
  full_txt = hard_wrap(rec.get("text",""), width=110)
557
 
558
  meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
@@ -646,4 +659,4 @@ def build_literature_explorer_tab():
646
  fn=citation_ready,
647
  inputs=[citation_box],
648
  outputs=[copy_status]
649
- )
 
9
  import pandas as pd
10
  from pypdf import PdfReader
11
  from openai import OpenAI
12
+ from toxra_core.nlp_pipeline import (
13
+ expand_regulatory_queries,
14
+ extract_evidence_span,
15
+ hybrid_rank_text_items,
16
+ )
17
 
18
 
19
  # =============================
 
459
  if not filtered_idx:
460
  return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
461
 
462
+ filtered_pages = [pages[i] for i in filtered_idx]
463
+ emb_mat = None
464
+ qemb = None
465
  if idx.get("has_embeddings") and idx.get("embeddings") is not None:
466
  try:
467
  client = get_client(api_key)
468
  qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
469
+ emb_mat = idx["embeddings"][filtered_idx, :]
 
 
 
 
 
470
  except Exception:
471
+ emb_mat = None
472
+ qemb = None
473
+
474
+ _, query_families = expand_regulatory_queries(
475
+ base_queries=[query],
476
+ endpoint_modules=endpoint_filter or [],
477
+ frameworks=["FDA CTP", "EPA"],
478
+ extra_terms=[],
479
+ )
480
+
481
+ ranked_pages, rank_diag = hybrid_rank_text_items(
482
+ items=filtered_pages,
483
+ query=query,
484
+ families=query_families,
485
+ top_k=max(1, int(top_k)),
486
+ item_embeddings=emb_mat,
487
+ query_embedding=qemb,
488
+ )
489
 
490
  rows = []
491
  evidence = []
492
+ for r in ranked_pages:
493
  pid = r["paper_id"]
494
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
495
+ span = extract_evidence_span(r.get("text", ""), query, page=r.get("page"), n_sentences=5)
496
+ ctx = span.get("text", "")
497
  ctx_wrapped = hard_wrap(ctx, width=110)
498
 
499
  preview = ctx.strip()
 
502
  rows.append({
503
  "file": r.get("file",""),
504
  "page": r.get("page",""),
505
+ "score": round(float(r.get("_nlp_rrf_score", 0.0)), 4),
506
  "organ": org,
507
  "endpoints": "; ".join(r.get("endpoints") or []),
508
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
 
537
  except Exception as e:
538
  mini_summary = f"(mini-summary unavailable: {e})"
539
 
540
+ if rank_diag:
541
+ mini_summary = (
542
+ f"{mini_summary}\n\n"
543
+ f"_NLP diagnostics: method={rank_diag.get('ranking_method','')}, "
544
+ f"coverage={rank_diag.get('coverage_score', 0.0)}._"
545
+ )
546
  mini_md = "### Grounded mini-summary\n" + mini_summary
547
  return results_df, mini_md, evidence_md
548
 
 
564
  meta = f"**{citation}**"
565
  return meta, citation, "(page text not found)", ""
566
 
567
+ span = extract_evidence_span(rec.get("text",""), query, page=page, n_sentences=5)
568
+ ctx = hard_wrap(span.get("text", ""), width=110)
569
  full_txt = hard_wrap(rec.get("text",""), width=110)
570
 
571
  meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
 
659
  fn=citation_ready,
660
  inputs=[citation_box],
661
  outputs=[copy_status]
662
+ )
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- gradio>=5.0.0
 
2
  pandas>=2.0.0
3
- numpy>=2.0.0
4
- pypdf>=5.0.0
5
  scikit-learn>=1.4.0
6
- openai>=1.0.0
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.26.0
3
  pandas>=2.0.0
4
+ pypdf>=4.0.0
 
5
  scikit-learn>=1.4.0
6
+ openai>=1.40.0
7
+ pytest>=8.0.0
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.11