Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -135,6 +135,20 @@ ENDPOINT_MODULES: Dict[str, List[Dict[str, Any]]] = {
|
|
| 135 |
"Carcinogenicity": PRESET_CARCINOGENICITY,
|
| 136 |
}
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 139 |
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
|
| 140 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
|
@@ -264,10 +278,6 @@ def slugify_field(name: str) -> str:
|
|
| 264 |
|
| 265 |
|
| 266 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 267 |
-
"""
|
| 268 |
-
spec lines: Field Name | type | instructions
|
| 269 |
-
types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 270 |
-
"""
|
| 271 |
props: Dict[str, Any] = {}
|
| 272 |
instr: Dict[str, str] = {}
|
| 273 |
|
|
@@ -421,54 +431,6 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
|
|
| 421 |
return resp.output_text
|
| 422 |
|
| 423 |
|
| 424 |
-
# =============================
|
| 425 |
-
# UI helpers: vertical view + evidence + overview
|
| 426 |
-
# =============================
|
| 427 |
-
def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
|
| 428 |
-
if not records or not record_id:
|
| 429 |
-
return pd.DataFrame(columns=["Field", "Value"])
|
| 430 |
-
row = next((r for r in records if r.get("record_id") == record_id), None)
|
| 431 |
-
if not row:
|
| 432 |
-
return pd.DataFrame(columns=["Field", "Value"])
|
| 433 |
-
|
| 434 |
-
hidden = {"record_id"}
|
| 435 |
-
keys = [k for k in row.keys() if k not in hidden]
|
| 436 |
-
return pd.DataFrame({"Field": keys, "Value": [row[k] for k in keys]})
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
|
| 440 |
-
if not details or not file_name:
|
| 441 |
-
return ""
|
| 442 |
-
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 443 |
-
if not d:
|
| 444 |
-
return ""
|
| 445 |
-
ev = d.get("evidence", []) or []
|
| 446 |
-
lines = []
|
| 447 |
-
for e in ev:
|
| 448 |
-
field = (e.get("field", "") or "").strip()
|
| 449 |
-
if allowed_fields is not None and field and field not in allowed_fields:
|
| 450 |
-
continue
|
| 451 |
-
quote = (e.get("quote", "") or "").strip()
|
| 452 |
-
pages = (e.get("pages", "") or "").strip()
|
| 453 |
-
if quote:
|
| 454 |
-
if len(quote) > 320:
|
| 455 |
-
quote = quote[:320] + "…"
|
| 456 |
-
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 457 |
-
if len(lines) >= max_items:
|
| 458 |
-
break
|
| 459 |
-
header = "### Evidence (grounding)\n"
|
| 460 |
-
return header + ("\n".join(lines) if lines else "- (no evidence returned)")
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
| 464 |
-
if not records:
|
| 465 |
-
return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
|
| 466 |
-
df = pd.DataFrame(records)
|
| 467 |
-
cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
|
| 468 |
-
cols = [c for c in cols if c in df.columns]
|
| 469 |
-
return df[cols].copy() if cols else df.head(50)
|
| 470 |
-
|
| 471 |
-
|
| 472 |
# =============================
|
| 473 |
# Controlled vocab editor helpers (lists only) + search filter
|
| 474 |
# =============================
|
|
@@ -572,7 +534,7 @@ def vocab_filter_preview(terms_df, search):
|
|
| 572 |
|
| 573 |
|
| 574 |
# =============================
|
| 575 |
-
# Field
|
| 576 |
# =============================
|
| 577 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 578 |
|
|
@@ -606,14 +568,17 @@ def build_spec_from_field_rows(rows: List[Dict[str, Any]]) -> str:
|
|
| 606 |
return "\n".join(lines).strip() + "\n"
|
| 607 |
|
| 608 |
|
| 609 |
-
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
|
| 610 |
selected_endpoints = selected_endpoints or []
|
| 611 |
rows: List[Dict[str, Any]] = []
|
| 612 |
field_key_to_module: Dict[str, str] = {}
|
|
|
|
| 613 |
|
| 614 |
for r in PRESET_CORE:
|
| 615 |
rows.append(dict(r))
|
| 616 |
-
|
|
|
|
|
|
|
| 617 |
|
| 618 |
for module in selected_endpoints:
|
| 619 |
preset = ENDPOINT_MODULES.get(module)
|
|
@@ -621,7 +586,9 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
|
|
| 621 |
continue
|
| 622 |
for r in preset:
|
| 623 |
rows.append(dict(r))
|
| 624 |
-
|
|
|
|
|
|
|
| 625 |
|
| 626 |
seen = set()
|
| 627 |
deduped: List[Dict[str, Any]] = []
|
|
@@ -632,21 +599,30 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
|
|
| 632 |
seen.add(k)
|
| 633 |
deduped.append(r)
|
| 634 |
|
| 635 |
-
|
| 636 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
-
|
|
|
|
| 640 |
if admin_mode:
|
| 641 |
-
|
| 642 |
-
|
|
|
|
| 643 |
df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 644 |
spec = build_spec_from_field_rows(rows)
|
| 645 |
return rows, df, spec, "✅ Columns updated from selected endpoints."
|
| 646 |
|
| 647 |
|
| 648 |
def admin_apply_endpoints(selected_endpoints: List[str]):
|
| 649 |
-
rows, _ = build_rows_from_endpoints(selected_endpoints)
|
| 650 |
df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 651 |
spec = build_spec_from_field_rows(rows)
|
| 652 |
return rows, df, spec, "✅ Loaded selected endpoints into the builder (Replace)."
|
|
@@ -706,7 +682,7 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
|
|
| 706 |
|
| 707 |
|
| 708 |
# =============================
|
| 709 |
-
# Row
|
| 710 |
# =============================
|
| 711 |
def _as_list(x) -> List[str]:
|
| 712 |
if x is None:
|
|
@@ -728,12 +704,174 @@ def _format_value(v: Any) -> Any:
|
|
| 728 |
return v
|
| 729 |
|
| 730 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
|
| 732 |
chemical = (chemical or "").strip() or "-"
|
| 733 |
endpoint = (endpoint or "").strip() or "Paper"
|
| 734 |
return f"{file_name} | {chemical} | {endpoint}"
|
| 735 |
|
| 736 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
# =============================
|
| 738 |
# Main extraction handler
|
| 739 |
# =============================
|
|
@@ -751,6 +889,7 @@ def run_extraction(
|
|
| 751 |
):
|
| 752 |
if not files:
|
| 753 |
return (
|
|
|
|
| 754 |
pd.DataFrame(), None, None, "Upload one or more PDFs.",
|
| 755 |
gr.update(choices=[], value=None),
|
| 756 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -760,6 +899,7 @@ def run_extraction(
|
|
| 760 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 761 |
except Exception as e:
|
| 762 |
return (
|
|
|
|
| 763 |
pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
|
| 764 |
gr.update(choices=[], value=None),
|
| 765 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -768,6 +908,7 @@ def run_extraction(
|
|
| 768 |
field_props, field_instr = parse_field_spec(field_spec or "")
|
| 769 |
if not field_props:
|
| 770 |
return (
|
|
|
|
| 771 |
pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 772 |
gr.update(choices=[], value=None),
|
| 773 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -777,15 +918,17 @@ def run_extraction(
|
|
| 777 |
|
| 778 |
if admin_mode:
|
| 779 |
field_key_to_module = {k: "Custom" for k in field_props.keys()}
|
|
|
|
| 780 |
endpoint_modules_for_rows = ["Custom"]
|
| 781 |
else:
|
| 782 |
-
_, field_key_to_module = build_rows_from_endpoints(selected_endpoints or [])
|
| 783 |
endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
|
| 784 |
|
| 785 |
try:
|
| 786 |
client = get_openai_client(api_key)
|
| 787 |
except Exception as e:
|
| 788 |
return (
|
|
|
|
| 789 |
pd.DataFrame(), None, None, str(e),
|
| 790 |
gr.update(choices=[], value=None),
|
| 791 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -854,32 +997,55 @@ def run_extraction(
|
|
| 854 |
if not chemicals:
|
| 855 |
chemicals = ["-"]
|
| 856 |
|
|
|
|
| 857 |
if len(chemicals) <= 1:
|
|
|
|
| 858 |
row = dict(base)
|
| 859 |
-
row["chemical"] =
|
| 860 |
row["endpoint"] = "Paper"
|
| 861 |
-
row["record_id"] = _record_id(filename,
|
| 862 |
for k in field_props.keys():
|
| 863 |
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 864 |
output_rows.append(row)
|
|
|
|
|
|
|
| 865 |
else:
|
| 866 |
core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 883 |
|
| 884 |
df = pd.DataFrame(output_rows)
|
| 885 |
records = df.to_dict("records")
|
|
@@ -893,20 +1059,20 @@ def run_extraction(
|
|
| 893 |
default = choices[0] if choices else None
|
| 894 |
|
| 895 |
vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
|
|
|
|
| 896 |
allowed_fields = None
|
|
|
|
| 897 |
if default:
|
| 898 |
selected_row = next((r for r in records if r.get("record_id") == default), {})
|
| 899 |
allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
|
|
|
|
| 900 |
|
| 901 |
-
file_for_evidence = None
|
| 902 |
-
if default:
|
| 903 |
-
file_for_evidence = default.split(" | ")[0].strip()
|
| 904 |
evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
|
| 905 |
-
|
| 906 |
overview = _overview_df_from_records(records)
|
| 907 |
status = "✅ Done. Review in the report below and export when ready."
|
| 908 |
|
| 909 |
return (
|
|
|
|
| 910 |
overview,
|
| 911 |
str(csv_path),
|
| 912 |
str(json_path),
|
|
@@ -924,11 +1090,11 @@ def run_extraction(
|
|
| 924 |
# =============================
|
| 925 |
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
|
| 926 |
if not record_id:
|
| 927 |
-
return pd.DataFrame(columns=["Field","Value"]), ""
|
| 928 |
row = next((r for r in (records or []) if r.get("record_id") == record_id), {})
|
| 929 |
file_name = (row.get("file") or "")
|
| 930 |
allowed_fields = set(row.keys()) - {"record_id"}
|
| 931 |
-
return _make_vertical(records, record_id), _render_evidence(details, file_name, allowed_fields=allowed_fields)
|
| 932 |
|
| 933 |
|
| 934 |
def toggle_review_mode(is_on: bool):
|
|
@@ -937,12 +1103,12 @@ def toggle_review_mode(is_on: bool):
|
|
| 937 |
|
| 938 |
def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 939 |
if not record_id or not records:
|
| 940 |
-
return pd.DataFrame(), records, "Nothing to save."
|
| 941 |
|
| 942 |
try:
|
| 943 |
dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 944 |
except Exception:
|
| 945 |
-
return _overview_df_from_records(records), records, "Could not parse edited vertical table."
|
| 946 |
|
| 947 |
dfv = dfv.dropna(subset=["Field"])
|
| 948 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
|
@@ -960,7 +1126,7 @@ def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str
|
|
| 960 |
new_records.append(r)
|
| 961 |
|
| 962 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 963 |
-
return _overview_df_from_records(new_records), new_records, msg
|
| 964 |
|
| 965 |
|
| 966 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
@@ -977,19 +1143,17 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
| 977 |
# =============================
|
| 978 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 979 |
if extraction_json_file is None:
|
| 980 |
-
return "Upload the extraction_details.json from
|
| 981 |
-
|
| 982 |
try:
|
| 983 |
client = get_openai_client(api_key)
|
| 984 |
except Exception as e:
|
| 985 |
return str(e)
|
| 986 |
-
|
| 987 |
rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
|
| 988 |
return openai_synthesize_across_papers(client, model, rows)
|
| 989 |
|
| 990 |
|
| 991 |
# =============================
|
| 992 |
-
#
|
| 993 |
# =============================
|
| 994 |
def set_admin_visibility(is_admin: bool):
|
| 995 |
return (
|
|
@@ -1014,10 +1178,11 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1014 |
vocab_state = gr.State({})
|
| 1015 |
field_rows_state = gr.State([])
|
| 1016 |
|
| 1017 |
-
field_spec = gr.Textbox(visible=False, interactive=False, lines=8
|
| 1018 |
-
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8
|
| 1019 |
|
| 1020 |
with gr.Tab("Extract"):
|
|
|
|
| 1021 |
with gr.Group():
|
| 1022 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 1023 |
|
|
@@ -1025,17 +1190,26 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1025 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1026 |
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1027 |
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
|
| 1035 |
extract_btn = gr.Button("Run Extraction", variant="primary")
|
| 1036 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1037 |
|
|
|
|
| 1038 |
gr.Markdown("## Report")
|
|
|
|
|
|
|
| 1039 |
overview_df = gr.Dataframe(
|
| 1040 |
label="Batch Overview",
|
| 1041 |
interactive=False,
|
|
@@ -1068,12 +1242,14 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1068 |
|
| 1069 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1070 |
|
|
|
|
| 1071 |
with gr.Accordion("Advanced runtime settings", open=False):
|
| 1072 |
with gr.Row():
|
| 1073 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 1074 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 1075 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 1076 |
|
|
|
|
| 1077 |
with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
|
| 1078 |
admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
|
| 1079 |
|
|
@@ -1082,7 +1258,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1082 |
admin_fields_group = gr.Group(visible=False)
|
| 1083 |
|
| 1084 |
with admin_group:
|
| 1085 |
-
gr.Markdown("### Admin: Configure
|
| 1086 |
|
| 1087 |
with admin_vocab_group:
|
| 1088 |
gr.Markdown("### Controlled vocabulary (lists only)")
|
|
@@ -1131,28 +1307,35 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1131 |
|
| 1132 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1133 |
|
|
|
|
| 1134 |
admin_mode.change(
|
| 1135 |
fn=set_admin_visibility,
|
| 1136 |
inputs=[admin_mode],
|
| 1137 |
outputs=[admin_group, admin_vocab_group, admin_fields_group]
|
| 1138 |
)
|
| 1139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1140 |
endpoints.change(
|
| 1141 |
fn=sync_fields_from_endpoints,
|
| 1142 |
-
inputs=[endpoints, admin_mode],
|
| 1143 |
outputs=[field_rows_state, fields_df, field_spec, status]
|
| 1144 |
)
|
| 1145 |
|
| 1146 |
extract_btn.click(
|
| 1147 |
fn=run_extraction,
|
| 1148 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1149 |
-
outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1150 |
)
|
| 1151 |
|
| 1152 |
record_pick.change(
|
| 1153 |
fn=on_pick,
|
| 1154 |
inputs=[record_pick, state_records, state_details],
|
| 1155 |
-
outputs=[vertical_view, evidence_md]
|
| 1156 |
)
|
| 1157 |
|
| 1158 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
|
@@ -1160,7 +1343,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1160 |
save_btn.click(
|
| 1161 |
fn=save_review_changes,
|
| 1162 |
inputs=[record_pick, vertical_view, state_records],
|
| 1163 |
-
outputs=[overview_df, state_records, review_status]
|
| 1164 |
)
|
| 1165 |
|
| 1166 |
export_btn.click(
|
|
@@ -1169,6 +1352,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1169 |
outputs=[reviewed_csv, review_status]
|
| 1170 |
)
|
| 1171 |
|
|
|
|
| 1172 |
vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
|
| 1173 |
|
| 1174 |
vocab_category.change(
|
|
@@ -1205,6 +1389,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1205 |
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
|
| 1206 |
)
|
| 1207 |
|
|
|
|
| 1208 |
admin_apply_endpoints_btn.click(
|
| 1209 |
fn=admin_apply_endpoints,
|
| 1210 |
inputs=[endpoints],
|
|
@@ -1223,11 +1408,12 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1223 |
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1224 |
)
|
| 1225 |
|
|
|
|
| 1226 |
def _init_all():
|
| 1227 |
vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1228 |
|
| 1229 |
-
default_endpoints = ["
|
| 1230 |
-
rows, _ = build_rows_from_endpoints(default_endpoints)
|
| 1231 |
fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 1232 |
fspec = build_spec_from_field_rows(rows)
|
| 1233 |
|
|
@@ -1264,7 +1450,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1264 |
)
|
| 1265 |
|
| 1266 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1267 |
-
gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
|
| 1268 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1269 |
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1270 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
|
|
|
| 135 |
"Carcinogenicity": PRESET_CARCINOGENICITY,
|
| 136 |
}
|
| 137 |
|
| 138 |
+
# Endpoint presets (requested)
# Named bundles of endpoint modules shown to the user as one-click presets.
# Values must match keys of ENDPOINT_MODULES; an empty list means "Core only".
ENDPOINT_PRESETS: Dict[str, List[str]] = {
    "Required – Safety Assessor": [
        "Genotoxicity (OECD TG)",
        "Repeated dose toxicity",
        "Irritation / Sensitization",
        "Repro / Developmental",
        "Acute toxicity",
    ],
    "Core only (fast)": [],
    "Screening – NAMs + Genotox": ["NAMs / In Silico", "Genotoxicity (OECD TG)"],
    # NOTE: evaluated at import time, so it captures the modules defined above.
    "Full – All endpoints": list(ENDPOINT_MODULES.keys()),
}
|
| 151 |
+
|
| 152 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 153 |
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
|
| 154 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
|
|
|
| 278 |
|
| 279 |
|
| 280 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
props: Dict[str, Any] = {}
|
| 282 |
instr: Dict[str, str] = {}
|
| 283 |
|
|
|
|
| 431 |
return resp.output_text
|
| 432 |
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
# =============================
|
| 435 |
# Controlled vocab editor helpers (lists only) + search filter
|
| 436 |
# =============================
|
|
|
|
| 534 |
|
| 535 |
|
| 536 |
# =============================
|
| 537 |
+
# Field mapping from endpoints
|
| 538 |
# =============================
|
| 539 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 540 |
|
|
|
|
| 568 |
return "\n".join(lines).strip() + "\n"
|
| 569 |
|
| 570 |
|
| 571 |
+
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str], Dict[str, List[str]]]:
|
| 572 |
selected_endpoints = selected_endpoints or []
|
| 573 |
rows: List[Dict[str, Any]] = []
|
| 574 |
field_key_to_module: Dict[str, str] = {}
|
| 575 |
+
module_to_keys: Dict[str, List[str]] = {}
|
| 576 |
|
| 577 |
for r in PRESET_CORE:
|
| 578 |
rows.append(dict(r))
|
| 579 |
+
k = slugify_field(r["field"])
|
| 580 |
+
field_key_to_module[k] = "Core"
|
| 581 |
+
module_to_keys.setdefault("Core", []).append(k)
|
| 582 |
|
| 583 |
for module in selected_endpoints:
|
| 584 |
preset = ENDPOINT_MODULES.get(module)
|
|
|
|
| 586 |
continue
|
| 587 |
for r in preset:
|
| 588 |
rows.append(dict(r))
|
| 589 |
+
k = slugify_field(r["field"])
|
| 590 |
+
field_key_to_module[k] = module
|
| 591 |
+
module_to_keys.setdefault(module, []).append(k)
|
| 592 |
|
| 593 |
seen = set()
|
| 594 |
deduped: List[Dict[str, Any]] = []
|
|
|
|
| 599 |
seen.add(k)
|
| 600 |
deduped.append(r)
|
| 601 |
|
| 602 |
+
# Rebuild module_to_keys to match deduped
|
| 603 |
+
dedup_keys = set([slugify_field(r["field"]) for r in deduped])
|
| 604 |
+
module_to_keys = {m: [k for k in ks if k in dedup_keys] for m, ks in module_to_keys.items()}
|
| 605 |
+
|
| 606 |
+
return deduped, field_key_to_module, module_to_keys
|
| 607 |
+
|
| 608 |
|
| 609 |
+
def apply_endpoint_preset(preset_name: str):
    """Resolve a named preset into a Gradio update for the endpoints selector.

    Unknown preset names fall back to an empty selection.
    """
    selection = ENDPOINT_PRESETS.get(preset_name, [])
    return gr.update(value=selection)
|
| 612 |
|
| 613 |
+
|
| 614 |
+
def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool, current_rows: List[Dict[str, Any]], current_spec: str):
    """Refresh the field-builder state when the endpoint selection changes.

    In admin mode the custom columns are left untouched and simply echoed
    back; otherwise rows, dataframe, and spec are rebuilt from the selected
    endpoint modules. Returns (rows, dataframe, spec, status message).
    """
    cols = ["field", "type", "enum_values", "instructions"]
    if admin_mode:
        # Admin-defined columns win over endpoint-derived ones.
        return (
            current_rows,
            pd.DataFrame(current_rows or [], columns=cols),
            current_spec,
            "Admin mode: endpoint selection will not overwrite custom columns.",
        )
    rebuilt_rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    rebuilt_spec = build_spec_from_field_rows(rebuilt_rows)
    return (
        rebuilt_rows,
        pd.DataFrame(rebuilt_rows, columns=cols),
        rebuilt_spec,
        "✅ Columns updated from selected endpoints.",
    )
|
| 622 |
|
| 623 |
|
| 624 |
def admin_apply_endpoints(selected_endpoints: List[str]):
    """Load the selected endpoint modules into the admin field builder (replace mode).

    Returns (rows, dataframe, spec, status message).
    """
    preset_rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    preset_spec = build_spec_from_field_rows(preset_rows)
    frame = pd.DataFrame(preset_rows, columns=["field", "type", "enum_values", "instructions"])
    return preset_rows, frame, preset_spec, "✅ Loaded selected endpoints into the builder (Replace)."
|
|
|
|
| 682 |
|
| 683 |
|
| 684 |
# =============================
|
| 685 |
+
# Row building + “non-empty module” logic
|
| 686 |
# =============================
|
| 687 |
def _as_list(x) -> List[str]:
|
| 688 |
if x is None:
|
|
|
|
| 704 |
return v
|
| 705 |
|
| 706 |
|
| 707 |
+
EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
def _is_empty_value(v: Any) -> bool:
|
| 711 |
+
if v is None:
|
| 712 |
+
return True
|
| 713 |
+
if isinstance(v, float) and np.isnan(v):
|
| 714 |
+
return True
|
| 715 |
+
if isinstance(v, list):
|
| 716 |
+
cleaned = [str(x).strip() for x in v if str(x).strip()]
|
| 717 |
+
if not cleaned:
|
| 718 |
+
return True
|
| 719 |
+
# empty if all items are not_reported / similar
|
| 720 |
+
return all((c.lower() in EMPTY_STRINGS) for c in cleaned)
|
| 721 |
+
s = str(v).strip()
|
| 722 |
+
if not s:
|
| 723 |
+
return True
|
| 724 |
+
return s.lower() in EMPTY_STRINGS
|
| 725 |
+
|
| 726 |
+
|
| 727 |
def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
|
| 728 |
chemical = (chemical or "").strip() or "-"
|
| 729 |
endpoint = (endpoint or "").strip() or "Paper"
|
| 730 |
return f"{file_name} | {chemical} | {endpoint}"
|
| 731 |
|
| 732 |
|
| 733 |
+
def _module_has_any_data(ext: Dict[str, Any], module_keys: List[str], field_props: Dict[str, Any]) -> bool:
    """Return True if the extraction holds a non-empty value for any key of the module.

    ``field_props`` is accepted for signature compatibility with callers but
    is not consulted here.
    """
    return any(not _is_empty_value(ext.get(key)) for key in (module_keys or []))
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
# =============================
|
| 742 |
+
# Evidence + report helpers
|
| 743 |
+
# =============================
|
| 744 |
+
def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
|
| 745 |
+
if not records or not record_id:
|
| 746 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 747 |
+
row = next((r for r in records if r.get("record_id") == record_id), None)
|
| 748 |
+
if not row:
|
| 749 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 750 |
+
|
| 751 |
+
hidden = {"record_id"}
|
| 752 |
+
keys = [k for k in row.keys() if k not in hidden]
|
| 753 |
+
return pd.DataFrame({"Field": keys, "Value": [row.get(k, "") for k in keys]})
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
|
| 757 |
+
if not details or not file_name:
|
| 758 |
+
return ""
|
| 759 |
+
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 760 |
+
if not d:
|
| 761 |
+
return ""
|
| 762 |
+
ev = d.get("evidence", []) or []
|
| 763 |
+
lines = []
|
| 764 |
+
for e in ev:
|
| 765 |
+
field = (e.get("field", "") or "").strip()
|
| 766 |
+
if allowed_fields is not None and field and field not in allowed_fields:
|
| 767 |
+
continue
|
| 768 |
+
quote = (e.get("quote", "") or "").strip()
|
| 769 |
+
pages = (e.get("pages", "") or "").strip()
|
| 770 |
+
if quote:
|
| 771 |
+
if len(quote) > 320:
|
| 772 |
+
quote = quote[:320] + "…"
|
| 773 |
+
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 774 |
+
if len(lines) >= max_items:
|
| 775 |
+
break
|
| 776 |
+
header = "### Evidence (grounding)\n"
|
| 777 |
+
return header + ("\n".join(lines) if lines else "- (no evidence returned)")
|
| 778 |
+
|
| 779 |
+
|
| 780 |
+
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
| 781 |
+
if not records:
|
| 782 |
+
return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
|
| 783 |
+
df = pd.DataFrame(records)
|
| 784 |
+
cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
|
| 785 |
+
cols = [c for c in cols if c in df.columns]
|
| 786 |
+
return df[cols].copy() if cols else df.head(50)
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
def _risk_badge(risk: str) -> str:
|
| 790 |
+
r = (risk or "").strip().lower()
|
| 791 |
+
if r == "acceptable":
|
| 792 |
+
bg = "#e7f7ed"; fg = "#0f5132"
|
| 793 |
+
elif r == "acceptable_with_uncertainty":
|
| 794 |
+
bg = "#fff3cd"; fg = "#664d03"
|
| 795 |
+
elif r == "not_acceptable":
|
| 796 |
+
bg = "#f8d7da"; fg = "#842029"
|
| 797 |
+
else:
|
| 798 |
+
bg = "#e2e3e5"; fg = "#41464b"
|
| 799 |
+
label = risk if risk else "unknown"
|
| 800 |
+
return f'<span style="background:{bg};color:{fg};padding:4px 10px;border-radius:999px;font-weight:600;font-size:12px;">{label}</span>'
|
| 801 |
+
|
| 802 |
+
|
| 803 |
+
def _safe_str(x: Any) -> str:
|
| 804 |
+
if x is None:
|
| 805 |
+
return ""
|
| 806 |
+
if isinstance(x, float) and np.isnan(x):
|
| 807 |
+
return ""
|
| 808 |
+
return str(x)
|
| 809 |
+
|
| 810 |
+
|
| 811 |
+
def render_summary_card(record_id: str, records: List[Dict[str, Any]]) -> str:
    """Render the Executive Summary HTML card for the selected record.

    Returns a self-contained HTML snippet (inline styles only). Falls back
    to placeholder cards when nothing has been extracted yet or when the
    record id cannot be found in the session records.
    """
    if not record_id or not records:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Run extraction to view results.</div></div>"

    row = next((r for r in records if r.get("record_id") == record_id), None)
    if not row:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Select a record.</div></div>"

    title = _safe_str(row.get("paper_title", "")).strip() or "Untitled paper"
    file_name = _safe_str(row.get("file", ""))
    chemical = _safe_str(row.get("chemical", "-"))
    endpoint = _safe_str(row.get("endpoint", "Paper"))
    risk = _safe_str(row.get("risk_stance", ""))
    conf = row.get("risk_confidence", "")
    try:
        # Show confidence with 2 decimals when it parses as a float;
        # otherwise fall back to the raw (stringified) value below.
        conf_txt = f"{float(conf):.2f}" if conf != "" else ""
    except Exception:
        conf_txt = _safe_str(conf)

    key_findings = _safe_str(row.get("key_findings", "")).strip()
    dose_metrics = _safe_str(row.get("dose_metrics", "")).strip()
    conclusion = _safe_str(row.get("conclusion", "")).strip()

    # Keep compact
    def _clip(s: str, n: int = 380) -> str:
        # Truncate long narrative text so the card stays readable.
        s = s.strip()
        if len(s) <= n:
            return s
        return s[:n] + "…"

    return f"""
<div style="border:1px solid #eaeaea;padding:14px;border-radius:12px;">
  <div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;">
    <div style="font-weight:700;font-size:16px;">Executive Summary</div>
    <div>{_risk_badge(risk)} <span style="margin-left:10px;color:#666;font-size:12px;">confidence: {conf_txt}</span></div>
  </div>

  <div style="margin-top:10px;">
    <div style="font-weight:650;">{title}</div>
    <div style="color:#666;font-size:12px;margin-top:4px;">
      <span><b>File:</b> {file_name}</span> •
      <span><b>Chemical:</b> {chemical}</span> •
      <span><b>Endpoint:</b> {endpoint}</span>
    </div>
  </div>

  <div style="margin-top:12px;display:grid;grid-template-columns:1fr;gap:10px;">
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Key Findings</div>
      <div style="color:#222;">{_clip(key_findings) if key_findings else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Dose Metrics</div>
      <div style="color:#222;">{_clip(dose_metrics) if dose_metrics else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Conclusion</div>
      <div style="color:#222;">{_clip(conclusion) if conclusion else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
  </div>
</div>
"""
|
| 873 |
+
|
| 874 |
+
|
| 875 |
# =============================
|
| 876 |
# Main extraction handler
|
| 877 |
# =============================
|
|
|
|
| 889 |
):
|
| 890 |
if not files:
|
| 891 |
return (
|
| 892 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
|
| 893 |
pd.DataFrame(), None, None, "Upload one or more PDFs.",
|
| 894 |
gr.update(choices=[], value=None),
|
| 895 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 899 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 900 |
except Exception as e:
|
| 901 |
return (
|
| 902 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
|
| 903 |
pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
|
| 904 |
gr.update(choices=[], value=None),
|
| 905 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 908 |
field_props, field_instr = parse_field_spec(field_spec or "")
|
| 909 |
if not field_props:
|
| 910 |
return (
|
| 911 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
|
| 912 |
pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 913 |
gr.update(choices=[], value=None),
|
| 914 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 918 |
|
| 919 |
if admin_mode:
|
| 920 |
field_key_to_module = {k: "Custom" for k in field_props.keys()}
|
| 921 |
+
module_to_keys: Dict[str, List[str]] = {"Custom": list(field_props.keys())}
|
| 922 |
endpoint_modules_for_rows = ["Custom"]
|
| 923 |
else:
|
| 924 |
+
_, field_key_to_module, module_to_keys = build_rows_from_endpoints(selected_endpoints or [])
|
| 925 |
endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
|
| 926 |
|
| 927 |
try:
|
| 928 |
client = get_openai_client(api_key)
|
| 929 |
except Exception as e:
|
| 930 |
return (
|
| 931 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
|
| 932 |
pd.DataFrame(), None, None, str(e),
|
| 933 |
gr.update(choices=[], value=None),
|
| 934 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 997 |
if not chemicals:
|
| 998 |
chemicals = ["-"]
|
| 999 |
|
| 1000 |
+
# Single-chemical => one-row-per-paper
|
| 1001 |
if len(chemicals) <= 1:
|
| 1002 |
+
chem = chemicals[0]
|
| 1003 |
row = dict(base)
|
| 1004 |
+
row["chemical"] = chem
|
| 1005 |
row["endpoint"] = "Paper"
|
| 1006 |
+
row["record_id"] = _record_id(filename, chem, row["endpoint"])
|
| 1007 |
for k in field_props.keys():
|
| 1008 |
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1009 |
output_rows.append(row)
|
| 1010 |
+
|
| 1011 |
+
# Multi-chemical => chemical–endpoint rows (ONLY non-empty modules)
|
| 1012 |
else:
|
| 1013 |
core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
|
| 1014 |
+
|
| 1015 |
+
# determine which endpoint modules have any data (skip empty ones)
|
| 1016 |
+
candidate_modules = [m for m in endpoint_modules_for_rows if m != "Core"]
|
| 1017 |
+
non_empty_modules = []
|
| 1018 |
+
for m in candidate_modules:
|
| 1019 |
+
if _module_has_any_data(ext, module_to_keys.get(m, []), field_props):
|
| 1020 |
+
non_empty_modules.append(m)
|
| 1021 |
+
|
| 1022 |
+
# If everything empty, fall back to a single Paper row (otherwise you get no rows)
|
| 1023 |
+
if not non_empty_modules:
|
| 1024 |
+
row = dict(base)
|
| 1025 |
+
row["chemical"] = "multiple"
|
| 1026 |
+
row["endpoint"] = "Paper"
|
| 1027 |
+
row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
|
| 1028 |
+
for k in field_props.keys():
|
| 1029 |
+
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1030 |
+
output_rows.append(row)
|
| 1031 |
+
else:
|
| 1032 |
+
for chem in chemicals:
|
| 1033 |
+
for module in non_empty_modules:
|
| 1034 |
+
row = dict(base)
|
| 1035 |
+
row["chemical"] = chem
|
| 1036 |
+
row["endpoint"] = module
|
| 1037 |
+
row["record_id"] = _record_id(filename, chem, module)
|
| 1038 |
+
|
| 1039 |
+
for k in field_props.keys():
|
| 1040 |
+
m = field_key_to_module.get(k, "Custom")
|
| 1041 |
+
include = (k in core_keys) or (m == module) or admin_mode
|
| 1042 |
+
if include:
|
| 1043 |
+
if k == "chemicals":
|
| 1044 |
+
row[k] = chem
|
| 1045 |
+
else:
|
| 1046 |
+
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1047 |
+
|
| 1048 |
+
output_rows.append(row)
|
| 1049 |
|
| 1050 |
df = pd.DataFrame(output_rows)
|
| 1051 |
records = df.to_dict("records")
|
|
|
|
| 1059 |
default = choices[0] if choices else None
|
| 1060 |
|
| 1061 |
vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
|
| 1062 |
+
summary_html = render_summary_card(default, records) if default else render_summary_card("", [])
|
| 1063 |
allowed_fields = None
|
| 1064 |
+
file_for_evidence = None
|
| 1065 |
if default:
|
| 1066 |
selected_row = next((r for r in records if r.get("record_id") == default), {})
|
| 1067 |
allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
|
| 1068 |
+
file_for_evidence = (default.split(" | ")[0] or "").strip()
|
| 1069 |
|
|
|
|
|
|
|
|
|
|
| 1070 |
evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
|
|
|
|
| 1071 |
overview = _overview_df_from_records(records)
|
| 1072 |
status = "✅ Done. Review in the report below and export when ready."
|
| 1073 |
|
| 1074 |
return (
|
| 1075 |
+
summary_html,
|
| 1076 |
overview,
|
| 1077 |
str(csv_path),
|
| 1078 |
str(json_path),
|
|
|
|
| 1090 |
# =============================
|
| 1091 |
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the summary card, vertical field table, and evidence panel
    whenever a record is selected in the dropdown."""
    if not record_id:
        # Nothing selected: reset all three output widgets.
        return render_summary_card("", []), pd.DataFrame(columns=["Field", "Value"]), ""

    # Find the selected record; fall back to an empty dict when absent.
    selected: Dict[str, Any] = {}
    for rec in records or []:
        if rec.get("record_id") == record_id:
            selected = rec
            break

    evidence_file = selected.get("file") or ""
    visible_fields = {k for k in selected if k != "record_id"}

    summary = render_summary_card(record_id, records)
    vertical = _make_vertical(records, record_id)
    evidence = _render_evidence(details, evidence_file, allowed_fields=visible_fields)
    return summary, vertical, evidence
|
| 1098 |
|
| 1099 |
|
| 1100 |
def toggle_review_mode(is_on: bool):
|
|
|
|
| 1103 |
|
| 1104 |
def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 1105 |
if not record_id or not records:
|
| 1106 |
+
return pd.DataFrame(), records, "Nothing to save.", render_summary_card("", [])
|
| 1107 |
|
| 1108 |
try:
|
| 1109 |
dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 1110 |
except Exception:
|
| 1111 |
+
return _overview_df_from_records(records), records, "Could not parse edited vertical table.", render_summary_card(record_id, records)
|
| 1112 |
|
| 1113 |
dfv = dfv.dropna(subset=["Field"])
|
| 1114 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
|
|
|
| 1126 |
new_records.append(r)
|
| 1127 |
|
| 1128 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 1129 |
+
return _overview_df_from_records(new_records), new_records, msg, render_summary_card(record_id, new_records)
|
| 1130 |
|
| 1131 |
|
| 1132 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
|
|
| 1143 |
# =============================
|
| 1144 |
def run_synthesis(api_key, model, extraction_json_file):
    """Run cross-paper synthesis over an exported extraction_details.json.

    Args:
        api_key: OpenAI API key (may be empty when set via environment secret).
        model: Model name to use for synthesis.
        extraction_json_file: Uploaded file object (has a .name path) or None.

    Returns:
        The synthesis text, or a human-readable error message. This handler
        deliberately reports failures as strings rather than raising, so the
        Gradio output textbox always receives something displayable.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json from Extract tab first."
    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)
    # Guard file reading/parsing: a malformed or non-UTF-8 upload previously
    # raised out of the handler instead of reporting like the paths above.
    try:
        rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
    except Exception as e:
        return f"Could not read extraction JSON: {e}"
    return openai_synthesize_across_papers(client, model, rows)
|
| 1153 |
|
| 1154 |
|
| 1155 |
# =============================
|
| 1156 |
+
# Admin visibility helpers
|
| 1157 |
# =============================
|
| 1158 |
def set_admin_visibility(is_admin: bool):
|
| 1159 |
return (
|
|
|
|
| 1178 |
vocab_state = gr.State({})
|
| 1179 |
field_rows_state = gr.State([])
|
| 1180 |
|
| 1181 |
+
field_spec = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1182 |
+
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1183 |
|
| 1184 |
with gr.Tab("Extract"):
|
| 1185 |
+
# --- Run section (simple) ---
|
| 1186 |
with gr.Group():
|
| 1187 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 1188 |
|
|
|
|
| 1190 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1191 |
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1192 |
|
| 1193 |
+
with gr.Row():
|
| 1194 |
+
endpoint_preset = gr.Dropdown(
|
| 1195 |
+
label="Endpoint preset",
|
| 1196 |
+
choices=list(ENDPOINT_PRESETS.keys()),
|
| 1197 |
+
value="Required – Safety Assessor"
|
| 1198 |
+
)
|
| 1199 |
+
endpoints = gr.Dropdown(
|
| 1200 |
+
label="Endpoints to extract (Core included automatically)",
|
| 1201 |
+
choices=list(ENDPOINT_MODULES.keys()),
|
| 1202 |
+
multiselect=True,
|
| 1203 |
+
value=ENDPOINT_PRESETS["Required – Safety Assessor"]
|
| 1204 |
+
)
|
| 1205 |
|
| 1206 |
extract_btn = gr.Button("Run Extraction", variant="primary")
|
| 1207 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1208 |
|
| 1209 |
+
# --- Report (results-first) ---
|
| 1210 |
gr.Markdown("## Report")
|
| 1211 |
+
summary_card = gr.HTML(render_summary_card("", []))
|
| 1212 |
+
|
| 1213 |
overview_df = gr.Dataframe(
|
| 1214 |
label="Batch Overview",
|
| 1215 |
interactive=False,
|
|
|
|
| 1242 |
|
| 1243 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1244 |
|
| 1245 |
+
# --- Advanced runtime settings (collapsed) ---
|
| 1246 |
with gr.Accordion("Advanced runtime settings", open=False):
|
| 1247 |
with gr.Row():
|
| 1248 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 1249 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 1250 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 1251 |
|
| 1252 |
+
# --- Admin tools (collapsed) ---
|
| 1253 |
with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
|
| 1254 |
admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
|
| 1255 |
|
|
|
|
| 1258 |
admin_fields_group = gr.Group(visible=False)
|
| 1259 |
|
| 1260 |
with admin_group:
|
| 1261 |
+
gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
|
| 1262 |
|
| 1263 |
with admin_vocab_group:
|
| 1264 |
gr.Markdown("### Controlled vocabulary (lists only)")
|
|
|
|
| 1307 |
|
| 1308 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1309 |
|
| 1310 |
+
# --- Wiring ---
|
| 1311 |
admin_mode.change(
|
| 1312 |
fn=set_admin_visibility,
|
| 1313 |
inputs=[admin_mode],
|
| 1314 |
outputs=[admin_group, admin_vocab_group, admin_fields_group]
|
| 1315 |
)
|
| 1316 |
|
| 1317 |
+
endpoint_preset.change(
|
| 1318 |
+
fn=apply_endpoint_preset,
|
| 1319 |
+
inputs=[endpoint_preset],
|
| 1320 |
+
outputs=[endpoints]
|
| 1321 |
+
)
|
| 1322 |
+
|
| 1323 |
endpoints.change(
|
| 1324 |
fn=sync_fields_from_endpoints,
|
| 1325 |
+
inputs=[endpoints, admin_mode, field_rows_state, field_spec],
|
| 1326 |
outputs=[field_rows_state, fields_df, field_spec, status]
|
| 1327 |
)
|
| 1328 |
|
| 1329 |
extract_btn.click(
|
| 1330 |
fn=run_extraction,
|
| 1331 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1332 |
+
outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1333 |
)
|
| 1334 |
|
| 1335 |
record_pick.change(
|
| 1336 |
fn=on_pick,
|
| 1337 |
inputs=[record_pick, state_records, state_details],
|
| 1338 |
+
outputs=[summary_card, vertical_view, evidence_md]
|
| 1339 |
)
|
| 1340 |
|
| 1341 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
|
|
|
| 1343 |
save_btn.click(
|
| 1344 |
fn=save_review_changes,
|
| 1345 |
inputs=[record_pick, vertical_view, state_records],
|
| 1346 |
+
outputs=[overview_df, state_records, review_status, summary_card]
|
| 1347 |
)
|
| 1348 |
|
| 1349 |
export_btn.click(
|
|
|
|
| 1352 |
outputs=[reviewed_csv, review_status]
|
| 1353 |
)
|
| 1354 |
|
| 1355 |
+
# Admin vocab wiring
|
| 1356 |
vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
|
| 1357 |
|
| 1358 |
vocab_category.change(
|
|
|
|
| 1389 |
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
|
| 1390 |
)
|
| 1391 |
|
| 1392 |
+
# Admin field builder wiring
|
| 1393 |
admin_apply_endpoints_btn.click(
|
| 1394 |
fn=admin_apply_endpoints,
|
| 1395 |
inputs=[endpoints],
|
|
|
|
| 1408 |
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1409 |
)
|
| 1410 |
|
| 1411 |
+
# Init
|
| 1412 |
def _init_all():
|
| 1413 |
vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1414 |
|
| 1415 |
+
default_endpoints = ENDPOINT_PRESETS["Required – Safety Assessor"]
|
| 1416 |
+
rows, _, _ = build_rows_from_endpoints(default_endpoints)
|
| 1417 |
fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 1418 |
fspec = build_spec_from_field_rows(rows)
|
| 1419 |
|
|
|
|
| 1450 |
)
|
| 1451 |
|
| 1452 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1453 |
+
gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
|
| 1454 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1455 |
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1456 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|