Spaces:

hchevva
/

NLP_Project

Running

App Files Community

hchevva committed 10 days ago

Commit

ddb431d

verified ·

1 Parent(s): 6766619

Update app.py

Browse files

Files changed (1) hide show

app.py +480 -347

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 import json
 import tempfile
 from pathlib import Path
-from typing import Dict, List, Tuple, Any
 import gradio as gr
 import numpy as np
@@ -23,7 +23,6 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
-  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
   "in_silico_method_enum": [
     "qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
   ],
@@ -36,8 +35,6 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
   "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
-  "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
   "genotoxicity_oecd_tg_in_vitro_enum": [
     "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
     "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
@@ -54,81 +51,98 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
     "not_reported"
   ],
-  "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
 }"""
-# (Used only as a fallback / advanced preview)
-DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
-# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
-Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
-CAS_numbers | list[str] | Extract any CAS numbers mentioned.
-Approach | enum[in_vivo,in_vitro,in_silico,nams,mixed,not_reported] | Identify if results are in silico or NAMs; use 'mixed' if multiple.
-In_silico_methods | list[enum[qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported]] | If in_silico, list methods used (can be multiple).
-NAMs_methods | list[enum[high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported]] | If NAMs, list methods used (can be multiple).
-Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
-Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
-Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
-Genotox_OECD_TG_in_vitro | list[enum[
-  OECD_TG_471_Bacterial Reverse mutation test(AMES test),
-  OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,
-  OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),
-  OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,
-  OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),
-  not_reported
-]] | If genotoxicity in vitro tests are reported, select all applicable TGs. Otherwise not_reported.
-Genotox_OECD_TG_in_vivo | list[enum[
-  OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,
-  OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,
-  OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,
-  OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,
-  not_reported
-]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
-Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results. If unclear, not_reported.
-Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
-Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
-Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
-Conclusion | str | What does the paper conclude about safety/risk?
-"""
 # =============================
-# Field presets (UI)
 # =============================
 PRESET_CORE = [
-    {"field": "Chemical(s)", "type": "list[str]", "enum_values": "", "instructions": "Primary chemical(s) studied; include common name + abbreviation if present."},
-    {"field": "CAS_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract any CAS numbers mentioned."},
-    {"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
-    {"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
-    {"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
-    {"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
-    {"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
-    {"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
 ]
 PRESET_NAMS_INSILICO = [
-    {"field": "Approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use 'mixed' if multiple."},
-    {"field": "In_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (can be multiple)."},
-    {"field": "NAMs_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (can be multiple)."},
 ]
 PRESET_GENOTOX_OECD = [
-    {"field": "Genotox_OECD_TG_in_vitro", "type": "list[enum]", "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported", "instructions": "If in vitro genotox tests are reported, select TGs. Otherwise not_reported."},
-    {"field": "Genotox_OECD_TG_in_vivo", "type": "list[enum]", "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported", "instructions": "If in vivo genotox tests are reported, select TGs. Otherwise not_reported."},
-    {"field": "Genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify based on reported results. If unclear, not_reported."},
-    {"field": "Genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to the paper’s wording + test context."},
 ]
-PRESET_MAP = {
-    "Core (recommended)": PRESET_CORE,
-    "NAMs + In Silico": PRESET_NAMS_INSILICO,
-    "Genotox (OECD TGs)": PRESET_GENOTOX_OECD,
 }
@@ -243,10 +257,10 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
 # Spec -> JSON schema
 # =============================
 def slugify_field(name: str) -> str:
-    name = name.strip()
     name = re.sub(r"[^\w\s-]", "", name)
     name = re.sub(r"[\s-]+", "_", name).lower()
-    return name[:60] if name else "field"
 def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
@@ -317,7 +331,7 @@ def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any])
                 "type": "object",
                 "additionalProperties": False,
                 "properties": field_props,
-                "required": all_field_keys  # strict requirement
             },
             "evidence": {
                 "type": "array",
@@ -359,15 +373,13 @@ def openai_structured_extract(
     vocab_text = json.dumps(controlled_vocab, indent=2)
     system_msg = (
-        "You are a toxicology research paper data-extraction assistant.\n"
         "Grounding rules (must follow):\n"
         "1) Use ONLY the provided excerpts; do NOT invent details.\n"
-        "2) If a value is not explicitly stated, output empty string or empty list (or an allowed enum like 'not_reported').\n"
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
-        "6) For OECD TG fields, only populate if explicitly stated or clearly described; otherwise use not_reported.\n"
-        "7) For NAMs/in_silico fields, only populate if explicitly described; otherwise not_reported.\n"
     )
     user_msg = (
@@ -400,7 +412,7 @@ def openai_structured_extract(
 def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
     system_msg = (
-        "You are a senior toxicology scientist summarizing multiple papers.\n"
         "Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
         "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
     )
@@ -412,16 +424,19 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
 # =============================
 # UI helpers: vertical view + evidence + overview
 # =============================
-def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
-    if not records or not file_name:
         return pd.DataFrame(columns=["Field", "Value"])
-    row = next((r for r in records if r.get("file") == file_name), None)
     if not row:
         return pd.DataFrame(columns=["Field", "Value"])
-    return pd.DataFrame({"Field": list(row.keys()), "Value": [row[k] for k in row.keys()]})
-def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: int = 80) -> str:
     if not details or not file_name:
         return ""
     d = next((x for x in details if x.get("_file") == file_name), None)
@@ -429,29 +444,33 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
         return ""
     ev = d.get("evidence", []) or []
     lines = []
-    for e in ev[:max_items]:
         quote = (e.get("quote", "") or "").strip()
         pages = (e.get("pages", "") or "").strip()
-        field = (e.get("field", "") or "").strip()
         if quote:
-            if len(quote) > 280:
-                quote = quote[:280] + "…"
             lines.append(f"- **{field}** (pages {pages}): “{quote}”")
     header = "### Evidence (grounding)\n"
     return header + ("\n".join(lines) if lines else "- (no evidence returned)")
 def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
     if not records:
-        return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
     df = pd.DataFrame(records)
-    cols = ["file","paper_title","risk_stance","risk_confidence"]
     cols = [c for c in cols if c in df.columns]
     return df[cols].copy() if cols else df.head(50)
 # =============================
-# Controlled vocab guided editor (lists only) + SEARCH FILTER
 # =============================
 def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
     if df is None or df.empty:
@@ -473,7 +492,13 @@ def vocab_init_state(vocab_json: str):
     default_key = list_keys[0] if list_keys else None
     terms = vocab.get(default_key, []) if default_key else []
     full_df = pd.DataFrame({"term": terms})
-    return vocab, list_keys, default_key, full_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
 def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
@@ -538,10 +563,6 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
     return vjson, filtered, f"✅ Applied {len(terms)} terms to {category}."
-def vocab_reset_defaults():
-    return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
 def vocab_filter_preview(terms_df, search):
     try:
         df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
@@ -551,18 +572,18 @@ def vocab_filter_preview(terms_df, search):
 # =============================
-# Field builder (type dropdown + presets)
 # =============================
 TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
-def build_spec_from_field_df(df: pd.DataFrame) -> str:
     lines = [
         "# One field per line: Field Name | type | instructions",
         "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
         ""
     ]
-    for _, r in df.iterrows():
         field = str(r.get("field","")).strip()
         ftype = str(r.get("type","")).strip()
         enums = str(r.get("enum_values","")).strip()
@@ -585,38 +606,50 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
     return "\n".join(lines).strip() + "\n"
-def fields_init_state():
-    fields = []
-    for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD):
-        fields.append(dict(row))
-    df = pd.DataFrame(fields, columns=["field","type","enum_values","instructions"])
-    spec = build_spec_from_field_df(df)
-    return fields, df, spec, "✅ Field builder loaded."
-def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
-    preset = PRESET_MAP.get(preset_name)
-    if not preset:
-        df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
-        return field_rows, df, build_spec_from_field_df(df), "Unknown preset."
-    if mode == "Replace":
-        new_rows = [dict(r) for r in preset]
-    else:
-        new_rows = [dict(r) for r in field_rows]
-        for p in preset:
-            found = False
-            for r in new_rows:
-                if str(r.get("field","")).strip().lower() == str(p.get("field","")).strip().lower():
-                    r.update(p)
-                    found = True
-                    break
-            if not found:
-                new_rows.append(dict(p))
-    df = pd.DataFrame(new_rows, columns=["field","type","enum_values","instructions"])
-    spec = build_spec_from_field_df(df)
-    return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
 def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
@@ -627,7 +660,7 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
     if not field_name or not ftype:
         df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
-        return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
     updated = False
     for r in field_rows:
@@ -642,8 +675,7 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
         field_rows.append({"field": field_name, "type": ftype, "enum_values": enum_values, "instructions": instructions})
     df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
-    spec = build_spec_from_field_df(df)
-    return field_rows, df, spec, ("Updated field." if updated else "Added field.")
 def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
@@ -651,7 +683,7 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
         df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
     except Exception:
         df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
-        return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
     cleaned = []
     seen = set()
@@ -669,10 +701,39 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
         cleaned.append({"field": field, "type": ftype, "enum_values": enums, "instructions": instr})
     df2 = pd.DataFrame(cleaned, columns=["field","type","enum_values","instructions"])
-    spec = build_spec_from_field_df(df2)
     return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
 # =============================
 # Main extraction handler
 # =============================
@@ -680,33 +741,58 @@ def run_extraction(
     files,
     api_key,
     model,
     field_spec,
     vocab_json,
     max_pages,
     chunk_chars,
-    max_context_chars
 ):
     if not files:
-        return pd.DataFrame(), None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
     try:
         vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
     except Exception as e:
-        return pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
-    field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
     if not field_props:
-        return pd.DataFrame(), None, None, "Extraction spec produced no fields.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
     schema = build_extraction_schema(field_props, vocab)
     try:
         client = get_openai_client(api_key)
     except Exception as e:
-        return pd.DataFrame(), None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
-    results: List[Dict[str, Any]] = []
-    flat_rows: List[Dict[str, Any]] = []
     tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
@@ -723,21 +809,26 @@ def run_extraction(
                 "paper_title": "",
                 "risk_stance": "insufficient_data",
                 "risk_confidence": 0.0,
-                "risk_summary": "No extractable text found. This app supports text-based PDFs only.",
                 "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
                 "evidence": []
             }
-            results.append(ex)
         else:
             chunks = chunk_pages(pages, target_chars=int(chunk_chars))
-            queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
             for k, ins in field_instr.items():
                 queries.append(ins if ins else k)
             selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
             context = build_context(selected, max_chars=int(max_context_chars))
-            extracted = openai_structured_extract(
                 client=client,
                 model=model,
                 schema=schema,
@@ -745,42 +836,76 @@ def run_extraction(
                 field_instructions=field_instr,
                 context=context
             )
-            extracted["_file"] = filename
-            extracted["_pages_in_pdf"] = page_count
-            results.append(extracted)
-        ex = results[-1]
-        row = {
             "file": filename,
-            "paper_title": ex.get("paper_title",""),
-            "risk_stance": ex.get("risk_stance",""),
-            "risk_confidence": ex.get("risk_confidence",""),
-            "risk_summary": ex.get("risk_summary","")
         }
         ext = ex.get("extracted") or {}
-        for k in field_props.keys():
-            v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
-            if isinstance(v, list):
-                row[k] = "; ".join([str(x) for x in v])
-            else:
-                row[k] = v
-        flat_rows.append(row)
-    df = pd.DataFrame(flat_rows)
     records = df.to_dict("records")
     csv_path = tmpdir / "extraction_table.csv"
     json_path = tmpdir / "extraction_details.json"
     df.to_csv(csv_path, index=False)
-    json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
-    choices = [r["file"] for r in records if "file" in r]
     default = choices[0] if choices else None
-    vertical = _make_vertical(records, default)
-    evidence = _render_evidence(results, default)
     overview = _overview_df_from_records(records)
-    status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
     return (
         overview,
         str(csv_path),
@@ -788,7 +913,7 @@ def run_extraction(
         status,
         gr.update(choices=choices, value=default),
         records,
-        results,
         vertical,
         evidence
     )
@@ -797,16 +922,21 @@ def run_extraction(
 # =============================
 # Review mode handlers
 # =============================
-def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
-    return _make_vertical(records, file_name), _render_evidence(details, file_name)
 def toggle_review_mode(is_on: bool):
     return gr.update(interactive=bool(is_on))
-def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
-    if not file_name or not records:
         return pd.DataFrame(), records, "Nothing to save."
     try:
@@ -820,7 +950,7 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
     new_records = []
     updated = False
     for r in records:
-        if r.get("file") == file_name:
             rr = dict(r)
             for k, v in updates.items():
                 rr[k] = v
@@ -858,77 +988,189 @@ def run_synthesis(api_key, model, extraction_json_file):
     return openai_synthesize_across_papers(client, model, rows)
 # =============================
 # Gradio UI
 # =============================
 with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
     gr.Markdown(
-        "# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
-        "**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
-        "UI includes a guided **Controlled Vocab editor** (lists only, with search) and a **Field Builder** (type dropdown + presets)."
     )
-    # States
-    state_records = gr.State([])     # list[dict]
-    state_details = gr.State([])     # list[dict]
-    vocab_state = gr.State({})       # dict
-    field_rows_state = gr.State([])  # list[dict]
     with gr.Tab("Extract"):
-        files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
-        with gr.Row():
-            api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
-            model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
-        with gr.Row():
-            max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
-            chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
-            max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
-        # -------------------------
-        # Controlled Vocabulary (guided editor)
-        # -------------------------
-        gr.Markdown("## Controlled Vocabulary (guided editor)")
-        vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
-        vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
         with gr.Row():
-            vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
-            vocab_add_btn = gr.Button("Add")
         with gr.Row():
-            vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
-            vocab_remove_btn = gr.Button("Remove")
-            vocab_apply_btn = gr.Button("Apply full list to category")
-            vocab_reset_btn = gr.Button("Reset vocab to defaults")
-        vocab_terms_df = gr.Dataframe(
-            headers=["term"],
-            label="Terms (full list; edit directly)",
-            interactive=True,
-            wrap=True
         )
-        vocab_terms_filtered = gr.Dataframe(
-            headers=["term"],
-            label="Filtered preview (read-only)",
-            interactive=False,
-            wrap=True
         )
-        vocab_status = gr.Textbox(label="Vocab status", interactive=False)
-        with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
-            vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
-        # Filter preview wiring (must be AFTER vocab_terms_df exists)
-        vocab_search.change(
-            fn=vocab_filter_preview,
-            inputs=[vocab_terms_df, vocab_search],
-            outputs=[vocab_terms_filtered]
         )
         vocab_category.change(
             fn=vocab_load_category,
             inputs=[vocab_state, vocab_category, vocab_search],
@@ -950,53 +1192,22 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
         vocab_apply_btn.click(
             fn=vocab_apply_df,
             inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
-            outputs=[vocab_json, vocab_terms_filtered, vocab_status]
         )
         vocab_reset_btn.click(
-            fn=vocab_reset_defaults,
             inputs=None,
-            outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
-        ).then(
-            fn=vocab_load_category,
-            inputs=[vocab_state, vocab_category, vocab_search],
-            outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
-        )
-        # -------------------------
-        # Field Builder
-        # -------------------------
-        gr.Markdown("## Extraction Spec (Field Builder)")
-        with gr.Row():
-            preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
-            preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
-            preset_btn = gr.Button("Load preset")
-        with gr.Row():
-            field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
-            field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
-        enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
-        instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
-        add_update_field_btn = gr.Button("Add/Update field")
-        fields_df = gr.Dataframe(
-            label="Fields (edit and click Apply)",
-            headers=["field","type","enum_values","instructions"],
-            interactive=True,
-            wrap=True
         )
-        fields_apply_btn = gr.Button("Apply builder table")
-        fields_status = gr.Textbox(label="Field builder status", interactive=False)
-        with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
-            field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
-        preset_btn.click(
-            fn=fields_load_preset,
-            inputs=[preset_name, preset_mode, field_rows_state],
             outputs=[field_rows_state, fields_df, field_spec, fields_status]
         )
@@ -1012,88 +1223,26 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
             outputs=[field_rows_state, fields_df, field_spec, fields_status]
         )
-        # -------------------------
-        # Run extraction
-        # -------------------------
-        extract_btn = gr.Button("Run Extraction (Grounded)")
-        status = gr.Textbox(label="Status", interactive=False)
-        overview_df = gr.Dataframe(
-            label="Batch Overview (compact)",
-            interactive=False,
-            wrap=True,
-            show_row_numbers=True,
-            buttons=["fullscreen", "copy"]
-        )
-        with gr.Row():
-            out_csv = gr.File(label="Download: extraction_table.csv")
-            out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
-        gr.Markdown("## Readable view (vertical) + evidence")
-        record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
-        with gr.Row():
-            review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
-            save_btn = gr.Button("Save edits")
-            export_btn = gr.Button("Export reviewed CSV")
-        review_status = gr.Textbox(label="Review status", interactive=False)
-        vertical_view = gr.Dataframe(
-            headers=["Field", "Value"],
-            interactive=False,
-            wrap=True,
-            show_row_numbers=False,
-            label="Vertical record view (Field → Value)"
-        )
-        evidence_md = gr.Markdown()
-        reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
-        extract_btn.click(
-            fn=run_extraction,
-            inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
-            outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
-        )
-        record_pick.change(
-            fn=on_pick,
-            inputs=[record_pick, state_records, state_details],
-            outputs=[vertical_view, evidence_md]
-        )
-        review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
-        save_btn.click(
-            fn=save_review_changes,
-            inputs=[record_pick, vertical_view, state_records],
-            outputs=[overview_df, state_records, review_status]
-        )
-        export_btn.click(
-            fn=export_reviewed_csv,
-            inputs=[state_records],
-            outputs=[reviewed_csv, review_status]
-        )
-        # -------------------------
-        # Initialize vocab + fields on load
-        # -------------------------
-        def _init_all():
-            v, keys, k0, full_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
-            filtered_df = _filter_terms_df(full_df, "")
-            frows, fdf, fspec, fmsg = fields_init_state()
             return (
-                v,
                 gr.update(choices=keys, value=k0),
                 full_df,
                 filtered_df,
                 vjson,
                 vmsg,
-                frows,
                 fdf,
                 fspec,
-                fmsg
             )
         demo.load(
@@ -1104,12 +1253,13 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
                 vocab_category,
                 vocab_terms_df,
                 vocab_terms_filtered,
-                vocab_json,
                 vocab_status,
                 field_rows_state,
                 fields_df,
                 field_spec,
-                fields_status
             ]
         )
@@ -1122,23 +1272,6 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
         synth_md = gr.Markdown()
         synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
-    with gr.Tab("Pending tasks"):
-        gr.Markdown(
-            "## Pending tasks\n\n"
-            "1) One row per chemical–endpoint pair\n"
-            "- Change schema to output `records[]` and flatten into multiple rows per paper\n\n"
-            "2) Evidence verification\n"
-            "- If evidence quote not found in context → blank value + flag UNVERIFIED\n\n"
-            "3) Taxonomy mapping\n"
-            "- Synonyms + preferred terms for FDA / OECD / MedDRA-like structure\n\n"
-            "4) Column transforms\n"
-            "- Parse NOAEL/LOAEL etc into structured {metric,value,unit,route,duration}\n\n"
-            "5) Compare mode\n"
-            "- Compare across papers by chemical/endpoint, output consensus + disagreements table\n\n"
-            "6) OCR (optional)\n"
-            "- Currently: text-based PDFs only; OCR adds heavy deps"
-        )
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", "7860"))
     demo.queue().launch(server_name="0.0.0.0", server_port=port)

 import json
 import tempfile
 from pathlib import Path
+from typing import Dict, List, Tuple, Any, Optional
 import gradio as gr
 import numpy as np
   "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
   "in_silico_method_enum": [
     "qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
   ],
   "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
   "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
   "genotoxicity_oecd_tg_in_vitro_enum": [
     "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
     "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
     "not_reported"
   ],
+  "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"],
+  "binary_result_enum": ["positive","negative","equivocal","not_reported"],
+  "carcinogenicity_result_enum": ["carcinogenic","not_carcinogenic","insufficient_data","not_reported"]
 }"""
 # =============================
+# Endpoint modules (what users choose)
 # =============================
 PRESET_CORE = [
+    # Core rows: build_rows_from_endpoints() always includes these first and labels them "Core".
+    # Row keys: "field" (column name), "type", "enum_values" (comma-separated, empty for non-enum
+    # types) and "instructions" (free-text guidance passed along with the field).
+    {"field": "chemicals", "type": "list[str]", "enum_values": "", "instructions": "List chemical(s) studied. If multiple, include each separately."},
+    {"field": "cas_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract CAS number(s) mentioned (may be multiple)."},
+    {"field": "study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other,not_reported", "instructions": "Choose best match."},
+    {"field": "exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
+    {"field": "species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
+    {"field": "dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Capture NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units and route if available."},
+    {"field": "key_findings", "type": "str", "enum_values": "", "instructions": "2–4 short sentences summarizing major findings. Grounded to text."},
+    {"field": "conclusion", "type": "str", "enum_values": "", "instructions": "Paper's conclusion about safety/risk (grounded)."},
 ]
 PRESET_NAMS_INSILICO = [
+    # Rows for the "NAMs / In Silico" endpoint module (see ENDPOINT_MODULES).
+    # The list[enum] fields accept several values from their comma-separated enum_values.
+    {"field": "approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use mixed if multiple."},
+    {"field": "in_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (multiple allowed)."},
+    {"field": "nams_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (multiple allowed)."},
+    {"field": "nams_or_insilico_key_results", "type": "str", "enum_values": "", "instructions": "Summarize in silico / NAMs results and key metrics (grounded)."},
 ]
 PRESET_GENOTOX_OECD = [
+    # Rows for the "Genotoxicity (OECD TG)" endpoint module: the in vitro and in vivo
+    # OECD test-guideline lists, the overall result, and free-text notes.
+    {
+        "field": "genotox_oecd_tg_in_vitro",
+        "type": "list[enum]",
+        "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported",
+        "instructions": "Select all in vitro OECD TGs explicitly reported (or clearly described). If none, use not_reported."
+    },
+    {
+        "field": "genotox_oecd_tg_in_vivo",
+        "type": "list[enum]",
+        "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
+        "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
+    },
+    {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
+    {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
 ]
+PRESET_ACUTE_TOX = [
+    # Rows for the "Acute toxicity" endpoint module: outcome class, key metrics, notes.
+    {"field": "acute_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "If acute toxicity is assessed, classify as positive/negative/equivocal; otherwise not_reported."},
+    {"field": "acute_toxicity_key_metrics", "type": "list[str]", "enum_values": "", "instructions": "Extract LD50/LC50/EC50/IC50 etc with units/route/species if available."},
+    {"field": "acute_toxicity_notes", "type": "str", "enum_values": "", "instructions": "Grounded summary of acute toxicity findings."},
+]
+PRESET_REPEATED_DOSE = [
+    # Rows for the "Repeated dose toxicity" endpoint module: NOAEL/LOAEL values,
+    # target organs, and notes.
+    {"field": "repeated_dose_noael_loael", "type": "list[str]", "enum_values": "", "instructions": "Extract NOAEL/LOAEL (and study duration) with units/route if available."},
+    {"field": "repeated_dose_target_organs", "type": "list[str]", "enum_values": "", "instructions": "List target organs/critical effects explicitly reported."},
+    {"field": "repeated_dose_notes", "type": "str", "enum_values": "", "instructions": "Grounded summary of repeated-dose toxicity conclusions."},
+]
+PRESET_IRR_SENS = [
+    # Rows for the "Irritation / Sensitization" endpoint module. The three result fields
+    # share the positive/negative/equivocal/not_reported value set.
+    {"field": "skin_irritation_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Skin irritation outcome (as reported)."},
+    {"field": "eye_irritation_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Eye irritation outcome (as reported)."},
+    {"field": "skin_sensitization_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Skin sensitization outcome (as reported)."},
+    {"field": "irritation_sensitization_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including method/model if stated."},
+]
+PRESET_REPRO_DEV = [
+    # Rows for the "Repro / Developmental" endpoint module: one result field per
+    # endpoint plus shared notes.
+    {"field": "reproductive_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Reproductive toxicity outcome (as reported)."},
+    {"field": "developmental_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Developmental toxicity outcome (as reported)."},
+    {"field": "repro_dev_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including endpoints and study design if stated."},
+]
+PRESET_CARCINOGENICITY = [
+    # Rows for the "Carcinogenicity" endpoint module. The result values match the
+    # "carcinogenicity_result_enum" list in DEFAULT_CONTROLLED_VOCAB_JSON.
+    {"field": "carcinogenicity_result", "type": "enum", "enum_values": "carcinogenic,not_carcinogenic,insufficient_data,not_reported", "instructions": "As reported. If evidence insufficient, insufficient_data."},
+    {"field": "carcinogenicity_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including species, duration, tumor findings if stated."},
+]
+ENDPOINT_MODULES: Dict[str, List[Dict[str, Any]]] = {
+    # Endpoint label -> preset field rows. The labels are the choices offered by the
+    # "Endpoints to extract" dropdown and the keys looked up in build_rows_from_endpoints().
+    # PRESET_CORE is deliberately absent: it is always included, not user-selectable.
+    "Genotoxicity (OECD TG)": PRESET_GENOTOX_OECD,
+    "NAMs / In Silico": PRESET_NAMS_INSILICO,
+    "Acute toxicity": PRESET_ACUTE_TOX,
+    "Repeated dose toxicity": PRESET_REPEATED_DOSE,
+    "Irritation / Sensitization": PRESET_IRR_SENS,
+    "Repro / Developmental": PRESET_REPRO_DEV,
+    "Carcinogenicity": PRESET_CARCINOGENICITY,
+}
+ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
+    # Extra retrieval queries per endpoint label (same labels as ENDPOINT_MODULES).
+    # The extraction handler appends the hints of every selected endpoint to its query list.
+    # The genotoxicity hint names every OECD TG number used by the in vitro and in vivo
+    # enum lists, including 475 and 488, so chunks citing those guidelines are retrievable.
+    "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 475 488 489"],
+    "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
+    "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
+    "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
+    "Irritation / Sensitization": ["skin irritation", "eye irritation", "sensitization", "LLNA", "Draize"],
+    "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogenic", "prenatal", "postnatal"],
+    "Carcinogenicity": ["carcinogenicity", "tumor", "neoplasm", "cancer", "two-year bioassay"],
 }
 # Spec -> JSON schema
 # =============================
 def slugify_field(name: str) -> str:
+    """Turn a field label into a lowercase, underscore-separated key.
+
+    Surrounding whitespace is stripped, characters other than word characters,
+    whitespace and hyphens are dropped, and each run of whitespace/hyphens
+    becomes a single underscore. The result is cut to 80 characters; an empty
+    result (including a ``None`` or blank input) falls back to ``"field"``.
+    """
+    name = (name or "").strip()
     name = re.sub(r"[^\w\s-]", "", name)
     name = re.sub(r"[\s-]+", "_", name).lower()
+    return name[:80] if name else "field"
 def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
                 "type": "object",
                 "additionalProperties": False,
                 "properties": field_props,
+                "required": all_field_keys
             },
             "evidence": {
                 "type": "array",
     vocab_text = json.dumps(controlled_vocab, indent=2)
     system_msg = (
+        "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
         "Grounding rules (must follow):\n"
         "1) Use ONLY the provided excerpts; do NOT invent details.\n"
+        "2) If a value is not explicitly stated, output empty string or empty list, OR the enum value 'not_reported'/'insufficient_data' when applicable.\n"
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
     )
     user_msg = (
 def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
     system_msg = (
+        "You are a senior toxicology safety assessor summarizing multiple papers.\n"
         "Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
         "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
     )
 # =============================
 # UI helpers: vertical view + evidence + overview
 # =============================
+def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
+    """Build the two-column (Field, Value) table for a single record.
+
+    Returns an empty Field/Value frame when ``records`` or ``record_id`` is
+    empty, or when no record has a matching ``"record_id"``.
+    """
+    if not records or not record_id:
         return pd.DataFrame(columns=["Field", "Value"])
+    # First record whose id matches; None when there is no match.
+    row = next((r for r in records if r.get("record_id") == record_id), None)
     if not row:
         return pd.DataFrame(columns=["Field", "Value"])
+    # record_id is only the lookup key, so it is left out of the displayed rows.
+    hidden = {"record_id"}
+    keys = [k for k in row.keys() if k not in hidden]
+    return pd.DataFrame({"Field": keys, "Value": [row[k] for k in keys]})
+def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
     if not details or not file_name:
         return ""
     d = next((x for x in details if x.get("_file") == file_name), None)
         return ""
     ev = d.get("evidence", []) or []
     lines = []
+    for e in ev:
+        field = (e.get("field", "") or "").strip()
+        if allowed_fields is not None and field and field not in allowed_fields:
+            continue
         quote = (e.get("quote", "") or "").strip()
         pages = (e.get("pages", "") or "").strip()
         if quote:
+            if len(quote) > 320:
+                quote = quote[:320] + "…"
             lines.append(f"- **{field}** (pages {pages}): “{quote}”")
+        if len(lines) >= max_items:
+            break
     header = "### Evidence (grounding)\n"
     return header + ("\n".join(lines) if lines else "- (no evidence returned)")
 def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
+    """Build the batch overview table from the flat record dicts.
+
+    With no records, returns an empty frame that still has the overview
+    columns. Otherwise returns a copy restricted to whichever overview columns
+    exist in the data; if none of them exist, falls back to the first 50 rows
+    of the full frame.
+    """
     if not records:
+        return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
     df = pd.DataFrame(records)
+    cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
+    # Keep only the overview columns actually present, preserving the order above.
     cols = [c for c in cols if c in df.columns]
     return df[cols].copy() if cols else df.head(50)
 # =============================
+# Controlled vocab editor helpers (lists only) + search filter
 # =============================
 def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
     if df is None or df.empty:
     default_key = list_keys[0] if list_keys else None
     terms = vocab.get(default_key, []) if default_key else []
     full_df = pd.DataFrame({"term": terms})
+    filtered_df = _filter_terms_df(full_df, "")
+    return vocab, list_keys, default_key, full_df, filtered_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
+def vocab_reset_defaults_ui():
+    """Rebuild the vocabulary editor state from DEFAULT_CONTROLLED_VOCAB_JSON.
+
+    Returns a 7-tuple: vocab dict, category dropdown update (choices + default
+    selection), full terms frame, filtered terms frame, vocab JSON text, status
+    message, and the vocab JSON text a second time.
+    """
+    # NOTE(review): the JSON is returned twice, presumably for two JSON textboxes — confirm against the wiring.
+    defaults = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
+    vocab, keys, k0, full_df, filtered_df, vjson, msg = defaults
+    category_update = gr.update(choices=keys, value=k0)
+    return vocab, category_update, full_df, filtered_df, vjson, msg, vjson
 def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
     return vjson, filtered, f"✅ Applied {len(terms)} terms to {category}."
 def vocab_filter_preview(terms_df, search):
     try:
         df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
 # =============================
+# Field builder (admin) + endpoint selection mapping
 # =============================
 TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
+def build_spec_from_field_rows(rows: List[Dict[str, Any]]) -> str:
     lines = [
         "# One field per line: Field Name | type | instructions",
         "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
         ""
     ]
+    for r in rows:
         field = str(r.get("field","")).strip()
         ftype = str(r.get("type","")).strip()
         enums = str(r.get("enum_values","")).strip()
     return "\n".join(lines).strip() + "\n"
+def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
+    """Assemble the field rows for the Core preset plus the selected endpoint modules.
+
+    Args:
+        selected_endpoints: Endpoint labels (keys of ENDPOINT_MODULES). ``None``
+            is treated as empty; unknown labels are ignored.
+
+    Returns:
+        ``(rows, field_key_to_module)``. ``rows`` holds copies of the preset
+        rows, Core first, de-duplicated case-insensitively by field name with
+        the first occurrence kept. ``field_key_to_module`` maps each kept
+        field's slug to the module that contributed the kept row.
+    """
+    # Core always comes first so that it wins any field-name clash with a module.
+    sources: List[Tuple[str, List[Dict[str, Any]]]] = [("Core", PRESET_CORE)]
+    for module in selected_endpoints or []:
+        preset = ENDPOINT_MODULES.get(module)
+        if preset:
+            sources.append((module, preset))
+
+    rows: List[Dict[str, Any]] = []
+    field_key_to_module: Dict[str, str] = {}
+    seen = set()
+    for module, preset in sources:
+        for r in preset:
+            dedupe_key = str(r.get("field", "")).strip().lower()
+            if not dedupe_key or dedupe_key in seen:
+                continue
+            seen.add(dedupe_key)
+            rows.append(dict(r))
+            # First-wins, matching the row that was kept. Overwriting here would label a
+            # retained Core row with whichever later module repeated the same field.
+            field_key_to_module.setdefault(slugify_field(r["field"]), module)
+    return rows, field_key_to_module
+def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool):
+    """Refresh the field rows, table and spec when the endpoint selection changes.
+
+    In admin mode nothing is overwritten: three no-op Gradio updates are
+    returned with an explanatory message. Otherwise returns the preset rows,
+    their DataFrame, the generated spec text, and a status message.
+    """
+    if not admin_mode:
+        preset_rows, _module_map = build_rows_from_endpoints(selected_endpoints)
+        table = pd.DataFrame(preset_rows, columns=["field","type","enum_values","instructions"])
+        spec_text = build_spec_from_field_rows(preset_rows)
+        return preset_rows, table, spec_text, "✅ Columns updated from selected endpoints."
+    untouched = (gr.update(), gr.update(), gr.update())
+    return (*untouched, "Admin mode: endpoint selection will not overwrite custom columns.")
+def admin_apply_endpoints(selected_endpoints: List[str]):
+    """Replace the builder contents with the rows of the selected endpoints.
+
+    Returns the preset rows, their DataFrame, the generated spec text, and a
+    status message.
+    """
+    preset_rows = build_rows_from_endpoints(selected_endpoints)[0]
+    table = pd.DataFrame(preset_rows, columns=["field","type","enum_values","instructions"])
+    spec_text = build_spec_from_field_rows(preset_rows)
+    message = "✅ Loaded selected endpoints into the builder (Replace)."
+    return preset_rows, table, spec_text, message
 def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
     if not field_name or not ftype:
         df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
+        return field_rows, df, build_spec_from_field_rows(field_rows), "Field name and type are required."
     updated = False
     for r in field_rows:
         field_rows.append({"field": field_name, "type": ftype, "enum_values": enum_values, "instructions": instructions})
     df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
+    return field_rows, df, build_spec_from_field_rows(field_rows), ("Updated field." if updated else "Added field.")
 def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
         df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
     except Exception:
         df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
+        return field_rows, df, build_spec_from_field_rows(field_rows), "Could not parse builder table."
     cleaned = []
     seen = set()
         cleaned.append({"field": field, "type": ftype, "enum_values": enums, "instructions": instr})
     df2 = pd.DataFrame(cleaned, columns=["field","type","enum_values","instructions"])
+    spec = build_spec_from_field_rows(cleaned)
     return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
+# =============================
+# Row-building logic (paper vs chemical-endpoint)
+# =============================
+def _as_list(x) -> List[str]:
+    """Normalize a value to a list of non-empty, stripped strings.
+
+    ``None`` gives ``[]``. A list is converted element by element; any other
+    value is treated as a single element. Elements that are blank after
+    ``str()`` and ``strip()`` are dropped.
+    """
+    if x is None:
+        return []
+    items = x if isinstance(x, list) else [x]
+    stripped = (str(item).strip() for item in items)
+    return [text for text in stripped if text]
+def _format_value(v: Any) -> Any:
+    """Render a list as a "; "-joined string; return any other value unchanged.
+
+    List elements that are blank after ``str()`` are skipped. Kept elements are
+    joined as-is, without stripping.
+    """
+    if not isinstance(v, list):
+        return v
+    kept = []
+    for item in v:
+        text = str(item)
+        if text.strip():
+            kept.append(text)
+    return "; ".join(kept)
+def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
+    """Compose the "file | chemical | endpoint" identifier for a record.
+
+    A blank or missing chemical becomes "-", and a blank or missing endpoint
+    becomes "Paper"; both are stripped of surrounding whitespace.
+    """
+    chem_label = (chemical or "").strip()
+    if not chem_label:
+        chem_label = "-"
+    endpoint_label = (endpoint or "").strip()
+    if not endpoint_label:
+        endpoint_label = "Paper"
+    return "{} | {} | {}".format(file_name, chem_label, endpoint_label)
 # =============================
 # Main extraction handler
 # =============================
     files,
     api_key,
     model,
+    selected_endpoints,
     field_spec,
     vocab_json,
     max_pages,
     chunk_chars,
+    max_context_chars,
+    admin_mode
 ):
     if not files:
+        return (
+            pd.DataFrame(), None, None, "Upload one or more PDFs.",
+            gr.update(choices=[], value=None),
+            [], [], pd.DataFrame(columns=["Field","Value"]), ""
+        )
     try:
         vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
     except Exception as e:
+        return (
+            pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
+            gr.update(choices=[], value=None),
+            [], [], pd.DataFrame(columns=["Field","Value"]), ""
+        )
+    field_props, field_instr = parse_field_spec(field_spec or "")
     if not field_props:
+        return (
+            pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
+            gr.update(choices=[], value=None),
+            [], [], pd.DataFrame(columns=["Field","Value"]), ""
+        )
     schema = build_extraction_schema(field_props, vocab)
+    if admin_mode:
+        field_key_to_module = {k: "Custom" for k in field_props.keys()}
+        endpoint_modules_for_rows = ["Custom"]
+    else:
+        _, field_key_to_module = build_rows_from_endpoints(selected_endpoints or [])
+        endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
     try:
         client = get_openai_client(api_key)
     except Exception as e:
+        return (
+            pd.DataFrame(), None, None, str(e),
+            gr.update(choices=[], value=None),
+            [], [], pd.DataFrame(columns=["Field","Value"]), ""
+        )
+    paper_details: List[Dict[str, Any]] = []
+    output_rows: List[Dict[str, Any]] = []
     tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
                 "paper_title": "",
                 "risk_stance": "insufficient_data",
                 "risk_confidence": 0.0,
+                "risk_summary": "No extractable text found. This app supports text-based PDFs only (not scanned images).",
                 "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
                 "evidence": []
             }
         else:
             chunks = chunk_pages(pages, target_chars=int(chunk_chars))
+            queries = [
+                "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
+                "chemical name CAS number",
+            ]
+            for ep in (selected_endpoints or []):
+                queries.extend(ENDPOINT_QUERY_HINTS.get(ep, []))
             for k, ins in field_instr.items():
                 queries.append(ins if ins else k)
             selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
             context = build_context(selected, max_chars=int(max_context_chars))
+            ex = openai_structured_extract(
                 client=client,
                 model=model,
                 schema=schema,
                 field_instructions=field_instr,
                 context=context
             )
+            ex["_file"] = filename
+            ex["_pages_in_pdf"] = page_count
+        paper_details.append(ex)
+        base = {
             "file": filename,
+            "paper_title": ex.get("paper_title", ""),
+            "risk_stance": ex.get("risk_stance", ""),
+            "risk_confidence": ex.get("risk_confidence", ""),
+            "risk_summary": ex.get("risk_summary", ""),
         }
         ext = ex.get("extracted") or {}
+        chemicals = _as_list(ext.get("chemicals"))
+        if not chemicals:
+            chemicals = ["-"]
+        if len(chemicals) <= 1:
+            row = dict(base)
+            row["chemical"] = chemicals[0]
+            row["endpoint"] = "Paper"
+            row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
+            for k in field_props.keys():
+                row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
+            output_rows.append(row)
+        else:
+            core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
+            for chem in chemicals:
+                for module in endpoint_modules_for_rows:
+                    row = dict(base)
+                    row["chemical"] = chem
+                    row["endpoint"] = module
+                    row["record_id"] = _record_id(filename, chem, module)
+                    for k in field_props.keys():
+                        m = field_key_to_module.get(k, "Custom")
+                        include = (k in core_keys) or (m == module) or admin_mode
+                        if include:
+                            if k == "chemicals":
+                                row[k] = chem  # make per-row chemical consistent
+                            else:
+                                row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
+                    output_rows.append(row)
+    df = pd.DataFrame(output_rows)
     records = df.to_dict("records")
     csv_path = tmpdir / "extraction_table.csv"
     json_path = tmpdir / "extraction_details.json"
     df.to_csv(csv_path, index=False)
+    json_path.write_text(json.dumps(paper_details, indent=2), encoding="utf-8")
+    choices = [r.get("record_id") for r in records if r.get("record_id")]
     default = choices[0] if choices else None
+    vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
+    allowed_fields = None
+    if default:
+        selected_row = next((r for r in records if r.get("record_id") == default), {})
+        allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
+    file_for_evidence = None
+    if default:
+        file_for_evidence = default.split(" | ")[0].strip()
+    evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
     overview = _overview_df_from_records(records)
+    status = "✅ Done. Review in the report below and export when ready."
     return (
         overview,
         str(csv_path),
         status,
         gr.update(choices=choices, value=default),
         records,
+        paper_details,
         vertical,
         evidence
     )
 # =============================
 # Review mode handlers
 # =============================
+def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
+    """Return the vertical table and evidence markdown for the picked record.
+
+    With no ``record_id`` an empty Field/Value frame and an empty string are
+    returned. Evidence is looked up by the record's ``"file"`` value and
+    limited to the record's own keys (excluding ``"record_id"``).
+    """
+    if record_id:
+        picked: Dict[str, Any] = {}
+        for rec in records or []:
+            if rec.get("record_id") == record_id:
+                picked = rec
+                break
+        source_file = picked.get("file") or ""
+        visible_fields = set(picked.keys()) - {"record_id"}
+        vertical = _make_vertical(records, record_id)
+        evidence = _render_evidence(details, source_file, allowed_fields=visible_fields)
+        return vertical, evidence
+    return pd.DataFrame(columns=["Field","Value"]), ""
 def toggle_review_mode(is_on: bool):
+    """Return a Gradio update that sets ``interactive`` to the truthiness of ``is_on``."""
     return gr.update(interactive=bool(is_on))
+def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
+    if not record_id or not records:
         return pd.DataFrame(), records, "Nothing to save."
     try:
     new_records = []
     updated = False
     for r in records:
+        if r.get("record_id") == record_id:
             rr = dict(r)
             for k, v in updates.items():
                 rr[k] = v
     return openai_synthesize_across_papers(client, model, rows)
+# =============================
+# UI visibility helpers
+# =============================
+def set_admin_visibility(is_admin: bool):
+    """Return three Gradio visibility updates, all following ``is_admin``."""
+    show = bool(is_admin)
+    return tuple(gr.update(visible=show) for _ in range(3))
 # =============================
 # Gradio UI
 # =============================
 with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
     gr.Markdown(
+        "# Toxicology PDF → Grounded Extractor\n"
+        "Upload PDFs → choose endpoints → Run → review report → export.\n\n"
+        "**Note:** Text-based PDFs only (not scanned/image PDFs)."
     )
+    state_records = gr.State([])
+    state_details = gr.State([])
+    vocab_state = gr.State({})
+    field_rows_state = gr.State([])
+    field_spec = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) field spec")
+    vocab_json = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) vocab json")
     with gr.Tab("Extract"):
+        with gr.Group():
+            files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
+            with gr.Row():
+                api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
+                model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
+            endpoints = gr.Dropdown(
+                label="Endpoints to extract (Core included automatically)",
+                choices=list(ENDPOINT_MODULES.keys()),
+                multiselect=True,
+                value=["Genotoxicity (OECD TG)"]
+            )
+            extract_btn = gr.Button("Run Extraction", variant="primary")
+            status = gr.Textbox(label="Status", interactive=False)
+        gr.Markdown("## Report")
+        overview_df = gr.Dataframe(
+            label="Batch Overview",
+            interactive=False,
+            wrap=True,
+            show_row_numbers=True
+        )
+        with gr.Row():
+            out_csv = gr.File(label="Download: extraction_table.csv")
+            out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
+        record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
         with gr.Row():
+            review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
+            save_btn = gr.Button("Save edits")
+            export_btn = gr.Button("Export reviewed CSV")
+        review_status = gr.Textbox(label="Review status", interactive=False)
         with gr.Row():
+            vertical_view = gr.Dataframe(
+                headers=["Field", "Value"],
+                interactive=False,
+                wrap=True,
+                show_row_numbers=False,
+                label="Extracted fields (vertical)"
+            )
+            evidence_md = gr.Markdown()
+        reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
+        with gr.Accordion("Advanced runtime settings", open=False):
             # The three sliders below are passed to run_extraction as
             # max_pages, chunk_chars and max_context_chars (see extract_btn.click).
+            with gr.Row():
+                max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
+                chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
+                max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
+        with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
+            admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
             # The three groups are created hidden (visible=False); admin_mode.change
             # passes them to set_admin_visibility as its outputs.
+            admin_group = gr.Group(visible=False)
+            admin_vocab_group = gr.Group(visible=False)
+            admin_fields_group = gr.Group(visible=False)
             # Each group is then re-entered with `with` so the widgets below are declared inside it.
+            with admin_group:
+                gr.Markdown("### Admin: Configure what gets extracted (columns) and how terms are normalized.")
+            with admin_vocab_group:
+                gr.Markdown("### Controlled vocabulary (lists only)")
                 # Category choices start empty; _init_all returns gr.update(choices=keys, value=k0)
                 # as one of its values (presumably routed here by demo.load; its header is not visible).
+                vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
+                vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
+                with gr.Row():
+                    vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
+                    vocab_add_btn = gr.Button("Add")
+                with gr.Row():
+                    vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
+                    vocab_remove_btn = gr.Button("Remove")
+                    vocab_apply_btn = gr.Button("Apply full list to category")
+                    vocab_reset_btn = gr.Button("Reset vocab to defaults")
                 # Editable full list vs. read-only filtered preview of the same terms.
+                vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
+                vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
+                vocab_status = gr.Textbox(label="Vocab status", interactive=False)
+                with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
+                    vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
+            with admin_fields_group:
+                gr.Markdown("### Custom columns (Field Builder)")
+                gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
+                with gr.Row():
+                    admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
+                    fields_apply_btn = gr.Button("Apply builder table")
+                with gr.Row():
+                    field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
+                    field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
+                enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
+                instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
+                add_update_field_btn = gr.Button("Add/Update field")
                 # Headers are the same four column names _init_all uses for its initial DataFrame.
+                fields_df = gr.Dataframe(
+                    label="Fields (edit and click Apply)",
+                    headers=["field","type","enum_values","instructions"],
+                    interactive=True,
+                    wrap=True
+                )
+                fields_status = gr.Textbox(label="Field builder status", interactive=False)
+        admin_mode.change(
             # Admin checkbox -> set_admin_visibility, which returns updates for the three admin groups.
+            fn=set_admin_visibility,
+            inputs=[admin_mode],
+            outputs=[admin_group, admin_vocab_group, admin_fields_group]
         )
         # Changing the endpoint selection calls sync_fields_from_endpoints (which also receives
         # the admin flag) and refreshes the field rows state, builder table, field spec and status.
+        endpoints.change(
+            fn=sync_fields_from_endpoints,
+            inputs=[endpoints, admin_mode],
+            outputs=[field_rows_state, fields_df, field_spec, status]
         )
         # Main action: run_extraction takes the uploaded files, API settings, field spec, vocab JSON
         # and runtime sliders, and fills the overview table, download files, status, record selector,
         # both state holders and the detail views.
+        extract_btn.click(
+            fn=run_extraction,
+            inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
+            outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
+        )
         # Selecting a record calls on_pick with the stored records/details and updates the detail views.
+        record_pick.change(
+            fn=on_pick,
+            inputs=[record_pick, state_records, state_details],
+            outputs=[vertical_view, evidence_md]
+        )
         # The review checkbox only drives the vertical table (via toggle_review_mode).
+        review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
         # Saving passes the current table contents plus the stored records to save_review_changes
         # and writes back the overview table, the records state and the review status.
+        save_btn.click(
+            fn=save_review_changes,
+            inputs=[record_pick, vertical_view, state_records],
+            outputs=[overview_df, state_records, review_status]
+        )
         # Export reads only state_records and outputs the downloadable file plus a status message.
+        export_btn.click(
+            fn=export_reviewed_csv,
+            inputs=[state_records],
+            outputs=[reviewed_csv, review_status]
         )
         # Typing in the search box recomputes the filtered preview from the full term table.
+        vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
         vocab_category.change(
             fn=vocab_load_category,
             inputs=[vocab_state, vocab_category, vocab_search],
         vocab_apply_btn.click(
             # Step 1: vocab_apply_df receives the vocab state, category, edited term table and search text,
             # and updates the admin JSON box, filtered preview and status.
             fn=vocab_apply_df,
             inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
+            outputs=[vocab_json_admin, vocab_terms_filtered, vocab_status]
+        ).then(
             # Step 2: identity function copies the admin JSON box into vocab_json,
             # the component that extract_btn.click passes to run_extraction.
+            fn=lambda x: x,
+            inputs=[vocab_json_admin],
+            outputs=[vocab_json]
         )
         # Reset takes no inputs; vocab_reset_defaults_ui supplies values for every vocab-related
         # component, including both JSON boxes.
         vocab_reset_btn.click(
+            fn=vocab_reset_defaults_ui,
             inputs=None,
+            outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
         )
         # Loads the current endpoint selection into the field builder via admin_apply_endpoints.
+        admin_apply_endpoints_btn.click(
+            fn=admin_apply_endpoints,
+            inputs=[endpoints],
             outputs=[field_rows_state, fields_df, field_spec, fields_status]
         )
             outputs=[field_rows_state, fields_df, field_spec, fields_status]
         )
+        def _init_all():
             # Build the initial values for the vocab widgets and the field builder.
             # Takes no arguments and returns an 11-item tuple; the order must match the
             # outputs list of the demo.load call that follows.
             # Vocabulary state and its derived views come from the built-in default JSON.
+            vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
             # Field rows are seeded from a single default endpoint; the second value
             # returned by build_rows_from_endpoints is discarded.
+            default_endpoints = ["Genotoxicity (OECD TG)"]
+            rows, _ = build_rows_from_endpoints(default_endpoints)
+            fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
+            fspec = build_spec_from_field_rows(rows)
             return (
+                vocab,
                 gr.update(choices=keys, value=k0),
                 full_df,
                 filtered_df,
                 vjson,
                 vmsg,
                 # vjson is returned a second time; presumably one copy feeds vocab_json_admin and the
                 # other vocab_json (the start of the demo.load outputs list is not visible here).
+                vjson,
+                rows,
                 fdf,
                 fspec,
+                "✅ Ready."
             )
         demo.load(
                 vocab_category,
                 vocab_terms_df,
                 vocab_terms_filtered,
+                vocab_json_admin,
                 vocab_status,
+                vocab_json,
                 field_rows_state,
                 fields_df,
                 field_spec,
+                status
             ]
         )
         synth_md = gr.Markdown()
         # run_synthesis receives the second API key/model pair and the uploaded extraction JSON file;
         # its result is shown in the Markdown component created above.
         synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
 if __name__ == "__main__":
     # Script entry point: the port comes from the PORT environment variable, defaulting to 7860.
     port = int(os.environ.get("PORT", "7860"))
     # Enable the request queue, then serve on all interfaces (0.0.0.0) at that port.
     demo.queue().launch(server_name="0.0.0.0", server_port=port)