Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import json
|
| 4 |
-
import math
|
| 5 |
import tempfile
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Dict, List, Tuple, Any
|
|
@@ -13,39 +12,52 @@ import pandas as pd
|
|
| 13 |
from pypdf import PdfReader
|
| 14 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 15 |
|
| 16 |
-
from openai import OpenAI
|
| 17 |
|
| 18 |
|
| 19 |
-
#
|
| 20 |
# Defaults
|
| 21 |
-
#
|
| 22 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 23 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
|
|
|
| 24 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 25 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
| 26 |
"species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
}"""
|
| 31 |
|
| 32 |
-
DEFAULT_FIELD_SPEC = """# One field per line:
|
| 33 |
# types: str, num, bool, list[str], list[num], enum[a,b,c]
|
| 34 |
Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
|
| 35 |
CAS_numbers | list[str] | Extract any CAS numbers mentioned.
|
| 36 |
Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
|
| 37 |
Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
|
| 38 |
Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
|
| 39 |
-
|
| 40 |
-
|
| 41 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
|
|
|
| 42 |
Conclusion | str | What does the paper conclude about safety/risk?
|
| 43 |
"""
|
| 44 |
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
# PDF extraction (
|
| 48 |
-
#
|
| 49 |
def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
|
| 50 |
reader = PdfReader(pdf_path)
|
| 51 |
page_count = len(reader.pages)
|
|
@@ -57,8 +69,7 @@ def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tupl
|
|
| 57 |
t = reader.pages[i].extract_text() or ""
|
| 58 |
except Exception:
|
| 59 |
t = ""
|
| 60 |
-
|
| 61 |
-
pages.append((i + 1, t))
|
| 62 |
return pages, page_count
|
| 63 |
|
| 64 |
|
|
@@ -70,9 +81,6 @@ def clean_text(t: str) -> str:
|
|
| 70 |
|
| 71 |
|
| 72 |
def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
|
| 73 |
-
"""
|
| 74 |
-
Build chunks with page ranges, roughly target_chars each.
|
| 75 |
-
"""
|
| 76 |
chunks = []
|
| 77 |
buf = []
|
| 78 |
start_page = None
|
|
@@ -85,12 +93,10 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
|
|
| 85 |
if start_page is None:
|
| 86 |
start_page = pno
|
| 87 |
|
| 88 |
-
# If adding this page exceeds chunk size, flush
|
| 89 |
if cur_len + len(txt) + 1 > target_chars and buf:
|
| 90 |
-
end_page =
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
)
|
| 94 |
buf = [txt]
|
| 95 |
start_page = pno
|
| 96 |
cur_len = len(txt)
|
|
@@ -99,16 +105,21 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
|
|
| 99 |
cur_len += len(txt) + 1
|
| 100 |
|
| 101 |
if buf and start_page is not None:
|
| 102 |
-
end_page = pages[-1][0]
|
| 103 |
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
| 104 |
|
| 105 |
return chunks
|
| 106 |
|
| 107 |
|
| 108 |
-
#
|
| 109 |
# Lightweight retrieval (TF-IDF) to select relevant excerpts
|
| 110 |
-
#
|
| 111 |
-
def select_relevant_chunks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
texts = [c["text"] for c in chunks]
|
| 113 |
if not texts:
|
| 114 |
return []
|
|
@@ -116,24 +127,22 @@ def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top
|
|
| 116 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 117 |
X = vectorizer.fit_transform(texts)
|
| 118 |
|
| 119 |
-
selected_idx = []
|
| 120 |
for q in queries:
|
| 121 |
q = (q or "").strip()
|
| 122 |
if not q:
|
| 123 |
continue
|
| 124 |
qv = vectorizer.transform([q])
|
| 125 |
-
sims = (X @ qv.T).toarray().ravel()
|
| 126 |
idx = np.argsort(sims)[::-1]
|
| 127 |
for i in idx[:top_per_query]:
|
| 128 |
if i not in selected_idx:
|
| 129 |
selected_idx.append(i)
|
| 130 |
|
| 131 |
-
# fallback: if nothing selected, take first few chunks
|
| 132 |
if not selected_idx:
|
| 133 |
selected_idx = list(range(min(len(chunks), max_chunks)))
|
| 134 |
|
| 135 |
-
|
| 136 |
-
return selected
|
| 137 |
|
| 138 |
|
| 139 |
def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
|
|
@@ -148,9 +157,9 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
|
|
| 148 |
return "\n".join(parts).strip()
|
| 149 |
|
| 150 |
|
| 151 |
-
#
|
| 152 |
# User-defined extraction spec -> JSON Schema
|
| 153 |
-
#
|
| 154 |
def slugify_field(name: str) -> str:
|
| 155 |
name = name.strip()
|
| 156 |
name = re.sub(r"[^\w\s-]", "", name)
|
|
@@ -158,14 +167,13 @@ def slugify_field(name: str) -> str:
|
|
| 158 |
return name[:60] if name else "field"
|
| 159 |
|
| 160 |
|
| 161 |
-
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any],
|
| 162 |
"""
|
| 163 |
spec lines: Field Name | type | instructions
|
| 164 |
-
Returns: properties dict,
|
| 165 |
"""
|
| 166 |
-
props = {}
|
| 167 |
-
|
| 168 |
-
instr = {}
|
| 169 |
|
| 170 |
for raw_line in (spec or "").splitlines():
|
| 171 |
line = raw_line.strip()
|
|
@@ -180,15 +188,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
|
|
| 180 |
ftype = parts[1]
|
| 181 |
finstr = parts[2] if len(parts) >= 3 else ""
|
| 182 |
|
| 183 |
-
is_required = False
|
| 184 |
-
if field_name.startswith("*"):
|
| 185 |
-
is_required = True
|
| 186 |
-
field_name = field_name[1:].strip()
|
| 187 |
-
|
| 188 |
key = slugify_field(field_name)
|
| 189 |
instr[key] = finstr
|
| 190 |
|
| 191 |
-
schema = {"type": "string"}
|
| 192 |
|
| 193 |
if ftype == "str":
|
| 194 |
schema = {"type": "string"}
|
|
@@ -208,20 +211,20 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
|
|
| 208 |
schema = {"type": "string"}
|
| 209 |
|
| 210 |
props[key] = schema
|
| 211 |
-
if is_required:
|
| 212 |
-
required.append(key)
|
| 213 |
|
| 214 |
-
|
| 215 |
-
return props, required, instr
|
| 216 |
|
| 217 |
|
| 218 |
-
def build_extraction_schema(field_props: Dict[str, Any],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
risk_enum = vocab.get(
|
| 220 |
"risk_stance_enum",
|
| 221 |
["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
|
| 222 |
)
|
| 223 |
|
| 224 |
-
# IMPORTANT: strict schema requires required == all property keys
|
| 225 |
all_field_keys = list(field_props.keys())
|
| 226 |
|
| 227 |
schema = {
|
|
@@ -236,7 +239,7 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
|
|
| 236 |
"type": "object",
|
| 237 |
"additionalProperties": False,
|
| 238 |
"properties": field_props,
|
| 239 |
-
"required": all_field_keys #
|
| 240 |
},
|
| 241 |
"evidence": {
|
| 242 |
"type": "array",
|
|
@@ -257,13 +260,13 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
|
|
| 257 |
return schema
|
| 258 |
|
| 259 |
|
| 260 |
-
#
|
| 261 |
-
# OpenAI
|
| 262 |
-
#
|
| 263 |
def get_openai_client(api_key: str) -> OpenAI:
|
| 264 |
key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
|
| 265 |
if not key:
|
| 266 |
-
raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY.")
|
| 267 |
return OpenAI(api_key=key)
|
| 268 |
|
| 269 |
|
|
@@ -275,25 +278,20 @@ def openai_structured_extract(
|
|
| 275 |
field_instructions: Dict[str, str],
|
| 276 |
context: str
|
| 277 |
) -> Dict[str, Any]:
|
| 278 |
-
|
| 279 |
-
# Build instruction text for the model
|
| 280 |
field_instr_lines = []
|
| 281 |
for k, v in field_instructions.items():
|
| 282 |
-
if v
|
| 283 |
-
field_instr_lines.append(f"- {k}: {v}")
|
| 284 |
-
else:
|
| 285 |
-
field_instr_lines.append(f"- {k}: (no extra instructions)")
|
| 286 |
|
| 287 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 288 |
|
| 289 |
system_msg = (
|
| 290 |
"You are a toxicology research paper data-extraction assistant.\n"
|
| 291 |
-
"
|
| 292 |
-
"1) Use ONLY the provided excerpts; do
|
| 293 |
-
"2) If a value is not stated,
|
| 294 |
-
"3)
|
| 295 |
-
"4) risk_stance
|
| 296 |
-
"5) Prefer controlled
|
| 297 |
)
|
| 298 |
|
| 299 |
user_msg = (
|
|
@@ -302,7 +300,7 @@ def openai_structured_extract(
|
|
| 302 |
"FIELD INSTRUCTIONS:\n"
|
| 303 |
+ "\n".join(field_instr_lines)
|
| 304 |
+ "\n\n"
|
| 305 |
-
"EXCERPTS:\n"
|
| 306 |
f"{context}\n"
|
| 307 |
)
|
| 308 |
|
|
@@ -321,103 +319,104 @@ def openai_structured_extract(
|
|
| 321 |
}
|
| 322 |
}
|
| 323 |
)
|
| 324 |
-
|
| 325 |
-
# Structured outputs: JSON is in output_text
|
| 326 |
-
out = resp.output_text
|
| 327 |
-
return json.loads(out)
|
| 328 |
|
| 329 |
|
| 330 |
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
|
| 331 |
system_msg = (
|
| 332 |
"You are a senior toxicology scientist summarizing multiple papers.\n"
|
| 333 |
-
"
|
| 334 |
-
"Base
|
| 335 |
)
|
| 336 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 337 |
|
| 338 |
-
resp = client.responses.create(
|
| 339 |
-
model=model,
|
| 340 |
-
input=[
|
| 341 |
-
{"role": "system", "content": system_msg},
|
| 342 |
-
{"role": "user", "content": user_msg}
|
| 343 |
-
]
|
| 344 |
-
)
|
| 345 |
-
return resp.output_text
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
def openai_suggest_vocab_additions(client: OpenAI, model: str, current_vocab: Dict[str, Any], context: str) -> Dict[str, Any]:
|
| 349 |
-
schema = {
|
| 350 |
-
"type": "object",
|
| 351 |
-
"additionalProperties": False,
|
| 352 |
-
"properties": {
|
| 353 |
-
"additions": {
|
| 354 |
-
"type": "object",
|
| 355 |
-
"additionalProperties": {
|
| 356 |
-
"type": "array",
|
| 357 |
-
"items": {"type": "string"}
|
| 358 |
-
}
|
| 359 |
-
},
|
| 360 |
-
"notes": {"type": "string"}
|
| 361 |
-
},
|
| 362 |
-
"required": ["additions", "notes"]
|
| 363 |
-
}
|
| 364 |
-
|
| 365 |
-
system_msg = (
|
| 366 |
-
"You propose controlled-vocabulary additions for toxicology paper extraction.\n"
|
| 367 |
-
"Return only new candidate terms grouped under keys that already exist or new keys if needed.\n"
|
| 368 |
-
"Avoid duplicates already in current vocab.\n"
|
| 369 |
-
)
|
| 370 |
-
user_msg = (
|
| 371 |
-
"CURRENT_VOCAB_JSON:\n"
|
| 372 |
-
+ json.dumps(current_vocab, indent=2)
|
| 373 |
-
+ "\n\n"
|
| 374 |
-
"EXCERPTS:\n"
|
| 375 |
-
+ context
|
| 376 |
-
)
|
| 377 |
-
|
| 378 |
resp = client.responses.create(
|
| 379 |
model=model,
|
| 380 |
input=[
|
| 381 |
{"role": "system", "content": system_msg},
|
| 382 |
{"role": "user", "content": user_msg}
|
| 383 |
],
|
| 384 |
-
text={
|
| 385 |
-
"format": {
|
| 386 |
-
"type": "json_schema",
|
| 387 |
-
"name": "vocab_additions",
|
| 388 |
-
"schema": schema,
|
| 389 |
-
"strict": True
|
| 390 |
-
}
|
| 391 |
-
}
|
| 392 |
)
|
| 393 |
-
return
|
| 394 |
|
| 395 |
|
| 396 |
-
#
|
| 397 |
-
#
|
| 398 |
-
#
|
| 399 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
if not files:
|
| 401 |
-
return None, None, None, "Upload one or more PDFs."
|
| 402 |
|
|
|
|
| 403 |
try:
|
| 404 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 405 |
except Exception as e:
|
| 406 |
-
return None, None, None, f"Controlled vocab JSON is invalid: {e}"
|
| 407 |
|
| 408 |
-
|
|
|
|
| 409 |
if not field_props:
|
| 410 |
-
return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions"
|
| 411 |
|
| 412 |
-
schema = build_extraction_schema(field_props,
|
| 413 |
|
|
|
|
| 414 |
try:
|
| 415 |
client = get_openai_client(api_key)
|
| 416 |
except Exception as e:
|
| 417 |
-
return None, None, None, str(e)
|
| 418 |
|
| 419 |
-
results = []
|
| 420 |
-
flat_rows = []
|
| 421 |
|
| 422 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 423 |
|
|
@@ -426,46 +425,51 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 426 |
filename = os.path.basename(pdf_path)
|
| 427 |
|
| 428 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 430 |
|
| 431 |
-
#
|
| 432 |
-
queries = [
|
| 433 |
-
"risk stance hazard risk conclusion adverse effect noael loael bmd bmdl ld50 lc50 safety concern",
|
| 434 |
-
]
|
| 435 |
for k, ins in field_instr.items():
|
| 436 |
-
if ins
|
| 437 |
-
queries.append(ins)
|
| 438 |
-
else:
|
| 439 |
-
queries.append(k)
|
| 440 |
|
| 441 |
selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
|
| 442 |
context = build_context(selected, max_chars=int(max_context_chars))
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
"evidence": []
|
| 453 |
-
}
|
| 454 |
-
else:
|
| 455 |
-
extracted = openai_structured_extract(
|
| 456 |
-
client=client,
|
| 457 |
-
model=model,
|
| 458 |
-
schema=schema,
|
| 459 |
-
controlled_vocab=vocab,
|
| 460 |
-
field_instructions=field_instr,
|
| 461 |
-
context=context
|
| 462 |
-
)
|
| 463 |
|
| 464 |
extracted["_file"] = filename
|
| 465 |
extracted["_pages_in_pdf"] = page_count
|
| 466 |
results.append(extracted)
|
| 467 |
|
| 468 |
-
#
|
| 469 |
row = {
|
| 470 |
"file": filename,
|
| 471 |
"paper_title": extracted.get("paper_title", ""),
|
|
@@ -473,12 +477,15 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 473 |
"risk_confidence": extracted.get("risk_confidence", ""),
|
| 474 |
"risk_summary": extracted.get("risk_summary", "")
|
| 475 |
}
|
|
|
|
|
|
|
| 476 |
for k in field_props.keys():
|
| 477 |
-
v =
|
| 478 |
if isinstance(v, list):
|
| 479 |
row[k] = "; ".join([str(x) for x in v])
|
| 480 |
else:
|
| 481 |
row[k] = v
|
|
|
|
| 482 |
flat_rows.append(row)
|
| 483 |
|
| 484 |
df = pd.DataFrame(flat_rows)
|
|
@@ -488,120 +495,208 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 488 |
df.to_csv(csv_path, index=False)
|
| 489 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
| 498 |
|
| 499 |
-
try:
|
| 500 |
-
client = get_openai_client(api_key)
|
| 501 |
-
except Exception as e:
|
| 502 |
-
return str(e)
|
| 503 |
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
return
|
| 507 |
|
| 508 |
|
| 509 |
-
def
|
| 510 |
-
|
| 511 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
|
|
|
| 513 |
try:
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
return vocab_json, f"Controlled vocab JSON is invalid: {e}"
|
| 522 |
|
| 523 |
-
# Build a small context from the first 1-2 docs
|
| 524 |
-
contexts = []
|
| 525 |
-
for f in files[:2]:
|
| 526 |
-
pages, _ = extract_pages_from_pdf(f.name, max_pages=int(max_pages))
|
| 527 |
-
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 528 |
-
selected = select_relevant_chunks(
|
| 529 |
-
chunks,
|
| 530 |
-
queries=["toxicology endpoints noael loael bmd genotoxicity carcinogenicity endocrine exposure route species"],
|
| 531 |
-
top_per_query=2,
|
| 532 |
-
max_chunks=8
|
| 533 |
-
)
|
| 534 |
-
ctx = build_context(selected, max_chars=int(max_context_chars))
|
| 535 |
-
if ctx:
|
| 536 |
-
contexts.append(ctx)
|
| 537 |
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
-
additions = openai_suggest_vocab_additions(client, model, vocab, combined)
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
if k not in merged:
|
| 549 |
-
merged[k] = []
|
| 550 |
-
if isinstance(merged[k], list):
|
| 551 |
-
for term in arr:
|
| 552 |
-
if term not in merged[k]:
|
| 553 |
-
merged[k].append(term)
|
| 554 |
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
|
| 557 |
|
| 558 |
-
#
|
| 559 |
# Gradio UI
|
| 560 |
-
#
|
| 561 |
-
with gr.Blocks(title="Toxicology PDF → Table Extractor
|
| 562 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
|
| 564 |
with gr.Tab("Extract to Table"):
|
| 565 |
files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
|
| 566 |
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
|
|
|
| 573 |
|
| 574 |
with gr.Row():
|
| 575 |
-
max_pages = gr.Slider(0,
|
| 576 |
-
chunk_chars = gr.Slider(1200,
|
| 577 |
-
max_context_chars = gr.Slider(5000,
|
| 578 |
|
| 579 |
-
vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=
|
| 580 |
-
field_spec = gr.Textbox(label="Extraction spec (you control
|
| 581 |
|
| 582 |
-
|
| 583 |
-
vocab_btn = gr.Button("Suggest vocab additions from PDFs")
|
| 584 |
-
extract_btn = gr.Button("Run Extraction (Table)")
|
| 585 |
status = gr.Textbox(label="Status", interactive=False)
|
| 586 |
|
| 587 |
-
table = gr.Dataframe(
|
| 588 |
-
|
| 589 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
)
|
|
|
|
| 596 |
|
|
|
|
|
|
|
|
|
|
| 597 |
extract_btn.click(
|
| 598 |
fn=run_extraction,
|
| 599 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 600 |
-
outputs=[table, out_csv, out_json, status]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
)
|
| 602 |
|
| 603 |
with gr.Tab("Cross-paper Synthesis"):
|
| 604 |
-
gr.Markdown("Upload the `extraction_details.json`
|
| 605 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 606 |
model2 = gr.Dropdown(
|
| 607 |
label="Model",
|
|
@@ -618,6 +713,32 @@ with gr.Blocks(title="Toxicology PDF → Table Extractor (GPT-4o)") as demo:
|
|
| 618 |
outputs=[synth_md]
|
| 619 |
)
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
if __name__ == "__main__":
|
| 622 |
port = int(os.environ.get("PORT", "7860"))
|
| 623 |
demo.queue().launch(server_name="0.0.0.0", server_port=port)
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import json
|
|
|
|
| 4 |
import tempfile
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Dict, List, Tuple, Any
|
|
|
|
| 12 |
from pypdf import PdfReader
|
| 13 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 14 |
|
| 15 |
+
from openai import OpenAI
|
| 16 |
|
| 17 |
|
| 18 |
+
# =============================
|
| 19 |
# Defaults
|
| 20 |
+
# =============================
|
| 21 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 22 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
| 23 |
+
|
| 24 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 25 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
| 26 |
"species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
|
| 27 |
+
|
| 28 |
+
"oecd_endpoints": [
|
| 29 |
+
"acute_toxicity","subacute_toxicity","subchronic_toxicity","chronic_toxicity",
|
| 30 |
+
"carcinogenicity","genotoxicity","reproductive_toxicity","developmental_toxicity",
|
| 31 |
+
"neurotoxicity","immunotoxicity","endocrine_activity","sensitization","irritation_corrosion"
|
| 32 |
+
],
|
| 33 |
+
|
| 34 |
+
"meddra_like_terms": [
|
| 35 |
+
"hepatic_disorder","renal_disorder","nervous_system_disorder","respiratory_disorder",
|
| 36 |
+
"skin_and_subcutaneous_tissue_disorder","reproductive_system_disorder",
|
| 37 |
+
"immune_system_disorder","blood_and_lymphatic_system_disorder"
|
| 38 |
+
],
|
| 39 |
+
|
| 40 |
+
"dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"]
|
| 41 |
}"""
|
| 42 |
|
| 43 |
+
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 44 |
# types: str, num, bool, list[str], list[num], enum[a,b,c]
|
| 45 |
Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
|
| 46 |
CAS_numbers | list[str] | Extract any CAS numbers mentioned.
|
| 47 |
Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
|
| 48 |
Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
|
| 49 |
Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
|
| 50 |
+
OECD_endpoints | list[str] | Extract endpoints; prefer controlled vocab 'oecd_endpoints' when applicable.
|
| 51 |
+
MedDRA_like_terms | list[str] | Extract effects; prefer controlled vocab 'meddra_like_terms' when applicable.
|
| 52 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
| 53 |
+
Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
|
| 54 |
Conclusion | str | What does the paper conclude about safety/risk?
|
| 55 |
"""
|
| 56 |
|
| 57 |
|
| 58 |
+
# =============================
|
| 59 |
+
# PDF extraction (text-based PDFs only)
|
| 60 |
+
# =============================
|
| 61 |
def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
|
| 62 |
reader = PdfReader(pdf_path)
|
| 63 |
page_count = len(reader.pages)
|
|
|
|
| 69 |
t = reader.pages[i].extract_text() or ""
|
| 70 |
except Exception:
|
| 71 |
t = ""
|
| 72 |
+
pages.append((i + 1, t or ""))
|
|
|
|
| 73 |
return pages, page_count
|
| 74 |
|
| 75 |
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
|
|
|
|
|
|
|
|
|
|
| 84 |
chunks = []
|
| 85 |
buf = []
|
| 86 |
start_page = None
|
|
|
|
| 93 |
if start_page is None:
|
| 94 |
start_page = pno
|
| 95 |
|
|
|
|
| 96 |
if cur_len + len(txt) + 1 > target_chars and buf:
|
| 97 |
+
end_page = pno - 1
|
| 98 |
+
end_page = end_page if end_page >= start_page else start_page
|
| 99 |
+
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
|
|
|
| 100 |
buf = [txt]
|
| 101 |
start_page = pno
|
| 102 |
cur_len = len(txt)
|
|
|
|
| 105 |
cur_len += len(txt) + 1
|
| 106 |
|
| 107 |
if buf and start_page is not None:
|
| 108 |
+
end_page = pages[-1][0] if pages else start_page
|
| 109 |
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
| 110 |
|
| 111 |
return chunks
|
| 112 |
|
| 113 |
|
| 114 |
+
# =============================
|
| 115 |
# Lightweight retrieval (TF-IDF) to select relevant excerpts
|
| 116 |
+
# =============================
|
| 117 |
+
def select_relevant_chunks(
|
| 118 |
+
chunks: List[Dict[str, Any]],
|
| 119 |
+
queries: List[str],
|
| 120 |
+
top_per_query: int = 2,
|
| 121 |
+
max_chunks: int = 12
|
| 122 |
+
) -> List[Dict[str, Any]]:
|
| 123 |
texts = [c["text"] for c in chunks]
|
| 124 |
if not texts:
|
| 125 |
return []
|
|
|
|
| 127 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 128 |
X = vectorizer.fit_transform(texts)
|
| 129 |
|
| 130 |
+
selected_idx: List[int] = []
|
| 131 |
for q in queries:
|
| 132 |
q = (q or "").strip()
|
| 133 |
if not q:
|
| 134 |
continue
|
| 135 |
qv = vectorizer.transform([q])
|
| 136 |
+
sims = (X @ qv.T).toarray().ravel()
|
| 137 |
idx = np.argsort(sims)[::-1]
|
| 138 |
for i in idx[:top_per_query]:
|
| 139 |
if i not in selected_idx:
|
| 140 |
selected_idx.append(i)
|
| 141 |
|
|
|
|
| 142 |
if not selected_idx:
|
| 143 |
selected_idx = list(range(min(len(chunks), max_chunks)))
|
| 144 |
|
| 145 |
+
return [chunks[i] for i in selected_idx[:max_chunks]]
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
|
|
|
|
| 157 |
return "\n".join(parts).strip()
|
| 158 |
|
| 159 |
|
| 160 |
+
# =============================
|
| 161 |
# User-defined extraction spec -> JSON Schema
|
| 162 |
+
# =============================
|
| 163 |
def slugify_field(name: str) -> str:
|
| 164 |
name = name.strip()
|
| 165 |
name = re.sub(r"[^\w\s-]", "", name)
|
|
|
|
| 167 |
return name[:60] if name else "field"
|
| 168 |
|
| 169 |
|
| 170 |
+
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 171 |
"""
|
| 172 |
spec lines: Field Name | type | instructions
|
| 173 |
+
Returns: properties dict, instructions map (field_key -> instruction)
|
| 174 |
"""
|
| 175 |
+
props: Dict[str, Any] = {}
|
| 176 |
+
instr: Dict[str, str] = {}
|
|
|
|
| 177 |
|
| 178 |
for raw_line in (spec or "").splitlines():
|
| 179 |
line = raw_line.strip()
|
|
|
|
| 188 |
ftype = parts[1]
|
| 189 |
finstr = parts[2] if len(parts) >= 3 else ""
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
key = slugify_field(field_name)
|
| 192 |
instr[key] = finstr
|
| 193 |
|
| 194 |
+
schema: Dict[str, Any] = {"type": "string"}
|
| 195 |
|
| 196 |
if ftype == "str":
|
| 197 |
schema = {"type": "string"}
|
|
|
|
| 211 |
schema = {"type": "string"}
|
| 212 |
|
| 213 |
props[key] = schema
|
|
|
|
|
|
|
| 214 |
|
| 215 |
+
return props, instr
|
|
|
|
| 216 |
|
| 217 |
|
| 218 |
+
def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
|
| 219 |
+
"""
|
| 220 |
+
IMPORTANT: Structured Outputs (strict=True) requires that for every object:
|
| 221 |
+
required must exist and include every key in properties.
|
| 222 |
+
"""
|
| 223 |
risk_enum = vocab.get(
|
| 224 |
"risk_stance_enum",
|
| 225 |
["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
|
| 226 |
)
|
| 227 |
|
|
|
|
| 228 |
all_field_keys = list(field_props.keys())
|
| 229 |
|
| 230 |
schema = {
|
|
|
|
| 239 |
"type": "object",
|
| 240 |
"additionalProperties": False,
|
| 241 |
"properties": field_props,
|
| 242 |
+
"required": all_field_keys # strict requirement
|
| 243 |
},
|
| 244 |
"evidence": {
|
| 245 |
"type": "array",
|
|
|
|
| 260 |
return schema
|
| 261 |
|
| 262 |
|
| 263 |
+
# =============================
|
| 264 |
+
# OpenAI client + extraction
|
| 265 |
+
# =============================
|
| 266 |
def get_openai_client(api_key: str) -> OpenAI:
|
| 267 |
key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
|
| 268 |
if not key:
|
| 269 |
+
raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY secret in Hugging Face.")
|
| 270 |
return OpenAI(api_key=key)
|
| 271 |
|
| 272 |
|
|
|
|
| 278 |
field_instructions: Dict[str, str],
|
| 279 |
context: str
|
| 280 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
| 281 |
field_instr_lines = []
|
| 282 |
for k, v in field_instructions.items():
|
| 283 |
+
field_instr_lines.append(f"- {k}: {v if v else '(no extra instructions)'}")
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 286 |
|
| 287 |
system_msg = (
|
| 288 |
"You are a toxicology research paper data-extraction assistant.\n"
|
| 289 |
+
"Grounding rules (must follow):\n"
|
| 290 |
+
"1) Use ONLY the provided excerpts; do NOT invent details.\n"
|
| 291 |
+
"2) If a value is not explicitly stated, output empty string or empty list (or an allowed enum like 'not_reported').\n"
|
| 292 |
+
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 293 |
+
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 294 |
+
"5) Prefer controlled vocab terms when applicable.\n"
|
| 295 |
)
|
| 296 |
|
| 297 |
user_msg = (
|
|
|
|
| 300 |
"FIELD INSTRUCTIONS:\n"
|
| 301 |
+ "\n".join(field_instr_lines)
|
| 302 |
+ "\n\n"
|
| 303 |
+
"EXCERPTS (with page ranges):\n"
|
| 304 |
f"{context}\n"
|
| 305 |
)
|
| 306 |
|
|
|
|
| 319 |
}
|
| 320 |
}
|
| 321 |
)
|
| 322 |
+
return json.loads(resp.output_text)
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
|
| 325 |
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
|
| 326 |
system_msg = (
|
| 327 |
"You are a senior toxicology scientist summarizing multiple papers.\n"
|
| 328 |
+
"Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
|
| 329 |
+
"Base strictly on the provided extracted JSON (which is evidence-backed).\n"
|
| 330 |
)
|
| 331 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
resp = client.responses.create(
|
| 334 |
model=model,
|
| 335 |
input=[
|
| 336 |
{"role": "system", "content": system_msg},
|
| 337 |
{"role": "user", "content": user_msg}
|
| 338 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
)
|
| 340 |
+
return resp.output_text
|
| 341 |
|
| 342 |
|
| 343 |
+
# =============================
|
| 344 |
+
# Grounding helpers (UI)
|
| 345 |
+
# =============================
|
| 346 |
+
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 347 |
+
if not records or not file_name:
|
| 348 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 349 |
+
row = next((r for r in records if r.get("file") == file_name), None)
|
| 350 |
+
if not row:
|
| 351 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 352 |
+
return pd.DataFrame({"Field": list(row.keys()), "Value": [row[k] for k in row.keys()]})
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: int = 80) -> str:
|
| 356 |
+
if not details or not file_name:
|
| 357 |
+
return ""
|
| 358 |
+
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 359 |
+
if not d:
|
| 360 |
+
return ""
|
| 361 |
+
ev = d.get("evidence", []) or []
|
| 362 |
+
lines = []
|
| 363 |
+
for e in ev[:max_items]:
|
| 364 |
+
quote = (e.get("quote", "") or "").strip()
|
| 365 |
+
pages = (e.get("pages", "") or "").strip()
|
| 366 |
+
field = (e.get("field", "") or "").strip()
|
| 367 |
+
if quote:
|
| 368 |
+
if len(quote) > 280:
|
| 369 |
+
quote = quote[:280] + "…"
|
| 370 |
+
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 371 |
+
header = "### Evidence (grounding)\n"
|
| 372 |
+
if not lines:
|
| 373 |
+
lines = ["- (no evidence returned)"]
|
| 374 |
+
return header + "\n".join(lines) + "\n\n> Review note: evidence reflects the original extraction. If you change values, re-run extraction to refresh evidence."
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
    """Return True when the PDF yields almost no extractable text.

    Used to flag scanned/image PDFs: cleans each page's text and joins the
    non-empty results; if fewer than 200 characters survive, the file is
    treated as non-text (heuristic threshold).

    Fix: the original evaluated ``clean_text(t)`` twice per page (once in the
    comprehension filter, once in the element); the cleaned value is now
    computed once and reused.
    """
    cleaned = [clean_text(text) for _, text in pages]
    joined = " ".join(text for text in cleaned if text)
    return len(joined.strip()) < 200  # heuristic threshold
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# =============================
|
| 384 |
+
# Main extraction handler
|
| 385 |
+
# =============================
|
| 386 |
+
def run_extraction(
|
| 387 |
+
files,
|
| 388 |
+
api_key,
|
| 389 |
+
model,
|
| 390 |
+
field_spec,
|
| 391 |
+
vocab_json,
|
| 392 |
+
max_pages,
|
| 393 |
+
chunk_chars,
|
| 394 |
+
max_context_chars
|
| 395 |
+
):
|
| 396 |
if not files:
|
| 397 |
+
return None, None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 398 |
|
| 399 |
+
# vocab
|
| 400 |
try:
|
| 401 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 402 |
except Exception as e:
|
| 403 |
+
return None, None, None, f"Controlled vocab JSON is invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 404 |
|
| 405 |
+
# field spec
|
| 406 |
+
field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
|
| 407 |
if not field_props:
|
| 408 |
+
return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 409 |
|
| 410 |
+
schema = build_extraction_schema(field_props, vocab)
|
| 411 |
|
| 412 |
+
# OpenAI
|
| 413 |
try:
|
| 414 |
client = get_openai_client(api_key)
|
| 415 |
except Exception as e:
|
| 416 |
+
return None, None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 417 |
|
| 418 |
+
results: List[Dict[str, Any]] = []
|
| 419 |
+
flat_rows: List[Dict[str, Any]] = []
|
| 420 |
|
| 421 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 422 |
|
|
|
|
| 425 |
filename = os.path.basename(pdf_path)
|
| 426 |
|
| 427 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 428 |
+
|
| 429 |
+
# enforce text-based PDFs note
|
| 430 |
+
if _text_based_pdf_warning(pages):
|
| 431 |
+
# create an "empty" record with warning
|
| 432 |
+
row = {"file": filename, "paper_title": "", "risk_stance": "insufficient_data", "risk_confidence": 0.0, "risk_summary": "No extractable text found. This app supports text-based PDFs only."}
|
| 433 |
+
for k, sch in field_props.items():
|
| 434 |
+
row[k] = "" if sch.get("type") != "array" else ""
|
| 435 |
+
flat_rows.append(row)
|
| 436 |
+
|
| 437 |
+
results.append({
|
| 438 |
+
"_file": filename,
|
| 439 |
+
"_pages_in_pdf": page_count,
|
| 440 |
+
"paper_title": "",
|
| 441 |
+
"risk_stance": "insufficient_data",
|
| 442 |
+
"risk_confidence": 0.0,
|
| 443 |
+
"risk_summary": "No extractable text found. This app supports text-based PDFs only.",
|
| 444 |
+
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 445 |
+
"evidence": []
|
| 446 |
+
})
|
| 447 |
+
continue
|
| 448 |
+
|
| 449 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 450 |
|
| 451 |
+
# Queries: risk stance + each field instruction (or field key)
|
| 452 |
+
queries = ["regulatory acceptability risk hazard concern conclusion noael loael bmd bmdl adverse effect uncertainty"]
|
|
|
|
|
|
|
| 453 |
for k, ins in field_instr.items():
|
| 454 |
+
queries.append(ins if ins else k)
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
|
| 457 |
context = build_context(selected, max_chars=int(max_context_chars))
|
| 458 |
|
| 459 |
+
extracted = openai_structured_extract(
|
| 460 |
+
client=client,
|
| 461 |
+
model=model,
|
| 462 |
+
schema=schema,
|
| 463 |
+
controlled_vocab=vocab,
|
| 464 |
+
field_instructions=field_instr,
|
| 465 |
+
context=context
|
| 466 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
extracted["_file"] = filename
|
| 469 |
extracted["_pages_in_pdf"] = page_count
|
| 470 |
results.append(extracted)
|
| 471 |
|
| 472 |
+
# flatten to table (wide)
|
| 473 |
row = {
|
| 474 |
"file": filename,
|
| 475 |
"paper_title": extracted.get("paper_title", ""),
|
|
|
|
| 477 |
"risk_confidence": extracted.get("risk_confidence", ""),
|
| 478 |
"risk_summary": extracted.get("risk_summary", "")
|
| 479 |
}
|
| 480 |
+
|
| 481 |
+
ext = extracted.get("extracted") or {}
|
| 482 |
for k in field_props.keys():
|
| 483 |
+
v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
|
| 484 |
if isinstance(v, list):
|
| 485 |
row[k] = "; ".join([str(x) for x in v])
|
| 486 |
else:
|
| 487 |
row[k] = v
|
| 488 |
+
|
| 489 |
flat_rows.append(row)
|
| 490 |
|
| 491 |
df = pd.DataFrame(flat_rows)
|
|
|
|
| 495 |
df.to_csv(csv_path, index=False)
|
| 496 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 497 |
|
| 498 |
+
records = df.to_dict("records")
|
| 499 |
+
choices = [r["file"] for r in records if "file" in r]
|
| 500 |
+
default = choices[0] if choices else None
|
| 501 |
+
vertical = _make_vertical(records, default)
|
| 502 |
+
evidence = _render_evidence(results, default)
|
| 503 |
+
|
| 504 |
+
status = "Done. Use the vertical view to read cleanly. Enable Review Mode to edit and export a reviewed CSV."
|
| 505 |
+
|
| 506 |
+
return (
|
| 507 |
+
df,
|
| 508 |
+
str(csv_path),
|
| 509 |
+
str(json_path),
|
| 510 |
+
status,
|
| 511 |
+
gr.update(choices=choices, value=default),
|
| 512 |
+
records,
|
| 513 |
+
results,
|
| 514 |
+
vertical,
|
| 515 |
+
evidence
|
| 516 |
+
)
|
| 517 |
|
| 518 |
|
| 519 |
+
# =============================
|
| 520 |
+
# Review mode handlers
|
| 521 |
+
# =============================
|
| 522 |
+
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the vertical record view and the evidence pane for the selection."""
    vertical = _make_vertical(records, file_name)
    evidence = _render_evidence(details, file_name)
    return vertical, evidence
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
+
def toggle_review_mode(is_on: bool):
    """Make the vertical table editable exactly when review mode is enabled."""
    editable = bool(is_on)
    return gr.update(interactive=editable)
|
| 529 |
|
| 530 |
|
| 531 |
+
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
    """Merge edits from the vertical Field/Value table back into the session records.

    Args:
        file_name: ``file`` value of the record being edited.
        vertical_df: the edited table from ``gr.Dataframe`` — typically a
            pandas DataFrame or a list of [Field, Value] rows.
        records: current wide-table rows (list of dicts), one per file.

    Returns:
        (wide DataFrame rebuilt from the updated records or None,
         updated records list, status message).

    Fix: the original only wrapped the DataFrame conversion in try/except;
    ``dropna(subset=["Field"])`` and the updates comprehension ran outside it,
    so a payload with unexpected column names raised an uncaught KeyError
    instead of returning the "Could not parse" message. All parsing is now
    inside the try, and columns are realigned by position when Gradio renames
    them.
    """
    if not file_name or not records:
        return None, records, "Nothing to save."

    # Parse the edited vertical table into a {Field: Value} mapping.
    try:
        if isinstance(vertical_df, pd.DataFrame):
            dfv = vertical_df
        else:
            # gradio may pass list-of-lists
            dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
        if list(dfv.columns) != ["Field", "Value"]:
            # Gradio can rename or extend headers; realign by position.
            dfv = dfv.iloc[:, :2]
            dfv.columns = ["Field", "Value"]
        dfv = dfv.dropna(subset=["Field"])
        updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
    except Exception:
        return None, records, "Could not parse edited vertical table."

    # Apply the updates to the matching record, leaving the rest untouched.
    new_records = []
    updated = False
    for r in records:
        if r.get("file") == file_name:
            rr = dict(r)
            for k, v in updates.items():
                rr[k] = v
            new_records.append(rr)
            updated = True
        else:
            new_records.append(r)

    df_wide = pd.DataFrame(new_records) if new_records else pd.DataFrame()
    msg = "Saved changes into session table. Export reviewed CSV to download." if updated else "Record not found."
    return df_wide, new_records, msg
|
|
|
|
| 568 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
+
def export_reviewed_csv(records: List[Dict[str, Any]]):
    """Write the session's reviewed records to a temp CSV and return (path, status).

    Returns (None, message) when there is nothing to export.
    """
    if not records:
        return None, "No reviewed data to export."
    out_dir = Path(tempfile.mkdtemp(prefix="tox_review_"))
    out_path = out_dir / "reviewed_extraction_table.csv"
    pd.DataFrame(records).to_csv(out_path, index=False)
    return str(out_path), "Reviewed CSV ready to download."
|
| 577 |
|
|
|
|
| 578 |
|
| 579 |
+
# =============================
|
| 580 |
+
# Synthesis tab handler
|
| 581 |
+
# =============================
|
| 582 |
+
def run_synthesis(api_key, model, extraction_json_file):
    """Run cross-paper synthesis from an uploaded extraction_details.json.

    Returns a Markdown string: either the synthesis, or an error/usage
    message when no file was uploaded or the OpenAI client cannot be built.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json produced by the Extract tab first."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)

    payload = Path(extraction_json_file.name).read_text(encoding="utf-8")
    rows = json.loads(payload)
    return openai_synthesize_across_papers(client, model, rows)
|
| 593 |
|
| 594 |
|
| 595 |
+
# =============================
|
| 596 |
# Gradio UI
|
| 597 |
+
# =============================
|
| 598 |
+
with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
|
| 599 |
+
gr.Markdown(
|
| 600 |
+
"# Toxicology PDF → Grounded Table Extractor (GPT-4o)\n\n"
|
| 601 |
+
"**Important:** This app supports **text-based PDFs only** (not scanned/image PDFs). If a PDF has no extractable text, it will be flagged as insufficient_data.\n\n"
|
| 602 |
+
"You control *what* to extract using the **Extraction spec**. Outputs are grounded by evidence quotes + page ranges."
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
# State stores for review mode
|
| 606 |
+
state_records = gr.State([]) # wide table rows: list[dict]
|
| 607 |
+
state_details = gr.State([]) # extraction details JSON: list[dict]
|
| 608 |
|
| 609 |
with gr.Tab("Extract to Table"):
|
| 610 |
files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
|
| 611 |
|
| 612 |
+
with gr.Row():
|
| 613 |
+
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 614 |
+
model = gr.Dropdown(
|
| 615 |
+
label="Model",
|
| 616 |
+
choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
|
| 617 |
+
value="gpt-4o-2024-08-06"
|
| 618 |
+
)
|
| 619 |
|
| 620 |
with gr.Row():
|
| 621 |
+
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 622 |
+
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 623 |
+
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 624 |
|
| 625 |
+
vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=10)
|
| 626 |
+
field_spec = gr.Textbox(label="Extraction spec (you control the columns)", value=DEFAULT_FIELD_SPEC, lines=10)
|
| 627 |
|
| 628 |
+
extract_btn = gr.Button("Run Extraction (Grounded)")
|
|
|
|
|
|
|
| 629 |
status = gr.Textbox(label="Status", interactive=False)
|
| 630 |
|
| 631 |
+
table = gr.Dataframe(
|
| 632 |
+
label="Wide Table (download-friendly)",
|
| 633 |
+
interactive=False,
|
| 634 |
+
wrap=True,
|
| 635 |
+
show_row_numbers=True,
|
| 636 |
+
buttons=["fullscreen", "copy"]
|
| 637 |
+
)
|
| 638 |
+
with gr.Row():
|
| 639 |
+
out_csv = gr.File(label="Download: extraction_table.csv")
|
| 640 |
+
out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
|
| 641 |
|
| 642 |
+
gr.Markdown("## Readable view (vertical) + evidence")
|
| 643 |
+
record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
|
| 644 |
+
|
| 645 |
+
with gr.Row():
|
| 646 |
+
review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
|
| 647 |
+
save_btn = gr.Button("Save changes to session table")
|
| 648 |
+
export_btn = gr.Button("Export reviewed CSV")
|
| 649 |
+
|
| 650 |
+
review_status = gr.Textbox(label="Review status", interactive=False)
|
| 651 |
+
|
| 652 |
+
vertical_view = gr.Dataframe(
|
| 653 |
+
headers=["Field", "Value"],
|
| 654 |
+
interactive=False,
|
| 655 |
+
wrap=True,
|
| 656 |
+
show_row_numbers=False,
|
| 657 |
+
label="Vertical record view (Field → Value)"
|
| 658 |
)
|
| 659 |
+
evidence_md = gr.Markdown()
|
| 660 |
|
| 661 |
+
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 662 |
+
|
| 663 |
+
# Run extraction
|
| 664 |
extract_btn.click(
|
| 665 |
fn=run_extraction,
|
| 666 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 667 |
+
outputs=[table, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 668 |
+
)
|
| 669 |
+
|
| 670 |
+
# On select record
|
| 671 |
+
record_pick.change(
|
| 672 |
+
fn=on_pick,
|
| 673 |
+
inputs=[record_pick, state_records, state_details],
|
| 674 |
+
outputs=[vertical_view, evidence_md]
|
| 675 |
+
)
|
| 676 |
+
|
| 677 |
+
# Toggle review mode editing
|
| 678 |
+
review_mode.change(
|
| 679 |
+
fn=toggle_review_mode,
|
| 680 |
+
inputs=[review_mode],
|
| 681 |
+
outputs=[vertical_view]
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
# Save edits back to wide table + state
|
| 685 |
+
save_btn.click(
|
| 686 |
+
fn=save_review_changes,
|
| 687 |
+
inputs=[record_pick, vertical_view, state_records],
|
| 688 |
+
outputs=[table, state_records, review_status]
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
# Export reviewed CSV
|
| 692 |
+
export_btn.click(
|
| 693 |
+
fn=export_reviewed_csv,
|
| 694 |
+
inputs=[state_records],
|
| 695 |
+
outputs=[reviewed_csv, review_status]
|
| 696 |
)
|
| 697 |
|
| 698 |
with gr.Tab("Cross-paper Synthesis"):
|
| 699 |
+
gr.Markdown("Upload the `extraction_details.json` from the Extract tab. Synthesis is based strictly on those grounded extractions.")
|
| 700 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 701 |
model2 = gr.Dropdown(
|
| 702 |
label="Model",
|
|
|
|
| 713 |
outputs=[synth_md]
|
| 714 |
)
|
| 715 |
|
| 716 |
+
with gr.Tab("Pending tasks"):
|
| 717 |
+
gr.Markdown(
|
| 718 |
+
"## Product roadmap (pending tasks)\n\n"
|
| 719 |
+
"### 1) Granular data model (one row per chemical–endpoint pair)\n"
|
| 720 |
+
"- Change schema to return `records: [ {chemical, endpoint, ...} ]`\n"
|
| 721 |
+
"- Flatten into wide table; vertical viewer targets a single record\n\n"
|
| 722 |
+
"### 2) Stronger grounding & verification\n"
|
| 723 |
+
"- Require evidence per field (already)\n"
|
| 724 |
+
"- Add automatic evidence verification (quote must exist in excerpt)\n"
|
| 725 |
+
"- Add `UNVERIFIED` flags + force empty values when evidence fails\n\n"
|
| 726 |
+
"### 3) Controlled vocab expansion & mapping\n"
|
| 727 |
+
"- Add synonym lists and preferred terms\n"
|
| 728 |
+
"- Map extracted terms into: FDA taxonomy / OECD endpoints / MedDRA-like groupings\n"
|
| 729 |
+
"- Add a vocab editor + import/export vocab JSON\n\n"
|
| 730 |
+
"### 4) Column transforms (structured parsing)\n"
|
| 731 |
+
"- Parse dose metrics into `{metric, value, unit, route, duration}`\n"
|
| 732 |
+
"- Normalize units (e.g., mg/kg/day)\n"
|
| 733 |
+
"- Auto-split multi-chemical text into canonical list\n\n"
|
| 734 |
+
"### 5) Multi-document compare mode\n"
|
| 735 |
+
"- Compare by chemical or endpoint\n"
|
| 736 |
+
"- Create a consensus + disagreements table\n\n"
|
| 737 |
+
"### 6) PDF limitations\n"
|
| 738 |
+
"- Current: **text-based PDFs only**\n"
|
| 739 |
+
"- Optional future: OCR for scanned PDFs (adds heavy dependencies)\n"
|
| 740 |
+
)
|
| 741 |
+
|
| 742 |
# Script entry point: honor a platform-provided PORT (e.g. container/Space
# routing) and fall back to Gradio's default 7860; bind on all interfaces so
# the app is reachable from outside the container.
if __name__ == "__main__":
    port = int(os.environ.get("PORT", "7860"))
    demo.queue().launch(server_name="0.0.0.0", server_port=port)
|