Spaces:

hchevva
/

NLP_Project

Runtime error

App Files Community

hchevva committed on Feb 1

Commit

50467c5

verified ·

1 Parent(s): 4a6dfec

Update app.py

Browse files

Files changed (1) hide show

app.py +512 -406

app.py CHANGED Viewed

@@ -1,511 +1,617 @@
 import os
 import re
 import math
 import tempfile
 from pathlib import Path
-from typing import Dict, List, Tuple
 import gradio as gr
 import numpy as np
 import pandas as pd
-import nltk
-from nltk.sentiment import SentimentIntensityAnalyzer
 from pypdf import PdfReader
 from sklearn.feature_extraction.text import TfidfVectorizer
-import matplotlib.pyplot as plt
-import seaborn as sns
-from wordcloud import WordCloud
-from sumy.parsers.plaintext import PlaintextParser
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.summarizers.text_rank import TextRankSummarizer
 # -----------------------------
-# NLTK setup (downloads once)
 # -----------------------------
-_NLTK_READY = False
-def ensure_nltk():
-    global _NLTK_READY
-    if _NLTK_READY:
-        return
-    nltk.download("punkt", quiet=True)
-    nltk.download("punkt_tab", quiet=True)  # some envs need this
-    nltk.download("vader_lexicon", quiet=True)
-    _NLTK_READY = True
 # -----------------------------
-# PDF extraction
 # -----------------------------
-def extract_text_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[str, int]:
-    """
-    Returns (text, page_count). max_pages=0 means all pages.
-    Note: scanned-image PDFs may yield little/no text.
-    """
     reader = PdfReader(pdf_path)
     page_count = len(reader.pages)
-    pages_to_read = page_count if (max_pages is None or max_pages <= 0) else min(page_count, max_pages)
-    parts = []
     for i in range(pages_to_read):
         try:
             t = reader.pages[i].extract_text() or ""
         except Exception:
             t = ""
-        if t.strip():
-            parts.append(t)
-    return "\n".join(parts).strip(), page_count
-# -----------------------------
-# Utilities
-# -----------------------------
-def clean_whitespace(text: str) -> str:
-    text = text or ""
-    text = text.replace("\x00", " ")
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-def split_into_chunks(text: str, chunk_chars: int = 3000) -> List[str]:
     """
-    Chunk by sentences into ~chunk_chars blocks.
     """
-    text = text or ""
-    if not text.strip():
-        return []
-    sentences = nltk.sent_tokenize(text)
     chunks = []
-    cur = []
     cur_len = 0
-    for s in sentences:
-        s = s.strip()
-        if not s:
             continue
-        if cur_len + len(s) + 1 > chunk_chars and cur:
-            chunks.append(" ".join(cur))
-            cur = [s]
-            cur_len = len(s)
         else:
-            cur.append(s)
-            cur_len += len(s) + 1
-    if cur:
-        chunks.append(" ".join(cur))
     return chunks
-def vader_doc_sentiment(text: str, chunk_chars: int = 3000) -> Tuple[float, str, List[float]]:
-    """
-    Returns: (avg_compound_score, label, chunk_scores)
-    """
-    ensure_nltk()
-    sia = SentimentIntensityAnalyzer()
-    chunks = split_into_chunks(text, chunk_chars=chunk_chars)
-    if not chunks:
-        return 0.0, "Neutral", []
-    scores = [sia.polarity_scores(c).get("compound", 0.0) for c in chunks]
-    avg = float(np.mean(scores))
-    if avg >= 0.05:
-        label = "Positive"
-    elif avg <= -0.05:
-        label = "Negative"
-    else:
-        label = "Neutral"
-    return avg, label, scores
-def extract_keywords_tfidf(text: str, top_k: int = 20) -> List[Tuple[str, float]]:
-    """
-    TF-IDF keywords for a single document.
-    Uses unigrams + bigrams; returns list of (term, score).
-    """
-    text = text or ""
-    if not text.strip():
-        return []
-    vectorizer = TfidfVectorizer(
-        stop_words="english",
-        ngram_range=(1, 2),
-        max_features=5000
-    )
-    X = vectorizer.fit_transform([text])
-    feats = np.array(vectorizer.get_feature_names_out())
-    scores = X.toarray().ravel()
-    if scores.size == 0:
-        return []
-    idx = np.argsort(scores)[::-1]
-    idx = idx[: max(1, int(top_k))]
-    return [(feats[i], float(scores[i])) for i in idx if scores[i] > 0]
-def make_wordcloud_figure(text: str):
-    text = text or ""
-    if not text.strip():
-        return None
-    wc = WordCloud(width=1200, height=600, background_color="white").generate(text)
-    fig = plt.figure(figsize=(10, 5))
-    ax = fig.add_subplot(111)
-    ax.imshow(wc, interpolation="bilinear")
-    ax.axis("off")
-    fig.tight_layout()
-    return fig
-def textrank_summary(text: str, num_sentences: int = 6) -> str:
-    text = (text or "").strip()
-    if not text:
-        return ""
-    num_sentences = max(1, int(num_sentences))
-    parser = PlaintextParser.from_string(text, Tokenizer("english"))
-    summarizer = TextRankSummarizer()
-    sents = summarizer(parser.document, num_sentences)
-    return " ".join(str(s) for s in sents)
-def detect_title(text: str) -> str:
-    """
-    Heuristic: pick the first 'strong' line from the first ~30 lines.
-    """
-    raw = text or ""
-    lines = [l.strip() for l in raw.splitlines() if l.strip()]
-    lines = lines[:30]
-    for l in lines:
-        if 8 <= len(l) <= 200 and not l.lower().startswith(("abstract", "introduction")):
-            # avoid obvious author lines
-            if not re.search(r"\b(university|department|email|corresponding)\b", l.lower()):
-                return l
-    return lines[0] if lines else ""
-def extract_abstract(text: str) -> str:
-    """
-    Try: ABSTRACT ... INTRODUCTION
-    """
-    t = text or ""
-    m = re.search(r"\babstract\b(.*?)(\bintroduction\b|\b1\.\s*introduction\b)", t, flags=re.IGNORECASE | re.DOTALL)
-    if not m:
-        return ""
-    abs_text = clean_whitespace(m.group(1))
-    # keep reasonable length
-    return abs_text[:2000]
-def extract_section_headings(text: str, max_headings: int = 20) -> List[str]:
     """
-    Simple heading heuristic:
-    - Lines that look like: "1. Introduction", "2 Methods", "RESULTS", etc.
     """
-    lines = [l.strip() for l in (text or "").splitlines()]
-    headings = []
-    for l in lines:
-        if not l or len(l) > 120:
             continue
-        if re.match(r"^\d+(\.\d+)*\s+[A-Z].{2,}$", l):
-            headings.append(l)
-        elif l.isupper() and 4 <= len(l) <= 60:
-            headings.append(l)
-        if len(headings) >= max_headings:
-            break
-    # dedupe while preserving order
-    seen = set()
-    out = []
-    for h in headings:
-        key = h.lower()
-        if key not in seen:
-            seen.add(key)
-            out.append(h)
-    return out
-def detect_cas_numbers(text: str) -> List[str]:
-    """
-    CAS format: 2-7 digits - 2 digits - 1 digit
-    """
-    cas = re.findall(r"\b\d{2,7}-\d{2}-\d\b", text or "")
-    # unique preserve order
-    seen = set()
-    out = []
-    for c in cas:
-        if c not in seen:
-            seen.add(c)
-            out.append(c)
-    return out
-TOX_TERMS = [
-    "hazard", "risk", "exposure", "dose", "response", "toxicity",
-    "adverse", "noael", "loael", "benchmark dose", "bmd", "bmdl",
-    "carcinogenic", "mutagen", "genotoxic", "teratogenic",
-    "lc50", "ld50", "in vitro", "in vivo", "metabolite"
-]
-def tox_term_counts(text: str) -> List[Tuple[str, int]]:
-    t = (text or "").lower()
-    counts = []
-    for term in TOX_TERMS:
-        c = len(re.findall(r"\b" + re.escape(term) + r"\b", t))
-        if c > 0:
-            counts.append((term, c))
-    counts.sort(key=lambda x: x[1], reverse=True)
-    return counts
 # -----------------------------
-# Batch pipeline + reporting
 # -----------------------------
-def build_context_report(
-    filename: str,
-    title: str,
-    pages: int,
-    word_count: int,
-    sent_score: float,
-    sent_label: str,
-    keywords: List[Tuple[str, float]],
-    abstract: str,
-    headings: List[str],
-    summary: str,
-    cas: List[str],
-    tox_counts: List[Tuple[str, int]]
-) -> str:
-    kw = ", ".join([k for k, _ in keywords[:15]]) if keywords else "(none)"
-    cas_str = ", ".join(cas[:15]) + (" ..." if len(cas) > 15 else "") if cas else "(none)"
-    headings_str = "\n".join([f"- {h}" for h in headings]) if headings else "- (none detected)"
-    tox_str = "\n".join([f"- {t}: {c}" for t, c in tox_counts[:12]]) if tox_counts else "- (none detected)"
-    abs_block = abstract if abstract else "(abstract not detected)"
-    sum_block = summary if summary else "(summary unavailable)"
-    return f"""## {filename}
-**Title (heuristic):** {title or "(not detected)"}
-**Pages:** {pages}
-**Approx. word count:** {word_count:,}
-### Sentiment / Tone
-- **Average compound score:** {sent_score:.3f}
-- **Label:** **{sent_label}**
-> Interpretation note: for research papers, this is best read as *tone polarity* rather than emotion.
-### Keywords (TF-IDF)
-{kw}
-### Abstract (if detected)
-{abs_block}
-### Extractive summary (TextRank)
-{sum_block}
-### Section outline (heuristic)
-{headings_str}
-### CAS numbers detected
-{cas_str}
-### Toxicology concept coverage
-{tox_str}
-"""
-def analyze_pdfs(files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud):
-    ensure_nltk()
     if not files:
-        return None, None, [], "", None, None, None, "Upload one or more PDFs."
-    top_k_keywords = int(top_k_keywords)
-    summary_sentences = int(summary_sentences)
-    chunk_chars = int(chunk_chars)
-    max_pages = int(max_pages)
-    results_rows = []
-    details: Dict[str, Dict] = {}
-    tmpdir = Path(tempfile.mkdtemp(prefix="tox_paper_nlp_"))
     for f in files:
         pdf_path = f.name
         filename = os.path.basename(pdf_path)
-        raw_text, pages = extract_text_from_pdf(pdf_path, max_pages=max_pages)
-        raw_text = raw_text or ""
-        word_count = len(clean_whitespace(raw_text).split())
-        # sentiment
-        sent_score, sent_label, chunk_scores = vader_doc_sentiment(raw_text, chunk_chars=chunk_chars)
-        # keywords + summary + context
-        keywords = extract_keywords_tfidf(raw_text, top_k=top_k_keywords)
-        abstract = extract_abstract(raw_text)
-        title = detect_title(raw_text)
-        headings = extract_section_headings(raw_text)
-        summary = textrank_summary(raw_text, num_sentences=summary_sentences)
-        cas = detect_cas_numbers(raw_text)
-        tox_counts = tox_term_counts(raw_text)
-        report_md = build_context_report(
-            filename=filename,
-            title=title,
-            pages=pages,
-            word_count=word_count,
-            sent_score=sent_score,
-            sent_label=sent_label,
-            keywords=keywords,
-            abstract=abstract,
-            headings=headings,
-            summary=summary,
-            cas=cas,
-            tox_counts=tox_counts
-        )
-        # Save extracted text + per-doc JSON for portability
-        txt_path = tmpdir / f"{Path(filename).stem}.txt"
-        txt_path.write_text(raw_text, encoding="utf-8", errors="ignore")
-        details[filename] = {
-            "filename": filename,
-            "pages": pages,
-            "word_count": word_count,
-            "sentiment_score": sent_score,
-            "sentiment_label": sent_label,
-            "chunk_scores": chunk_scores,
-            "keywords": keywords,
-            "abstract": abstract,
-            "title": title,
-            "headings": headings,
-            "summary": summary,
-            "cas_numbers": cas,
-            "tox_term_counts": tox_counts,
-            "report_md": report_md,
-            "text_path": str(txt_path),
-            "raw_text_preview": (raw_text[:6000] + " ...") if len(raw_text) > 6000 else raw_text
-        }
-        results_rows.append({
             "file": filename,
-            "pages": pages,
-            "word_count": word_count,
-            "sentiment_score": round(sent_score, 4),
-            "sentiment_label": sent_label,
-            "top_keywords": ", ".join([k for k, _ in keywords[:10]]),
-            "cas_count": len(cas),
-        })
-    df = pd.DataFrame(results_rows).sort_values(["sentiment_score", "word_count"], ascending=[False, False])
-    # Save table as CSV for download
-    csv_path = tmpdir / "pdf_nlp_results.csv"
     df.to_csv(csv_path, index=False)
-    # Populate doc selector and default view
-    doc_names = list(details.keys())
-    first = doc_names[0]
-    state = details
-    report_md = details[first]["report_md"]
-    # sentiment distribution plot for first doc
-    fig_sent = None
-    scores = details[first]["chunk_scores"]
-    if scores:
-        fig_sent = plt.figure()
-        ax = fig_sent.add_subplot(111)
-        sns.histplot(scores, kde=True, ax=ax)
-        ax.set_title(f"Chunk Sentiment Distribution: {first}")
-        ax.set_xlabel("VADER compound score")
-        ax.set_ylabel("Chunk count")
-        fig_sent.tight_layout()
-    fig_wc = None
-    if make_wordcloud:
-        fig_wc = make_wordcloud_figure(details[first]["raw_text_preview"])
-    return df, str(csv_path), doc_names, report_md, fig_sent, fig_wc, details[first]["raw_text_preview"], "Done."
-def render_doc(doc_name, state, make_wordcloud):
-    if not state or not doc_name or doc_name not in state:
-        return "", None, None, ""
-    d = state[doc_name]
-    report_md = d["report_md"]
-    preview = d["raw_text_preview"]
-    fig_sent = None
-    scores = d.get("chunk_scores", [])
-    if scores:
-        fig_sent = plt.figure()
-        ax = fig_sent.add_subplot(111)
-        sns.histplot(scores, kde=True, ax=ax)
-        ax.set_title(f"Chunk Sentiment Distribution: {doc_name}")
-        ax.set_xlabel("VADER compound score")
-        ax.set_ylabel("Chunk count")
-        fig_sent.tight_layout()
-    fig_wc = None
-    if make_wordcloud:
-        fig_wc = make_wordcloud_figure(preview)
-    return report_md, fig_sent, fig_wc, preview
 # -----------------------------
 # Gradio UI
 # -----------------------------
-with gr.Blocks(title="Toxicology PDF NLP Analyzer") as demo:
-    gr.Markdown("# Toxicology PDF NLP Analyzer")
-    state = gr.State({})
-    with gr.Tab("Batch (Upload PDFs)"):
         files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
         with gr.Row():
-            top_k_keywords = gr.Slider(5, 50, value=20, step=1, label="Top keywords (TF-IDF)")
-            summary_sentences = gr.Slider(2, 12, value=6, step=1, label="Summary sentences (TextRank)")
-        with gr.Row():
-            chunk_chars = gr.Slider(800, 8000, value=3000, step=100, label="Chunk size for sentiment (chars)")
             max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
-        make_wordcloud = gr.Checkbox(label="Generate word cloud", value=True)
-        run_btn = gr.Button("Analyze PDFs")
         status = gr.Textbox(label="Status", interactive=False)
-        results_df = gr.Dataframe(label="Batch Results", interactive=False)
-        results_csv = gr.File(label="Download: results CSV")
-        with gr.Row():
-            doc_selector = gr.Dropdown(label="Select a document for details", choices=[], value=None)
-        report_md = gr.Markdown()
-        sent_plot = gr.Plot(label="Sentiment Distribution (by chunk)")
-        wc_plot = gr.Plot(label="Word Cloud")
-        raw_preview = gr.Textbox(label="Extracted text preview (first ~6k chars)", lines=10)
-        run_btn.click(
-            fn=analyze_pdfs,
-            inputs=[files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud],
-            outputs=[results_df, results_csv, doc_selector, report_md, sent_plot, wc_plot, raw_preview, status]
-        ).then(
-            fn=lambda d: d, inputs=None, outputs=state
         )
-        # Update details view on selection change
-        doc_selector.change(
-            fn=render_doc,
-            inputs=[doc_selector, state, make_wordcloud],
-            outputs=[report_md, sent_plot, wc_plot, raw_preview]
         )
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", "7860"))
-    demo.launch(server_name="0.0.0.0", server_port=port)

 import os
 import re
+import json
 import math
 import tempfile
 from pathlib import Path
+from typing import Dict, List, Tuple, Any
 import gradio as gr
 import numpy as np
 import pandas as pd
 from pypdf import PdfReader
 from sklearn.feature_extraction.text import TfidfVectorizer
+from openai import OpenAI  # OpenAI Responses API client
 # -----------------------------
+# Defaults
 # -----------------------------
+DEFAULT_CONTROLLED_VOCAB_JSON = """{
+  "risk_stance_enum": ["high_concern","moderate_concern","low_concern","inconclusive","not_assessed"],
+  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
+  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
+  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
+  "endpoint_terms": ["hepatotoxicity","nephrotoxicity","neurotoxicity","immunotoxicity","reproductive_toxicity","developmental_toxicity","genotoxicity","carcinogenicity","endocrine_activity","respiratory_toxicity","dermal_toxicity","hematotoxicity","cytotoxicity","oxidative_stress","inflammation"],
+  "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
+  "risk_language_terms": ["adverse_effect","no_adverse_effect_observed","increased_risk","safe_at_tested_dose","insufficient_evidence","uncertainty_high"]
+}"""
+DEFAULT_FIELD_SPEC = """# One field per line:  Field Name | type | instructions | optional: enum values
+# types: str, num, bool, list[str], list[num], enum[a,b,c]
+Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
+CAS_numbers | list[str] | Extract any CAS numbers mentioned.
+Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
+Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
+Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
+Key_endpoints | list[str] | Extract endpoints; prefer controlled vocab terms if applicable.
+Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
+Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
+Conclusion | str | What does the paper conclude about safety/risk?
+"""
 # -----------------------------
+# PDF extraction (page-aware)
 # -----------------------------
+def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
     reader = PdfReader(pdf_path)
     page_count = len(reader.pages)
+    pages_to_read = page_count if (max_pages is None or max_pages <= 0) else min(page_count, int(max_pages))
+    pages: List[Tuple[int, str]] = []
     for i in range(pages_to_read):
         try:
             t = reader.pages[i].extract_text() or ""
         except Exception:
             t = ""
+        t = (t or "").strip()
+        pages.append((i + 1, t))
+    return pages, page_count
+def clean_text(t: str) -> str:
+    t = t or ""
+    t = t.replace("\x00", " ")
+    t = re.sub(r"\s+", " ", t).strip()
+    return t
+def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
     """
+    Build chunks with page ranges, roughly target_chars each.
     """
     chunks = []
+    buf = []
+    start_page = None
     cur_len = 0
+    for pno, txt in pages:
+        txt = clean_text(txt)
+        if not txt:
             continue
+        if start_page is None:
+            start_page = pno
+        # If adding this page exceeds chunk size, flush
+        if cur_len + len(txt) + 1 > target_chars and buf:
+            end_page = (pno - 1) if (pno - 1) >= start_page else start_page
+            chunks.append(
+                {"pages": f"{start_page}-{end_page}", "text": " ".join(buf)}
+            )
+            buf = [txt]
+            start_page = pno
+            cur_len = len(txt)
         else:
+            buf.append(txt)
+            cur_len += len(txt) + 1
+    if buf and start_page is not None:
+        end_page = pages[-1][0]
+        chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
     return chunks
+# -----------------------------
+# Lightweight retrieval (TF-IDF) to select relevant excerpts
+# -----------------------------
+def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top_per_query: int = 2, max_chunks: int = 10) -> List[Dict[str, Any]]:
+    texts = [c["text"] for c in chunks]
+    if not texts:
+        return []
+    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
+    X = vectorizer.fit_transform(texts)
+    selected_idx = []
+    for q in queries:
+        q = (q or "").strip()
+        if not q:
+            continue
+        qv = vectorizer.transform([q])
+        sims = (X @ qv.T).toarray().ravel()  # cosine-like (not normalized), good enough for ranking
+        idx = np.argsort(sims)[::-1]
+        for i in idx[:top_per_query]:
+            if i not in selected_idx:
+                selected_idx.append(i)
+    # fallback: if nothing selected, take first few chunks
+    if not selected_idx:
+        selected_idx = list(range(min(len(chunks), max_chunks)))
+    selected = [chunks[i] for i in selected_idx[:max_chunks]]
+    return selected
+def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
+    parts = []
+    total = 0
+    for c in selected_chunks:
+        block = f"[pages {c['pages']}]\n{c['text']}\n"
+        if total + len(block) > max_chars:
+            break
+        parts.append(block)
+        total += len(block)
+    return "\n".join(parts).strip()
+# -----------------------------
+# User-defined extraction spec -> JSON Schema
+# -----------------------------
+def slugify_field(name: str) -> str:
+    name = name.strip()
+    name = re.sub(r"[^\w\s-]", "", name)
+    name = re.sub(r"[\s-]+", "_", name).lower()
+    return name[:60] if name else "field"
+def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, str]]:
     """
+    spec lines: Field Name | type | instructions
+    Returns: properties dict, required list, instructions map (field_key -> instruction)
     """
+    props = {}
+    required = []
+    instr = {}
+    for raw_line in (spec or "").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
             continue
+        parts = [p.strip() for p in line.split("|")]
+        if len(parts) < 2:
+            continue
+        field_name = parts[0]
+        ftype = parts[1]
+        finstr = parts[2] if len(parts) >= 3 else ""
+        is_required = False
+        if field_name.startswith("*"):
+            is_required = True
+            field_name = field_name[1:].strip()
+        key = slugify_field(field_name)
+        instr[key] = finstr
+        schema = {"type": "string"}
+        if ftype == "str":
+            schema = {"type": "string"}
+        elif ftype == "num":
+            schema = {"type": "number"}
+        elif ftype == "bool":
+            schema = {"type": "boolean"}
+        elif ftype.startswith("list[str]"):
+            schema = {"type": "array", "items": {"type": "string"}}
+        elif ftype.startswith("list[num]"):
+            schema = {"type": "array", "items": {"type": "number"}}
+        elif ftype.startswith("enum[") and ftype.endswith("]"):
+            inside = ftype[len("enum["):-1].strip()
+            vals = [v.strip() for v in inside.split(",") if v.strip()]
+            schema = {"type": "string", "enum": vals}
+        else:
+            schema = {"type": "string"}
+        props[key] = schema
+        if is_required:
+            required.append(key)
+    # If user didn’t mark required fields, keep it permissive
+    return props, required, instr
+def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[str], vocab: Dict[str, Any]) -> Dict[str, Any]:
+    risk_enum = vocab.get("risk_stance_enum", ["high_concern","moderate_concern","low_concern","inconclusive","not_assessed"])
+    schema = {
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {
+            "paper_title": {"type": "string"},
+            "risk_stance": {"type": "string", "enum": risk_enum},
+            "risk_confidence": {"type": "number", "minimum": 0, "maximum": 1},
+            "risk_summary": {"type": "string"},
+            "extracted": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": field_props,
+                "required": required_fields
+            },
+            "evidence": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "additionalProperties": False,
+                    "properties": {
+                        "field": {"type": "string"},
+                        "quote": {"type": "string"},
+                        "pages": {"type": "string"}
+                    },
+                    "required": ["field", "quote", "pages"]
+                }
+            }
+        },
+        "required": ["paper_title", "risk_stance", "risk_confidence", "risk_summary", "extracted", "evidence"]
+    }
+    return schema
 # -----------------------------
+# OpenAI call (Responses API + Structured Outputs)
 # -----------------------------
+def get_openai_client(api_key: str) -> OpenAI:
+    key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
+    if not key:
+        raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY.")
+    return OpenAI(api_key=key)
+def openai_structured_extract(
+    client: OpenAI,
+    model: str,
+    schema: Dict[str, Any],
+    controlled_vocab: Dict[str, Any],
+    field_instructions: Dict[str, str],
+    context: str
+) -> Dict[str, Any]:
+    # Build instruction text for the model
+    field_instr_lines = []
+    for k, v in field_instructions.items():
+        if v:
+            field_instr_lines.append(f"- {k}: {v}")
+        else:
+            field_instr_lines.append(f"- {k}: (no extra instructions)")
+    vocab_text = json.dumps(controlled_vocab, indent=2)
+    system_msg = (
+        "You are a toxicology research paper data-extraction assistant.\n"
+        "Rules:\n"
+        "1) Use ONLY the provided excerpts; do not invent details.\n"
+        "2) If a value is not stated, use an empty string, empty list, or 'not_reported' if the enum allows it.\n"
+        "3) Always include evidence quotes with page ranges (from excerpt headers).\n"
+        "4) risk_stance reflects overall concern from the paper's findings (high/moderate/low/inconclusive/not_assessed).\n"
+        "5) Prefer controlled vocabulary terms when applicable.\n"
+    )
+    user_msg = (
+        "CONTROLLED VOCAB (JSON):\n"
+        f"{vocab_text}\n\n"
+        "FIELD INSTRUCTIONS:\n"
+        + "\n".join(field_instr_lines)
+        + "\n\n"
+        "EXCERPTS:\n"
+        f"{context}\n"
+    )
+    resp = client.responses.create(
+        model=model,
+        input=[
+            {"role": "system", "content": system_msg},
+            {"role": "user", "content": user_msg}
+        ],
+        text={
+            "format": {
+                "type": "json_schema",
+                "name": "tox_extraction",
+                "schema": schema,
+                "strict": True
+            }
+        }
+    )
+    # Structured outputs: JSON is in output_text
+    out = resp.output_text
+    return json.loads(out)
+def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
+    system_msg = (
+        "You are a senior toxicology scientist summarizing multiple papers.\n"
+        "Produce a concise synthesis for researchers: consensus, disagreements, data gaps, and next steps.\n"
+        "Base your synthesis strictly on the provided extracted JSON (which itself is evidence-backed).\n"
+    )
+    user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
+    resp = client.responses.create(
+        model=model,
+        input=[
+            {"role": "system", "content": system_msg},
+            {"role": "user", "content": user_msg}
+        ]
+    )
+    return resp.output_text
+def openai_suggest_vocab_additions(client: OpenAI, model: str, current_vocab: Dict[str, Any], context: str) -> Dict[str, Any]:
+    schema = {
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {
+            "additions": {
+                "type": "object",
+                "additionalProperties": {
+                    "type": "array",
+                    "items": {"type": "string"}
+                }
+            },
+            "notes": {"type": "string"}
+        },
+        "required": ["additions", "notes"]
+    }
+    system_msg = (
+        "You propose controlled-vocabulary additions for toxicology paper extraction.\n"
+        "Return only new candidate terms grouped under keys that already exist or new keys if needed.\n"
+        "Avoid duplicates already in current vocab.\n"
+    )
+    user_msg = (
+        "CURRENT_VOCAB_JSON:\n"
+        + json.dumps(current_vocab, indent=2)
+        + "\n\n"
+        "EXCERPTS:\n"
+        + context
+    )
+    resp = client.responses.create(
+        model=model,
+        input=[
+            {"role": "system", "content": system_msg},
+            {"role": "user", "content": user_msg}
+        ],
+        text={
+            "format": {
+                "type": "json_schema",
+                "name": "vocab_additions",
+                "schema": schema,
+                "strict": True
+            }
+        }
+    )
+    return json.loads(resp.output_text)
+# -----------------------------
+# Gradio handlers
+# -----------------------------
def _placeholder_extraction(field_props, summary):
    """Return an empty extraction record whose ``risk_summary`` is *summary*.

    Array-typed fields get an empty list, every other field an empty string,
    so the record has the same keys as a real extraction.
    """
    return {
        "paper_title": "",
        "risk_stance": "not_assessed",
        "risk_confidence": 0.0,
        "risk_summary": summary,
        "extracted": {
            name: ([] if props.get("type") == "array" else "")
            for name, props in field_props.items()
        },
        "evidence": [],
    }


def _flatten_extraction(filename, extracted, field_props):
    """Collapse one extraction record into a flat dict usable as a table row."""
    row = {
        "file": filename,
        "paper_title": extracted.get("paper_title", ""),
        "risk_stance": extracted.get("risk_stance", ""),
        "risk_confidence": extracted.get("risk_confidence", ""),
        "risk_summary": extracted.get("risk_summary", ""),
    }
    values = extracted.get("extracted") or {}
    for name in field_props:
        value = values.get(name, "")
        # List-valued fields are joined so each occupies a single table cell.
        row[name] = "; ".join(str(x) for x in value) if isinstance(value, list) else value
    return row


def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars):
    """Extract structured fields from every uploaded PDF into a table.

    Args:
        files: Uploaded PDFs; each item is either an object exposing the file
            path as ``.name`` or a plain path string.
        api_key: Forwarded to ``get_openai_client``.
        model: Model name forwarded to ``openai_structured_extract``.
        field_spec: Field specification text; ``DEFAULT_FIELD_SPEC`` is used
            when it is empty.
        vocab_json: Controlled vocabulary as a JSON string;
            ``DEFAULT_CONTROLLED_VOCAB_JSON`` is used when it is empty.
        max_pages: Page limit passed to ``extract_pages_from_pdf``.
        chunk_chars: Target chunk size passed to ``chunk_pages``.
        max_context_chars: Character budget passed to ``build_context``.

    Returns:
        A 4-tuple ``(dataframe, csv_path, json_path, status)``. When the
        inputs are unusable (no files, invalid vocab JSON, empty field spec,
        client creation failure) the first three items are ``None`` and
        ``status`` carries the reason.

    A failure while processing one PDF no longer aborts the batch: that file
    gets a placeholder row whose ``risk_summary`` holds the error, and the
    status message reports how many files failed.
    """
    if not files:
        return None, None, None, "Upload one or more PDFs."

    try:
        vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
    except (TypeError, ValueError) as e:  # JSONDecodeError subclasses ValueError
        return None, None, None, f"Controlled vocab JSON is invalid: {e}"

    field_props, required_fields, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
    if not field_props:
        return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions"

    schema = build_extraction_schema(field_props, required_fields, vocab)

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return None, None, None, str(e)

    # Slider values may arrive as floats; coerce once instead of per file.
    page_limit = int(max_pages)
    chunk_size = int(chunk_chars)
    context_limit = int(max_context_chars)

    # Retrieval queries (risk stance + one per field) are the same for every
    # file, so build them once. A field without instructions falls back to
    # its own name.
    queries = [
        "risk stance hazard risk conclusion adverse effect noael loael bmd bmdl ld50 lc50 safety concern",
    ]
    queries.extend(ins or name for name, ins in field_instr.items())

    results = []
    flat_rows = []
    failures = 0

    for f in files:
        pdf_path = getattr(f, "name", None) or str(f)
        filename = os.path.basename(pdf_path)
        page_count = 0
        try:
            pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=page_limit)
            chunks = chunk_pages(pages, target_chars=chunk_size)
            selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
            context = build_context(selected, max_chars=context_limit)
            if context.strip():
                extracted = openai_structured_extract(
                    client=client,
                    model=model,
                    schema=schema,
                    controlled_vocab=vocab,
                    field_instructions=field_instr,
                    context=context,
                )
            else:
                # Nothing extractable (scanned or empty PDF).
                extracted = _placeholder_extraction(
                    field_props, "No text extracted from PDF (may be scanned)."
                )
        except Exception as e:
            # Handler boundary: keep the rows already produced and surface
            # this file's error in its own row instead of crashing the batch.
            failures += 1
            extracted = _placeholder_extraction(field_props, f"Extraction failed: {e}")

        extracted["_file"] = filename
        extracted["_pages_in_pdf"] = page_count
        results.append(extracted)
        flat_rows.append(_flatten_extraction(filename, extracted, field_props))

    df = pd.DataFrame(flat_rows)

    tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
    csv_path = tmpdir / "extraction_table.csv"
    json_path = tmpdir / "extraction_details.json"
    df.to_csv(csv_path, index=False)
    json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")

    status = "Done. Download the CSV table (productivity output) and JSON details (evidence + structure)."
    if failures:
        status += f" {failures} of {len(files)} file(s) failed; see risk_summary for details."
    return df, str(csv_path), str(json_path), status
def run_synthesis(api_key, model, extraction_json_file):
    """Synthesize findings across papers from a saved ``extraction_details.json``.

    Args:
        api_key: Forwarded to ``get_openai_client``.
        model: Model name forwarded to ``openai_synthesize_across_papers``.
        extraction_json_file: The uploaded JSON file, either an object
            exposing the path as ``.name`` or a plain path string; ``None``
            when nothing was uploaded.

    Returns:
        The text returned by ``openai_synthesize_across_papers``, or a
        human-readable message when the upload is missing or unreadable, the
        client cannot be created, or the synthesis call fails.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json first (from the extraction step)."

    # Validate the local file before building a client: it is cheap, needs no
    # credentials, and a corrupt upload should not surface as a crash.
    json_path = getattr(extraction_json_file, "name", None) or str(extraction_json_file)
    try:
        rows = json.loads(Path(json_path).read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:  # missing file, bad encoding, bad JSON
        return f"Could not read extraction JSON: {e}"

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)

    try:
        return openai_synthesize_across_papers(client, model, rows)
    except Exception as e:
        # Handler boundary: report the failure in the output panel.
        return f"Synthesis failed: {e}"
def _merge_vocab_additions(vocab, additions):
    """Return a copy of *vocab* with the term lists in *additions* merged in.

    New keys are created as needed, duplicate terms are skipped, and entries
    whose existing or proposed value is not a list are left alone. The lists
    in *vocab* are copied, so the input is not mutated.
    """
    merged = {key: (list(val) if isinstance(val, list) else val) for key, val in vocab.items()}
    if not isinstance(additions, dict):
        return merged
    for key, terms in additions.items():
        if not isinstance(terms, list):
            continue
        bucket = merged.setdefault(key, [])
        if not isinstance(bucket, list):
            continue
        for term in terms:
            if term not in bucket:
                bucket.append(term)
    return merged


def suggest_vocab(api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars):
    """Propose controlled-vocabulary additions from the first two uploaded PDFs.

    Args:
        api_key: Forwarded to ``get_openai_client``.
        model: Model name forwarded to ``openai_suggest_vocab_additions``.
        vocab_json: Current vocabulary as a JSON string;
            ``DEFAULT_CONTROLLED_VOCAB_JSON`` is used when it is empty.
        files: Uploaded PDFs; each item is either an object exposing the path
            as ``.name`` or a plain path string. Only the first two are read.
        max_pages: Page limit passed to ``extract_pages_from_pdf``.
        chunk_chars: Target chunk size passed to ``chunk_pages``.
        max_context_chars: Character budget for each document's context and
            for the combined context.

    Returns:
        ``(vocab_json_text, status)``. On success the first item is the
        merged vocabulary serialized with two-space indentation; on any
        problem it is the caller's ``vocab_json`` unchanged and ``status``
        explains why.
    """
    if not files:
        return vocab_json, "Upload PDFs so I can propose vocab additions from their content."

    # Check the vocabulary before creating a client: no credentials or
    # network are needed to reject malformed input.
    try:
        vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
    except (TypeError, ValueError) as e:  # JSONDecodeError subclasses ValueError
        return vocab_json, f"Controlled vocab JSON is invalid: {e}"
    if not isinstance(vocab, dict):
        # The merge below works key-by-key, so anything but an object is unusable.
        return vocab_json, "Controlled vocab JSON must be an object mapping categories to term lists."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return vocab_json, str(e)

    page_limit = int(max_pages)
    chunk_size = int(chunk_chars)
    context_limit = int(max_context_chars)

    try:
        # Build a small context from the first 1-2 docs.
        contexts = []
        for f in files[:2]:
            pdf_path = getattr(f, "name", None) or str(f)
            pages, _ = extract_pages_from_pdf(pdf_path, max_pages=page_limit)
            chunks = chunk_pages(pages, target_chars=chunk_size)
            selected = select_relevant_chunks(
                chunks,
                queries=["toxicology endpoints noael loael bmd genotoxicity carcinogenicity endocrine exposure route species"],
                top_per_query=2,
                max_chunks=8,
            )
            ctx = build_context(selected, max_chars=context_limit)
            if ctx:
                contexts.append(ctx)

        combined = "\n\n---\n\n".join(contexts)[:context_limit]
        if not combined.strip():
            # No text to ground suggestions in; skip the API call entirely.
            return vocab_json, "No text could be extracted from the PDFs (they may be scanned); vocab unchanged."

        additions = openai_suggest_vocab_additions(client, model, vocab, combined)
    except Exception as e:
        # Handler boundary: leave the vocabulary untouched and report why.
        return vocab_json, f"Vocab suggestion failed: {e}"

    proposed = additions.get("additions") if isinstance(additions, dict) else None
    merged = _merge_vocab_additions(vocab, proposed)
    return json.dumps(merged, indent=2), "Vocab updated with suggested additions. Review/edit before extracting."
 # -----------------------------
 # Gradio UI
 # -----------------------------
# Two-tab app: the first tab turns PDFs into a table, the second synthesizes
# across papers from the JSON file the first tab produces.
with gr.Blocks(title="Toxicology PDF → Table Extractor (GPT-4o)") as demo:
    gr.Markdown("# Toxicology PDF → Table Extractor (GPT-4o)")

    with gr.Tab("Extract to Table"):
        # Inputs: documents, credentials and model choice.
        files = gr.File(
            label="Upload toxicology research PDFs",
            file_count="multiple",
            file_types=[".pdf"],
        )
        api_key = gr.Textbox(
            type="password",
            label="OpenAI API key (optional if set as OPENAI_API_KEY secret)",
        )
        model = gr.Dropdown(
            choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
            value="gpt-4o-2024-08-06",
            label="Model",
        )

        # Reading and chunking limits, shown side by side.
        with gr.Row():
            max_pages = gr.Slider(
                minimum=0, maximum=200, value=0, step=1,
                label="Max pages to read (0 = all)",
            )
            chunk_chars = gr.Slider(
                minimum=1200, maximum=8000, value=3000, step=100,
                label="Chunk size (chars)",
            )
            max_context_chars = gr.Slider(
                minimum=5000, maximum=40000, value=20000, step=1000,
                label="Max context sent to GPT (chars)",
            )

        # Editable vocabulary and field specification, pre-filled with defaults.
        vocab_json = gr.Textbox(
            value=DEFAULT_CONTROLLED_VOCAB_JSON,
            lines=12,
            label="Controlled vocabulary (JSON)",
        )
        field_spec = gr.Textbox(
            value=DEFAULT_FIELD_SPEC,
            lines=10,
            label="Extraction spec (you control what fields to extract)",
        )

        with gr.Row():
            vocab_btn = gr.Button("Suggest vocab additions from PDFs")
            extract_btn = gr.Button("Run Extraction (Table)")

        # Outputs: status line, preview table and the two download files.
        status = gr.Textbox(interactive=False, label="Status")
        table = gr.Dataframe(interactive=False, label="Extracted Table (one row per paper)")
        out_csv = gr.File(label="Download: extraction_table.csv")
        out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")

        # The vocab button rewrites the vocabulary textbox in place.
        vocab_btn.click(
            fn=suggest_vocab,
            outputs=[vocab_json, status],
            inputs=[api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars],
        )

        extract_btn.click(
            fn=run_extraction,
            outputs=[table, out_csv, out_json, status],
            inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
        )

    with gr.Tab("Cross-paper Synthesis"):
        gr.Markdown("Upload the `extraction_details.json` produced by the Extract tab, then synthesize across papers.")

        # This tab has its own key and model widgets, separate from the first tab's.
        api_key2 = gr.Textbox(
            type="password",
            label="OpenAI API key (optional if set as OPENAI_API_KEY secret)",
        )
        model2 = gr.Dropdown(
            choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
            value="gpt-4o-2024-08-06",
            label="Model",
        )
        extraction_json_file = gr.File(
            label="Upload extraction_details.json",
            file_count="single",
            file_types=[".json"],
        )

        synth_btn = gr.Button("Synthesize Across Papers")
        synth_md = gr.Markdown()

        synth_btn.click(
            fn=run_synthesis,
            outputs=[synth_md],
            inputs=[api_key2, model2, extraction_json_file],
        )

if __name__ == "__main__":
    # Port comes from the PORT environment variable, defaulting to 7860.
    port = int(os.environ.get("PORT", "7860"))
    demo.queue().launch(server_port=port, server_name="0.0.0.0")