Create literature_explorer.py

literature_explorer.py (added, +605 lines)
import os
import re
import json
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
import pandas as pd
from pypdf import PdfReader
from openai import OpenAI


# =============================
# Pilot limits
# =============================
MAX_PDFS = 5
MAX_PAGES_PER_PDF = 20

MAX_CHARS_PER_PAGE_FOR_INDEX = 7000  # cap for cost/stability
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_SUMMARY_MODEL = "gpt-4o-mini"


# =============================
# Endpoint fallback inference lexicon (Explorer-only)
# =============================
ENDPOINT_HINTS: Dict[str, List[str]] = {
    "Genotoxicity (OECD TG)": [
        "genotoxic", "mutagen", "clastogen", "ames", "micronucleus", "comet assay",
        "chromosomal aberration", "dna damage", "oecd tg 471", "tg471", "oecd tg 473", "tg473",
        "oecd tg 476", "tg476", "oecd tg 487", "tg487", "oecd tg 490", "tg490",
        "oecd tg 474", "tg474", "oecd tg 475", "tg475", "oecd tg 488", "tg488",
        "oecd tg 489", "tg489"
    ],
    "NAMs / In Silico": ["in silico", "qsar", "read-across", "aop", "pbpk", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
    "Acute toxicity": ["acute toxicity", "ld50", "lc50", "single dose", "mortality", "lethality"],
    "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "noael", "loael", "28-day", "90-day", "target organ"],
    "Irritation / Sensitization": ["skin irritation", "eye irritation", "draize", "sensitization", "llna", "patch test"],
    "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogen", "prenatal", "postnatal"],
    "Carcinogenicity": ["carcinogenic", "tumor", "neoplasm", "cancer", "two-year", "bioassay"],
}


# =============================
# Organ inference (automatic only)
# =============================
ORGANS = ["liver", "lung", "kidney", "skin", "gi", "cns", "reproductive", "immune_blood", "mixed", "unknown"]

ORGAN_HINTS: Dict[str, List[str]] = {
    "liver": ["liver", "hepatic", "hepatocyte", "hepatotoxic", "bile", "cholest", "alt", "ast"],
    "lung": ["lung", "pulmonary", "bronch", "alveol", "airway", "inhalation", "respiratory"],
    "kidney": ["kidney", "renal", "nephro", "glomerul", "tubul", "creatinine", "bun"],
    "skin": ["skin", "dermal", "epiderm", "cutaneous", "topical"],
    "gi": ["gastro", "intestinal", "gut", "colon", "stomach", "oral", "ingestion"],
    "cns": ["brain", "cns", "neuro", "neuronal", "glia", "blood-brain", "dopamin", "seroton"],
    "reproductive": ["repro", "testis", "ovary", "uterus", "placent", "fetus", "embryo", "sperm", "oocyte"],
    "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
}


def infer_organ_label(doc_text: str) -> str:
    t = (doc_text or "").lower()
    scores = {k: 0 for k in ORGAN_HINTS.keys()}
    for organ, hints in ORGAN_HINTS.items():
        for h in hints:
            if h in t:
                scores[organ] += 1

    best = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    if not best or best[0][1] == 0:
        return "unknown"

    # if 2+ organs are close, label mixed
    top_org, top_score = best[0]
    if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
        return "mixed"
    return top_org
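
# Illustrative behaviour (example text is ours, not from any indexed PDF):
#   infer_organ_label("hepatic injury with elevated ALT and AST") -> "liver"
#   infer_organ_label("") -> "unknown"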


# =============================
# Curated enzymes by organ (starter list)
# =============================
ENZYMES_BY_ORGAN: Dict[str, List[str]] = {
    "liver": ["CYP1A2", "CYP2C9", "CYP2C19", "CYP2D6", "CYP2E1", "CYP3A4", "CYP3A5", "UGT1A1", "UGT2B7", "SULT1A1", "GSTA1", "GSTP1", "ADH", "ALDH", "CES1", "CES2"],
    "lung": ["CYP1A1", "CYP1B1", "CYP2F1", "GSTP1", "MPO", "ALDH"],
    "kidney": ["OAT1", "OAT3", "OCT2", "MATE1", "MATE2", "GSTP1", "GSTA1"],
    "skin": ["CYP1A1", "GSTP1", "UGT1A1", "SULT1A1", "ESTERASE", "CES1", "CES2"],
    "gi": ["CYP3A4", "UGT1A1", "UGT2B7", "SULT1A1", "ABCB1", "P-GP", "CES1", "CES2"],
    "cns": ["MAO-A", "MAO-B", "MAOA", "MAOB", "COMT", "ALDH"],
    "reproductive": ["AROMATASE", "CYP19A1", "HSD17B", "CYP17A1", "UGT2B7"],
    "immune_blood": ["MPO", "COX", "PTGS1", "PTGS2", "LOX", "ALOX5"],
    "mixed": [],
    "unknown": [],
}

# conservative regex patterns, each paired with the family prefix used to
# rebuild a canonical name from the captured group
ENZYME_REGEXES: List[Tuple[str, re.Pattern]] = [
    ("CYP", re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE)),
    ("UGT", re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE)),
    ("SULT", re.compile(r"\bSULT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE)),
    ("GST", re.compile(r"\bGST\s?([A-Z0-9]+)\b", re.IGNORECASE)),
    ("EC ", re.compile(r"\bEC\s?(\d+\.\d+\.\d+\.\d+)\b", re.IGNORECASE)),
]


def detect_enzymes(text: str, organ: str) -> List[str]:
    t = text or ""
    up = t.upper()

    base = ENZYMES_BY_ORGAN.get(organ, [])
    if organ in ("mixed", "unknown"):
        base = ["CYP3A4", "CYP2D6", "CYP2E1", "UGT1A1", "SULT1A1", "GSTP1", "ALDH", "ADH"]

    out: List[str] = []
    for e in base:
        if e in up:
            out.append(e)

    # regex enrichment: rebuild canonical names such as "CYP3A4" or "EC 1.1.1.1"
    for prefix, rx in ENZYME_REGEXES:
        for m in rx.finditer(t):
            g = (m.group(1) or "").upper()
            if not g:
                continue
            v = f"{prefix}{g}"
            if v not in out:
                out.append(v)

    # normalize P-gp variants
    out = ["P-gp" if x in ("P-GP", "PGP", "PGLYCO") else x for x in out]

    # dedupe, preserving order (case-insensitive)
    seen = set()
    final: List[str] = []
    for x in out:
        k = x.lower()
        if k not in seen:
            seen.add(k)
            final.append(x)
    return final
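
# Illustrative behaviour (example text is ours):
#   detect_enzymes("CYP3A4 and UGT1A1 were induced; EC 1.14.13.97 activity rose", "liver")
# returns ["CYP3A4", "UGT1A1", "EC 1.14.13.97"]: curated organ hits first,
# then regex-recovered names, deduplicated case-insensitively.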


# =============================
# Named pathways (starter lexicon)
# =============================
PATHWAY_TERMS = [
    "oxidative stress",
    "Nrf2",
    "AhR",
    "NF-kB",
    "p53",
    "MAPK",
    "PPAR",
    "apoptosis",
    "DNA damage response",
    "mitochondrial dysfunction",
    "estrogen receptor",
    "androgen receptor",
    "inflammation",
    "cytokine signaling",
]

# each regex is paired with the canonical label it maps to, so morphological
# and typographic variants ("apoptotic", "inflammatory", "NF-κB") are caught
PATHWAY_REGEXES: List[Tuple[re.Pattern, str]] = [
    (re.compile(r"\boxidative stress\b", re.IGNORECASE), "oxidative stress"),
    (re.compile(r"\bNrf2\b", re.IGNORECASE), "Nrf2"),
    (re.compile(r"\bAhR\b", re.IGNORECASE), "AhR"),
    (re.compile(r"\bNF[-\s]?κ?B\b", re.IGNORECASE), "NF-kB"),
    (re.compile(r"\bp53\b", re.IGNORECASE), "p53"),
    (re.compile(r"\bMAPK\b", re.IGNORECASE), "MAPK"),
    (re.compile(r"\bPPAR\b", re.IGNORECASE), "PPAR"),
    (re.compile(r"\bapoptos(?:is|e|ic)\b", re.IGNORECASE), "apoptosis"),
    (re.compile(r"\bDNA damage response\b", re.IGNORECASE), "DNA damage response"),
    (re.compile(r"\bmitochondrial dysfunction\b", re.IGNORECASE), "mitochondrial dysfunction"),
    (re.compile(r"\bestrogen receptor\b", re.IGNORECASE), "estrogen receptor"),
    (re.compile(r"\bandrogen receptor\b", re.IGNORECASE), "androgen receptor"),
    (re.compile(r"\binflammat(?:ion|ory)\b", re.IGNORECASE), "inflammation"),
    (re.compile(r"\bcytokine signaling\b", re.IGNORECASE), "cytokine signaling"),
]


def detect_pathways(text: str) -> List[str]:
    t = text or ""
    out: List[str] = []
    # regex pass: map each matching pattern to its friendly label
    for rx, label in PATHWAY_REGEXES:
        if rx.search(t):
            out.append(label)
    # direct term scan as a backstop for anything the regexes missed
    tl = t.lower()
    for term in PATHWAY_TERMS:
        if term.lower() in tl:
            out.append(term)

    # dedupe, preserving order (case-insensitive)
    seen = set()
    final: List[str] = []
    for x in out:
        k = x.lower()
        if k not in seen:
            seen.add(k)
            final.append(x)
    return final
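
# Illustrative behaviour (example text is ours):
#   detect_pathways("apoptotic cell death downstream of p53 activation")
# returns ["p53", "apoptosis"]: the regex pass catches the "apoptotic"
# variant that a plain scan for the term "apoptosis" would miss.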


# =============================
# PDF utils
# =============================
def extract_pages(pdf_path: str, max_pages: int) -> Tuple[List[Tuple[int, str]], int]:
    reader = PdfReader(pdf_path)
    total = len(reader.pages)
    n = min(total, max_pages)
    pages: List[Tuple[int, str]] = []
    for i in range(n):
        try:
            txt = reader.pages[i].extract_text() or ""
        except Exception:
            txt = ""
        pages.append((i + 1, txt))
    return pages, total


def clean_text(t: str) -> str:
    t = (t or "").replace("\x00", " ")
    t = re.sub(r"\s+", " ", t).strip()
    return t


def is_text_based(pages: List[Tuple[int, str]]) -> bool:
    joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
    return len(joined) >= 200


# =============================
# OpenAI helpers
# =============================
def get_client(api_key: str) -> OpenAI:
    key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise ValueError("Missing OpenAI API key. Provide it here or set OPENAI_API_KEY secret.")
    return OpenAI(api_key=key)


def batched(xs: List[Any], n: int) -> List[List[Any]]:
    return [xs[i:i + n] for i in range(0, len(xs), n)]


def embed_texts(client: OpenAI, model: str, texts: List[str]) -> np.ndarray:
    embs: List[List[float]] = []
    for b in batched(texts, 64):
        resp = client.embeddings.create(model=model, input=b)
        for item in resp.data:
            embs.append(item.embedding)
    arr = np.array(embs, dtype=np.float32)
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
    return arr / norms
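
# Note: embed_texts() L2-normalizes every row, so the ranking step in search()
# below can use a plain dot product (mat @ qemb) as cosine similarity.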


# =============================
# Endpoint detection
# =============================
def detect_endpoints(text: str) -> List[str]:
    t = (text or "").lower()
    found: List[str] = []
    for ep, hints in ENDPOINT_HINTS.items():
        for h in hints:
            if h in t:
                found.append(ep)
                break
    return found


# =============================
# "3–5 lines" of expanded context = 3–5 sentences (PDF line breaks are unreliable)
# =============================
def split_sentences(text: str) -> List[str]:
    t = re.sub(r"\s+", " ", (text or "")).strip()
    if not t:
        return []
    parts = re.split(r"(?<=[\.\?\!])\s+", t)
    return [p.strip() for p in parts if p.strip()]


def expanded_context(page_text: str, query: str, n_sentences: int = 5) -> str:
    sents = split_sentences(page_text)
    if not sents:
        return ""
    q = (query or "").strip().lower()
    if not q:
        return " ".join(sents[:n_sentences])

    qwords = [w for w in re.findall(r"[a-zA-Z0-9\-]+", q) if len(w) >= 3]
    hit_i = None
    for i, s in enumerate(sents):
        sl = s.lower()
        if any(w in sl for w in qwords):
            hit_i = i
            break
    if hit_i is None:
        return " ".join(sents[:n_sentences])

    start = max(0, hit_i - 2)
    end = min(len(sents), hit_i + 3)
    return " ".join(sents[start:end])
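
# Illustrative behaviour: if a query word first matches sentence 4 of a page,
# expanded_context() returns sentences 2 through 6, i.e. the matched sentence
# plus up to two sentences of surrounding context on each side.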


# =============================
# Index state object (stored in gr.State)
# =============================
def empty_index() -> Dict[str, Any]:
    return {
        "papers": [],        # {paper_id, file, organ, pages_indexed, text_based}
        "pages": [],         # {paper_id, file, page, text, endpoints, enzymes, pathways}
        "embeddings": None,  # normalized np.ndarray
        "embedding_model": None,
        "has_embeddings": False,
        "enzymes_vocab": [],
        "pathways_vocab": [],
    }
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def build_index(files, api_key: str, embedding_model: str):
|
| 331 |
+
if not files:
|
| 332 |
+
return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[]), gr.update(choices=[])
|
| 333 |
+
|
| 334 |
+
if len(files) > MAX_PDFS:
|
| 335 |
+
return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[]), gr.update(choices=[])
|
| 336 |
+
|
| 337 |
+
idx = empty_index()
|
| 338 |
+
papers_rows: List[Dict[str, Any]] = []
|
| 339 |
+
page_rows: List[Dict[str, Any]] = []
|
| 340 |
+
|
| 341 |
+
for f in files:
|
| 342 |
+
pdf_path = f.name
|
| 343 |
+
filename = os.path.basename(pdf_path)
|
| 344 |
+
pages, total = extract_pages(pdf_path, MAX_PAGES_PER_PDF)
|
| 345 |
+
text_ok = is_text_based(pages)
|
| 346 |
+
|
| 347 |
+
doc_text = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
|
| 348 |
+
organ = infer_organ_label(doc_text) if text_ok else "unknown"
|
| 349 |
+
|
| 350 |
+
paper_id = filename
|
| 351 |
+
papers_rows.append({
|
| 352 |
+
"paper_id": paper_id,
|
| 353 |
+
"file": filename,
|
| 354 |
+
"organ": organ,
|
| 355 |
+
"pages_indexed": min(total, MAX_PAGES_PER_PDF),
|
| 356 |
+
"text_based": bool(text_ok),
|
| 357 |
+
})
|
| 358 |
+
|
| 359 |
+
if not text_ok:
|
| 360 |
+
continue
|
| 361 |
+
|
| 362 |
+
for pno, raw in pages:
|
| 363 |
+
txt = clean_text(raw)
|
| 364 |
+
if not txt:
|
| 365 |
+
continue
|
| 366 |
+
txt = txt[:MAX_CHARS_PER_PAGE_FOR_INDEX]
|
| 367 |
+
|
| 368 |
+
eps = detect_endpoints(txt)
|
| 369 |
+
enz = detect_enzymes(txt, organ)
|
| 370 |
+
pws = detect_pathways(txt)
|
| 371 |
+
|
| 372 |
+
page_rows.append({
|
| 373 |
+
"paper_id": paper_id,
|
| 374 |
+
"file": filename,
|
| 375 |
+
"page": pno,
|
| 376 |
+
"text": txt,
|
| 377 |
+
"endpoints": eps,
|
| 378 |
+
"enzymes": enz,
|
| 379 |
+
"pathways": pws,
|
| 380 |
+
})
|
| 381 |
+
|
| 382 |
+
idx["papers"] = papers_rows
|
| 383 |
+
idx["pages"] = page_rows
|
| 384 |
+
|
| 385 |
+
papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])
|
| 386 |
+
|
| 387 |
+
# Endpoint × Paper matrix (counts of pages mentioning each endpoint)
|
| 388 |
+
matrix = []
|
| 389 |
+
endpoint_names = list(ENDPOINT_HINTS.keys())
|
| 390 |
+
for p in papers_rows:
|
| 391 |
+
if not p.get("text_based"):
|
| 392 |
+
continue
|
| 393 |
+
pid = p["paper_id"]
|
| 394 |
+
row = {"file": p["file"], "organ": p["organ"]}
|
| 395 |
+
p_pages = [r for r in page_rows if r["paper_id"] == pid]
|
| 396 |
+
for ep in endpoint_names:
|
| 397 |
+
row[ep] = sum(1 for r in p_pages if ep in (r.get("endpoints") or []))
|
| 398 |
+
matrix.append(row)
|
| 399 |
+
endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)
|
| 400 |
+
|
| 401 |
+
# vocab lists for filters (computed at indexing time)
|
| 402 |
+
enzymes_vocab = sorted({e for r in page_rows for e in (r.get("enzymes") or [])})
|
| 403 |
+
pathways_vocab = sorted({p for r in page_rows for p in (r.get("pathways") or [])})
|
| 404 |
+
idx["enzymes_vocab"] = enzymes_vocab
|
| 405 |
+
idx["pathways_vocab"] = pathways_vocab
|
| 406 |
+
|
| 407 |
+
# embeddings
|
| 408 |
+
status = "✅ Indexed pages locally (no embeddings)."
|
| 409 |
+
try:
|
| 410 |
+
client = get_client(api_key)
|
| 411 |
+
texts = [r["text"] for r in page_rows]
|
| 412 |
+
if texts:
|
| 413 |
+
em = embed_texts(client, embedding_model or DEFAULT_EMBEDDING_MODEL, texts)
|
| 414 |
+
idx["embeddings"] = em
|
| 415 |
+
idx["embedding_model"] = embedding_model or DEFAULT_EMBEDDING_MODEL
|
| 416 |
+
idx["has_embeddings"] = True
|
| 417 |
+
status = f"✅ Indexed {len(papers_rows)} paper(s), {len(texts)} page(s). Embeddings built ({idx['embedding_model']})."
|
| 418 |
+
else:
|
| 419 |
+
status = "⚠️ No text pages found to index (text-based PDFs only)."
|
| 420 |
+
except Exception as e:
|
| 421 |
+
status = f"⚠️ Indexed pages, but embeddings unavailable: {e}. You can still run search with fallback ranking."
|
| 422 |
+
|
| 423 |
+
return (
|
| 424 |
+
idx,
|
| 425 |
+
papers_df,
|
| 426 |
+
endpoint_matrix_df,
|
| 427 |
+
status,
|
| 428 |
+
gr.update(choices=[""] + enzymes_vocab, value=""),
|
| 429 |
+
gr.update(choices=[""] + pathways_vocab, value="")
|
| 430 |
+
)
|
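
# Note: the 6-tuple returned above must stay aligned with the outputs= list
# wired to build_btn.click() in build_literature_explorer_tab() below.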


def search(
    query: str,
    idx: Dict[str, Any],
    api_key: str,
    embedding_model: str,
    summary_model: str,
    endpoint_filter: List[str],
    organ_filter: str,
    enzyme_filter: str,
    pathway_filter: str,
    top_k: int,
):
    query = (query or "").strip()
    if not query:
        return pd.DataFrame(), "### Grounded mini-summary\n(type a query)", "### Evidence used\n"

    if not idx or not idx.get("pages"):
        return pd.DataFrame(), "### Grounded mini-summary\n(Build the index first)", "### Evidence used\n"

    pages = idx["pages"]
    papers = {p["paper_id"]: p for p in (idx.get("papers") or [])}

    def passes(r: Dict[str, Any]) -> bool:
        if organ_filter and organ_filter != "any":
            org = (papers.get(r["paper_id"], {}) or {}).get("organ", "unknown")
            if org != organ_filter:
                return False
        if endpoint_filter:
            eps = r.get("endpoints") or []
            if not any(e in eps for e in endpoint_filter):
                return False
        if enzyme_filter:
            enz = r.get("enzymes") or []
            if enzyme_filter not in enz:
                return False
        if pathway_filter:
            pws = r.get("pathways") or []
            if pathway_filter not in pws:
                return False
        return True

    filtered_idx = [i for i, r in enumerate(pages) if passes(r)]
    if not filtered_idx:
        return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"

    ranked: List[Tuple[float, Dict[str, Any]]] = []

    # embeddings path
    if idx.get("has_embeddings") and idx.get("embeddings") is not None:
        try:
            client = get_client(api_key)
            qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
            mat = idx["embeddings"][filtered_idx, :]
            scores = mat @ qemb
            order = np.argsort(scores)[::-1][:max(1, int(top_k))]
            for j in order:
                page_i = filtered_idx[int(j)]
                ranked.append((float(scores[int(j)]), pages[page_i]))
        except Exception:
            ranked = []

    # fallback ranking
    if not ranked:
        qwords = set(w for w in re.findall(r"[a-zA-Z0-9\-]+", query.lower()) if len(w) >= 3)
        tmp = []
        for i in filtered_idx:
            t = (pages[i].get("text") or "").lower()
            hits = sum(1 for w in qwords if w in t)
            tmp.append((hits, pages[i]))
        tmp.sort(key=lambda x: x[0], reverse=True)
        ranked = [(float(h), r) for h, r in tmp[:max(1, int(top_k))]]

    rows = []
    evidence = []
    for score, r in ranked:
        pid = r["paper_id"]
        org = (papers.get(pid, {}) or {}).get("organ", "unknown")
        ctx = expanded_context(r.get("text", ""), query, n_sentences=5)

        rows.append({
            "file": r.get("file", ""),
            "page": r.get("page", ""),
            "score": round(score, 4),
            "organ": org,
            "endpoints": "; ".join(r.get("endpoints") or []),
            "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
            "pathways": "; ".join((r.get("pathways") or [])[:12]),
            "context": ctx,
        })

        snippet = ctx[:360] + ("…" if len(ctx) > 360 else "")
        evidence.append(f"- **{r.get('file', '')}** (p.{r.get('page', '')}): {snippet}")

    results_df = pd.DataFrame(rows, columns=["file", "page", "score", "organ", "endpoints", "enzymes", "pathways", "context"])
    evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])

    # grounded mini-summary
    mini_summary = "(mini-summary unavailable)"
    try:
        client = get_client(api_key)
        payload = [{"file": x["file"], "page": x["page"], "context": x["context"]} for x in rows[:8]]

        system_msg = (
            "You are a literature assistant for toxicology researchers. "
            "Write ONE neutral paragraph that answers the user's query based ONLY on the evidence excerpts. "
            "Cite sources inline as (File p.X). Do not add outside facts."
        )
        user_msg = "USER QUERY:\n" + query + "\n\nEVIDENCE EXCERPTS:\n" + json.dumps(payload, indent=2)
        resp = client.responses.create(
            model=summary_model or DEFAULT_SUMMARY_MODEL,
            input=[{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
        )
        mini_summary = resp.output_text.strip()
    except Exception as e:
        mini_summary = f"(mini-summary unavailable: {e})"

    mini_md = "### Grounded mini-summary\n" + mini_summary
    return results_df, mini_md, evidence_md


# =============================
# Tab plugin (Option A)
# =============================
def build_literature_explorer_tab():
    gr.Markdown(
        "## Literature Explorer (Pilot)\n"
        f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
        "- Text-based PDFs only (not scanned/image PDFs).\n"
        "- Semantic search is page-level; “3–5 lines of context” is approximated as **3–5 sentences**.\n"
    )

    idx_state = gr.State(empty_index())

    with gr.Group():
        files = gr.File(label="Upload PDFs (Explorer only)", file_types=[".pdf"], file_count="multiple")
        with gr.Row():
            api_key = gr.Textbox(label="OpenAI API key (Explorer)", type="password")
            embedding_model = gr.Dropdown(label="Embedding model", choices=["text-embedding-3-small", "text-embedding-3-large"], value=DEFAULT_EMBEDDING_MODEL)
            summary_model = gr.Dropdown(label="Mini-summary model", choices=["gpt-4o-mini", "gpt-4o", "gpt-4o-2024-08-06"], value=DEFAULT_SUMMARY_MODEL)

        build_btn = gr.Button("Build Search Index", variant="primary")
        index_status = gr.Textbox(label="Index status", interactive=False)
        papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
        endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (pages per endpoint per paper)", interactive=False, wrap=True)

    with gr.Group():
        gr.Markdown("### Search across indexed papers")
        query = gr.Textbox(label="Search query", placeholder="e.g., CYP3A4 oxidative stress and genotoxicity", lines=2)

        with gr.Row():
            endpoint_filter = gr.Dropdown(label="Endpoint filter (optional)", choices=list(ENDPOINT_HINTS.keys()), multiselect=True, value=[])
            organ_filter = gr.Dropdown(label="Organ filter (optional)", choices=["any"] + ORGANS, value="any")
            enzyme_filter = gr.Dropdown(label="Enzyme filter (optional)", choices=[""], value="")
            pathway_filter = gr.Dropdown(label="Pathway filter (optional)", choices=[""], value="")

        top_k = gr.Slider(5, 30, value=12, step=1, label="Top results")
        search_btn = gr.Button("Search", variant="secondary")

        mini_summary_md = gr.Markdown()
        results_df = gr.Dataframe(label="Search results (page-level)", interactive=False, wrap=True)
        evidence_md = gr.Markdown()

    build_btn.click(
        fn=build_index,
        inputs=[files, api_key, embedding_model],
        outputs=[idx_state, papers_df, endpoint_matrix_df, index_status, enzyme_filter, pathway_filter],
    )

    search_btn.click(
        fn=search,
        inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
        outputs=[results_df, mini_summary_md, evidence_md],
    )
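

# Standalone launch sketch (an assumption for local testing; the commit itself
# only defines build_literature_explorer_tab() for a host app to mount inside
# its own Blocks layout).
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Tab("Literature Explorer"):
            build_literature_explorer_tab()
    demo.launch()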