hchevva committed on
Commit
a9d60f5
·
verified ·
1 Parent(s): c15c89e

Update literature_explorer.py

Browse files
Files changed (1) hide show
  1. literature_explorer.py +90 -46
literature_explorer.py CHANGED
@@ -1,7 +1,8 @@
1
  import os
2
  import re
3
  import json
4
- from typing import Any, Dict, List, Optional, Tuple
 
5
 
6
  import gradio as gr
7
  import numpy as np
@@ -57,7 +58,6 @@ ORGAN_HINTS: Dict[str, List[str]] = {
57
  "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
58
  }
59
 
60
-
61
  def infer_organ_label(doc_text: str) -> str:
62
  t = (doc_text or "").lower()
63
  scores = {k: 0 for k in ORGAN_HINTS.keys()}
@@ -70,7 +70,6 @@ def infer_organ_label(doc_text: str) -> str:
70
  if not best or best[0][1] == 0:
71
  return "unknown"
72
 
73
- # if 2+ organs are close, label mixed
74
  top_org, top_score = best[0]
75
  if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
76
  return "mixed"
@@ -93,7 +92,6 @@ ENZYMES_BY_ORGAN: Dict[str, List[str]] = {
93
  "unknown": [],
94
  }
95
 
96
- # conservative regex patterns
97
  ENZYME_REGEXES = [
98
  re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE),
99
  re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE),
@@ -115,7 +113,6 @@ def detect_enzymes(text: str, organ: str) -> List[str]:
115
  if e in up:
116
  out.append(e)
117
 
118
- # regex enrich
119
  for rx in ENZYME_REGEXES:
120
  for m in rx.finditer(t):
121
  g = (m.group(1) or "").upper()
@@ -141,7 +138,6 @@ def detect_enzymes(text: str, organ: str) -> List[str]:
141
  x = "P-gp"
142
  out2.append(x)
143
 
144
- # dedupe
145
  seen = set()
146
  final = []
147
  for x in out2:
@@ -172,40 +168,15 @@ PATHWAY_TERMS = [
172
  "cytokine signaling",
173
  ]
174
 
175
- PATHWAY_REGEXES = [
176
- re.compile(r"\boxidative stress\b", re.IGNORECASE),
177
- re.compile(r"\bNrf2\b", re.IGNORECASE),
178
- re.compile(r"\bAhR\b", re.IGNORECASE),
179
- re.compile(r"\bNF[-\s]?κ?B\b", re.IGNORECASE),
180
- re.compile(r"\bp53\b", re.IGNORECASE),
181
- re.compile(r"\bMAPK\b", re.IGNORECASE),
182
- re.compile(r"\bPPAR\b", re.IGNORECASE),
183
- re.compile(r"\bapoptos(?:is|e|ic)\b", re.IGNORECASE),
184
- re.compile(r"\bDNA damage response\b", re.IGNORECASE),
185
- re.compile(r"\bmitochondrial dysfunction\b", re.IGNORECASE),
186
- re.compile(r"\bestrogen receptor\b", re.IGNORECASE),
187
- re.compile(r"\bandrogen receptor\b", re.IGNORECASE),
188
- re.compile(r"\binflammat(?:ion|ory)\b", re.IGNORECASE),
189
- re.compile(r"\bcytokine signaling\b", re.IGNORECASE),
190
- ]
191
-
192
  def detect_pathways(text: str) -> List[str]:
193
  t = text or ""
194
- out = []
195
- for rx in PATHWAY_REGEXES:
196
- if rx.search(t):
197
- # map to friendly labels
198
- # simplest: also do direct term scan afterwards
199
- pass
200
  tl = t.lower()
 
201
  for term in PATHWAY_TERMS:
202
  if term.lower() in tl:
203
  out.append(term)
204
- # ensure NF-kB catch even if κ symbol etc
205
  if re.search(r"\bNF[-\s]?κ?B\b", t, flags=re.IGNORECASE) and "NF-kB" not in out:
206
  out.append("NF-kB")
207
-
208
- # dedupe preserve order
209
  seen = set()
210
  final = []
211
  for x in out:
@@ -241,6 +212,13 @@ def is_text_based(pages: List[Tuple[int, str]]) -> bool:
241
  joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
242
  return len(joined) >= 200
243
 
 
 
 
 
 
 
 
244
 
245
  # =============================
246
  # OpenAI helpers
@@ -280,7 +258,7 @@ def detect_endpoints(text: str) -> List[str]:
280
 
281
 
282
  # =============================
283
- # "3–5 lines" expanded context = 3–5 sentences (PDF lines unreliable)
284
  # =============================
285
  def split_sentences(text: str) -> List[str]:
286
  t = re.sub(r"\s+", " ", (text or "")).strip()
@@ -329,10 +307,10 @@ def empty_index() -> Dict[str, Any]:
329
 
330
  def build_index(files, api_key: str, embedding_model: str):
331
  if not files:
332
- return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[]), gr.update(choices=[])
333
 
334
  if len(files) > MAX_PDFS:
335
- return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[]), gr.update(choices=[])
336
 
337
  idx = empty_index()
338
  papers_rows: List[Dict[str, Any]] = []
@@ -384,17 +362,18 @@ def build_index(files, api_key: str, embedding_model: str):
384
 
385
  papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])
386
 
387
- # Endpoint × Paper matrix (counts of pages mentioning each endpoint)
388
- matrix = []
389
  endpoint_names = list(ENDPOINT_HINTS.keys())
 
390
  for p in papers_rows:
391
  if not p.get("text_based"):
392
  continue
393
  pid = p["paper_id"]
394
- row = {"file": p["file"], "organ": p["organ"]}
395
  p_pages = [r for r in page_rows if r["paper_id"] == pid]
 
396
  for ep in endpoint_names:
397
- row[ep] = sum(1 for r in p_pages if ep in (r.get("endpoints") or []))
 
398
  matrix.append(row)
399
  endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)
400
 
@@ -508,6 +487,10 @@ def search(
508
  pid = r["paper_id"]
509
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
510
  ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
 
 
 
 
511
 
512
  rows.append({
513
  "file": r.get("file",""),
@@ -517,20 +500,21 @@ def search(
517
  "endpoints": "; ".join(r.get("endpoints") or []),
518
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
519
  "pathways": "; ".join((r.get("pathways") or [])[:12]),
520
- "context": ctx
521
  })
522
 
523
- snippet = ctx[:360] + ("…" if len(ctx) > 360 else "")
524
  evidence.append(f"- **{r.get('file','')}** (p.{r.get('page','')}): {snippet}")
525
 
526
- results_df = pd.DataFrame(rows, columns=["file","page","score","organ","endpoints","enzymes","pathways","context"])
 
527
  evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])
528
 
529
  # grounded mini-summary
530
  mini_summary = "(mini-summary unavailable)"
531
  try:
532
  client = get_client(api_key)
533
- payload = [{"file": x["file"], "page": x["page"], "context": x["context"]} for x in rows[:8]]
534
 
535
  system_msg = (
536
  "You are a literature assistant for toxicology researchers. "
@@ -550,6 +534,38 @@ def search(
550
  return results_df, mini_md, evidence_md
551
 
552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  # =============================
554
  # Tab plugin (Option A)
555
  # =============================
@@ -558,7 +574,7 @@ def build_literature_explorer_tab():
558
  "## Literature Explorer (Pilot)\n"
559
  f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
560
  "- Text-based PDFs only (not scanned/image PDFs).\n"
561
- "- Semantic search is page-level; “3–5 lines context” is approximated as **3–5 sentences**.\n"
562
  )
563
 
564
  idx_state = gr.State(empty_index())
@@ -573,7 +589,9 @@ def build_literature_explorer_tab():
573
  build_btn = gr.Button("Build Search Index", variant="primary")
574
  index_status = gr.Textbox(label="Index status", interactive=False)
575
  papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
576
- endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (pages per endpoint per paper)", interactive=False, wrap=True)
 
 
577
 
578
  with gr.Group():
579
  gr.Markdown("### Search across indexed papers")
@@ -589,7 +607,21 @@ def build_literature_explorer_tab():
589
  search_btn = gr.Button("Search", variant="secondary")
590
 
591
  mini_summary_md = gr.Markdown()
592
- results_df = gr.Dataframe(label="Search results (page-level)", interactive=False, wrap=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  evidence_md = gr.Markdown()
594
 
595
  build_btn.click(
@@ -602,4 +634,16 @@ def build_literature_explorer_tab():
602
  fn=search,
603
  inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
604
  outputs=[results_df, mini_summary_md, evidence_md]
 
 
 
 
 
 
 
 
 
 
 
 
605
  )
 
1
  import os
2
  import re
3
  import json
4
+ import textwrap
5
+ from typing import Any, Dict, List, Tuple
6
 
7
  import gradio as gr
8
  import numpy as np
 
58
  "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
59
  }
60
 
 
61
  def infer_organ_label(doc_text: str) -> str:
62
  t = (doc_text or "").lower()
63
  scores = {k: 0 for k in ORGAN_HINTS.keys()}
 
70
  if not best or best[0][1] == 0:
71
  return "unknown"
72
 
 
73
  top_org, top_score = best[0]
74
  if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
75
  return "mixed"
 
92
  "unknown": [],
93
  }
94
 
 
95
  ENZYME_REGEXES = [
96
  re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE),
97
  re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE),
 
113
  if e in up:
114
  out.append(e)
115
 
 
116
  for rx in ENZYME_REGEXES:
117
  for m in rx.finditer(t):
118
  g = (m.group(1) or "").upper()
 
138
  x = "P-gp"
139
  out2.append(x)
140
 
 
141
  seen = set()
142
  final = []
143
  for x in out2:
 
168
  "cytokine signaling",
169
  ]
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def detect_pathways(text: str) -> List[str]:
172
  t = text or ""
 
 
 
 
 
 
173
  tl = t.lower()
174
+ out = []
175
  for term in PATHWAY_TERMS:
176
  if term.lower() in tl:
177
  out.append(term)
 
178
  if re.search(r"\bNF[-\s]?κ?B\b", t, flags=re.IGNORECASE) and "NF-kB" not in out:
179
  out.append("NF-kB")
 
 
180
  seen = set()
181
  final = []
182
  for x in out:
 
212
  joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
213
  return len(joined) >= 200
214
 
215
def hard_wrap(s: str, width: int = 110) -> str:
    """Wrap every non-blank line of *s* to at most *width* characters.

    Blank lines are dropped entirely; overly long words and hyphenated
    words may be broken mid-token so no output line exceeds *width*.
    Returns "" for None / empty / whitespace-only input.
    """
    text = (s or "").strip()
    if not text:
        return ""
    wrapped = [
        textwrap.fill(ln, width=width, break_long_words=True, break_on_hyphens=True)
        for ln in text.splitlines()
        if ln.strip()
    ]
    return "\n".join(wrapped)
221
+
222
 
223
  # =============================
224
  # OpenAI helpers
 
258
 
259
 
260
  # =============================
261
+ # Expanded context = 3–5 sentences (PDF lines unreliable)
262
  # =============================
263
  def split_sentences(text: str) -> List[str]:
264
  t = re.sub(r"\s+", " ", (text or "")).strip()
 
307
 
308
  def build_index(files, api_key: str, embedding_model: str):
309
  if not files:
310
+ return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[""], value=""), gr.update(choices=[""], value="")
311
 
312
  if len(files) > MAX_PDFS:
313
+ return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[""], value=""), gr.update(choices=[""], value="")
314
 
315
  idx = empty_index()
316
  papers_rows: List[Dict[str, Any]] = []
 
362
 
363
  papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])
364
 
365
+ # ✅ Endpoint correlation: present/absent per paper (cleaner)
 
366
  endpoint_names = list(ENDPOINT_HINTS.keys())
367
+ matrix = []
368
  for p in papers_rows:
369
  if not p.get("text_based"):
370
  continue
371
  pid = p["paper_id"]
 
372
  p_pages = [r for r in page_rows if r["paper_id"] == pid]
373
+ row = {"file": p["file"], "organ": p["organ"]}
374
  for ep in endpoint_names:
375
+ present = any(ep in (r.get("endpoints") or []) for r in p_pages)
376
+ row[ep] = "present" if present else ""
377
  matrix.append(row)
378
  endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)
379
 
 
487
  pid = r["paper_id"]
488
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
489
  ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
490
+ ctx_wrapped = hard_wrap(ctx, width=110)
491
+
492
+ preview = ctx.strip()
493
+ preview = (preview[:220] + "…") if len(preview) > 220 else preview
494
 
495
  rows.append({
496
  "file": r.get("file",""),
 
500
  "endpoints": "; ".join(r.get("endpoints") or []),
501
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
502
  "pathways": "; ".join((r.get("pathways") or [])[:12]),
503
+ "preview": preview,
504
  })
505
 
506
+ snippet = (ctx_wrapped.replace("\n", " ")[:360] + "…") if len(ctx_wrapped) > 360 else ctx_wrapped.replace("\n", " ")
507
  evidence.append(f"- **{r.get('file','')}** (p.{r.get('page','')}): {snippet}")
508
 
509
+ # Compact table (no long context column)
510
+ results_df = pd.DataFrame(rows, columns=["file","page","score","organ","endpoints","enzymes","pathways","preview"])
511
  evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])
512
 
513
  # grounded mini-summary
514
  mini_summary = "(mini-summary unavailable)"
515
  try:
516
  client = get_client(api_key)
517
+ payload = [{"file": x["file"], "page": x["page"], "preview": x["preview"]} for x in rows[:8]]
518
 
519
  system_msg = (
520
  "You are a literature assistant for toxicology researchers. "
 
534
  return results_df, mini_md, evidence_md
535
 
536
 
537
def on_select_result(df: pd.DataFrame, idx: dict, query: str, evt: gr.SelectData):
    """Handle a row-click in the search-results Dataframe.

    Looks the clicked row's (file, page) pair up in the index state and
    returns a 4-tuple feeding the detail widgets:
    (meta markdown, citation string, 3–5-sentence context, full page text).
    Empty strings are returned when there is nothing selected to show.
    """
    if df is None or df.empty:
        return "", "", "", ""

    # evt.index may be (row, col) or int depending on gradio version
    row_i = evt.index[0] if isinstance(evt.index, (list, tuple)) else int(evt.index)

    r = df.iloc[int(row_i)]
    file = str(r.get("file", ""))
    page = int(r.get("page", 0))
    citation = f"{file} p.{page}"

    # Locate the indexed page record matching this row's (file, page).
    # NOTE(review): assumes idx["pages"] records carry "file"/"page"/"text"
    # keys — confirm against build_index, which is not fully visible here.
    rec = next((x for x in (idx.get("pages", []) or []) if x.get("file")==file and int(x.get("page",0))==page), None)
    if not rec:
        # Row exists in the table but its page text is missing from state.
        meta = f"**{citation}**"
        return meta, citation, "(page text not found)", ""

    # Re-derive the query-focused context from the stored page text,
    # then hard-wrap both context and full text for the read-only boxes.
    ctx = expanded_context(rec.get("text",""), query, n_sentences=5)
    ctx = hard_wrap(ctx, width=110)
    full_txt = hard_wrap(rec.get("text",""), width=110)

    meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
    return meta, citation, ctx, full_txt
560
+
561
+
562
def citation_ready(citation: str):
    """Return the status message shown after the copy-citation button.

    Prompts the user to select a row when *citation* is empty or
    whitespace-only; otherwise confirms the citation is ready to copy.
    """
    text = (citation or "").strip()
    if text:
        return f"✅ Citation ready: {text} (copy from the box above)"
    return "Select a result row first."
567
+
568
+
569
  # =============================
570
  # Tab plugin (Option A)
571
  # =============================
 
574
  "## Literature Explorer (Pilot)\n"
575
  f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
576
  "- Text-based PDFs only (not scanned/image PDFs).\n"
577
+ "- Search is **page-level**; “3–5 lines” is approximated as **3–5 sentences**.\n"
578
  )
579
 
580
  idx_state = gr.State(empty_index())
 
589
  build_btn = gr.Button("Build Search Index", variant="primary")
590
  index_status = gr.Textbox(label="Index status", interactive=False)
591
  papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
592
+
593
+ # ✅ Table 2 now present/absent per paper
594
+ endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (present/absent per paper)", interactive=False, wrap=True)
595
 
596
  with gr.Group():
597
  gr.Markdown("### Search across indexed papers")
 
607
  search_btn = gr.Button("Search", variant="secondary")
608
 
609
  mini_summary_md = gr.Markdown()
610
+
611
+ # ✅ Table 3 compact (no long context)
612
+ results_df = gr.Dataframe(label="Search results (compact, page-level)", interactive=False, wrap=True)
613
+
614
+ # ✅ Selected result viewer (context moved out of table)
615
+ selected_meta = gr.Markdown()
616
+ citation_box = gr.Textbox(label="Citation (copy/paste)", interactive=False)
617
+ copy_btn = gr.Button("Copy citation (fills box)", variant="secondary")
618
+ copy_status = gr.Textbox(label="Copy status", interactive=False)
619
+
620
+ selected_context = gr.Textbox(label="Selected result context (3–5 sentences)", lines=6, interactive=False)
621
+
622
+ with gr.Accordion("Full page text (optional)", open=False):
623
+ full_page_text = gr.Textbox(label="Full page text", lines=14, interactive=False)
624
+
625
  evidence_md = gr.Markdown()
626
 
627
  build_btn.click(
 
634
  fn=search,
635
  inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
636
  outputs=[results_df, mini_summary_md, evidence_md]
637
+ )
638
+
639
+ results_df.select(
640
+ fn=on_select_result,
641
+ inputs=[results_df, idx_state, query],
642
+ outputs=[selected_meta, citation_box, selected_context, full_page_text]
643
+ )
644
+
645
+ copy_btn.click(
646
+ fn=citation_ready,
647
+ inputs=[citation_box],
648
+ outputs=[copy_status]
649
  )