hchevva committed on
Commit
a9d60f5
·
verified ·
1 Parent(s): c15c89e

Update literature_explorer.py

Browse files
Files changed (1) hide show
  1. literature_explorer.py +90 -46
literature_explorer.py CHANGED
@@ -1,7 +1,8 @@
1
  import os
2
  import re
3
  import json
4
- from typing import Any, Dict, List, Optional, Tuple
 
5
 
6
  import gradio as gr
7
  import numpy as np
@@ -57,7 +58,6 @@ ORGAN_HINTS: Dict[str, List[str]] = {
57
  "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
58
  }
59
 
60
-
61
  def infer_organ_label(doc_text: str) -> str:
62
  t = (doc_text or "").lower()
63
  scores = {k: 0 for k in ORGAN_HINTS.keys()}
@@ -70,7 +70,6 @@ def infer_organ_label(doc_text: str) -> str:
70
  if not best or best[0][1] == 0:
71
  return "unknown"
72
 
73
- # if 2+ organs are close, label mixed
74
  top_org, top_score = best[0]
75
  if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
76
  return "mixed"
@@ -93,7 +92,6 @@ ENZYMES_BY_ORGAN: Dict[str, List[str]] = {
93
  "unknown": [],
94
  }
95
 
96
- # conservative regex patterns
97
  ENZYME_REGEXES = [
98
  re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE),
99
  re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE),
@@ -115,7 +113,6 @@ def detect_enzymes(text: str, organ: str) -> List[str]:
115
  if e in up:
116
  out.append(e)
117
 
118
- # regex enrich
119
  for rx in ENZYME_REGEXES:
120
  for m in rx.finditer(t):
121
  g = (m.group(1) or "").upper()
@@ -141,7 +138,6 @@ def detect_enzymes(text: str, organ: str) -> List[str]:
141
  x = "P-gp"
142
  out2.append(x)
143
 
144
- # dedupe
145
  seen = set()
146
  final = []
147
  for x in out2:
@@ -172,40 +168,15 @@ PATHWAY_TERMS = [
172
  "cytokine signaling",
173
  ]
174
 
175
- PATHWAY_REGEXES = [
176
- re.compile(r"\boxidative stress\b", re.IGNORECASE),
177
- re.compile(r"\bNrf2\b", re.IGNORECASE),
178
- re.compile(r"\bAhR\b", re.IGNORECASE),
179
- re.compile(r"\bNF[-\s]?κ?B\b", re.IGNORECASE),
180
- re.compile(r"\bp53\b", re.IGNORECASE),
181
- re.compile(r"\bMAPK\b", re.IGNORECASE),
182
- re.compile(r"\bPPAR\b", re.IGNORECASE),
183
- re.compile(r"\bapoptos(?:is|e|ic)\b", re.IGNORECASE),
184
- re.compile(r"\bDNA damage response\b", re.IGNORECASE),
185
- re.compile(r"\bmitochondrial dysfunction\b", re.IGNORECASE),
186
- re.compile(r"\bestrogen receptor\b", re.IGNORECASE),
187
- re.compile(r"\bandrogen receptor\b", re.IGNORECASE),
188
- re.compile(r"\binflammat(?:ion|ory)\b", re.IGNORECASE),
189
- re.compile(r"\bcytokine signaling\b", re.IGNORECASE),
190
- ]
191
-
192
  def detect_pathways(text: str) -> List[str]:
193
  t = text or ""
194
- out = []
195
- for rx in PATHWAY_REGEXES:
196
- if rx.search(t):
197
- # map to friendly labels
198
- # simplest: also do direct term scan afterwards
199
- pass
200
  tl = t.lower()
 
201
  for term in PATHWAY_TERMS:
202
  if term.lower() in tl:
203
  out.append(term)
204
- # ensure NF-kB catch even if κ symbol etc
205
  if re.search(r"\bNF[-\s]?κ?B\b", t, flags=re.IGNORECASE) and "NF-kB" not in out:
206
  out.append("NF-kB")
207
-
208
- # dedupe preserve order
209
  seen = set()
210
  final = []
211
  for x in out:
@@ -241,6 +212,13 @@ def is_text_based(pages: List[Tuple[int, str]]) -> bool:
241
  joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
242
  return len(joined) >= 200
243
 
 
 
 
 
 
 
 
244
 
245
  # =============================
246
  # OpenAI helpers
@@ -280,7 +258,7 @@ def detect_endpoints(text: str) -> List[str]:
280
 
281
 
282
  # =============================
283
- # "3–5 lines" expanded context = 3–5 sentences (PDF lines unreliable)
284
  # =============================
285
  def split_sentences(text: str) -> List[str]:
286
  t = re.sub(r"\s+", " ", (text or "")).strip()
@@ -329,10 +307,10 @@ def empty_index() -> Dict[str, Any]:
329
 
330
  def build_index(files, api_key: str, embedding_model: str):
331
  if not files:
332
- return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[]), gr.update(choices=[])
333
 
334
  if len(files) > MAX_PDFS:
335
- return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[]), gr.update(choices=[])
336
 
337
  idx = empty_index()
338
  papers_rows: List[Dict[str, Any]] = []
@@ -384,17 +362,18 @@ def build_index(files, api_key: str, embedding_model: str):
384
 
385
  papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])
386
 
387
- # Endpoint × Paper matrix (counts of pages mentioning each endpoint)
388
- matrix = []
389
  endpoint_names = list(ENDPOINT_HINTS.keys())
 
390
  for p in papers_rows:
391
  if not p.get("text_based"):
392
  continue
393
  pid = p["paper_id"]
394
- row = {"file": p["file"], "organ": p["organ"]}
395
  p_pages = [r for r in page_rows if r["paper_id"] == pid]
 
396
  for ep in endpoint_names:
397
- row[ep] = sum(1 for r in p_pages if ep in (r.get("endpoints") or []))
 
398
  matrix.append(row)
399
  endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)
400
 
@@ -508,6 +487,10 @@ def search(
508
  pid = r["paper_id"]
509
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
510
  ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
 
 
 
 
511
 
512
  rows.append({
513
  "file": r.get("file",""),
@@ -517,20 +500,21 @@ def search(
517
  "endpoints": "; ".join(r.get("endpoints") or []),
518
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
519
  "pathways": "; ".join((r.get("pathways") or [])[:12]),
520
- "context": ctx
521
  })
522
 
523
- snippet = ctx[:360] + ("…" if len(ctx) > 360 else "")
524
  evidence.append(f"- **{r.get('file','')}** (p.{r.get('page','')}): {snippet}")
525
 
526
- results_df = pd.DataFrame(rows, columns=["file","page","score","organ","endpoints","enzymes","pathways","context"])
 
527
  evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])
528
 
529
  # grounded mini-summary
530
  mini_summary = "(mini-summary unavailable)"
531
  try:
532
  client = get_client(api_key)
533
- payload = [{"file": x["file"], "page": x["page"], "context": x["context"]} for x in rows[:8]]
534
 
535
  system_msg = (
536
  "You are a literature assistant for toxicology researchers. "
@@ -550,6 +534,38 @@ def search(
550
  return results_df, mini_md, evidence_md
551
 
552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  # =============================
554
  # Tab plugin (Option A)
555
  # =============================
@@ -558,7 +574,7 @@ def build_literature_explorer_tab():
558
  "## Literature Explorer (Pilot)\n"
559
  f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
560
  "- Text-based PDFs only (not scanned/image PDFs).\n"
561
- "- Semantic search is page-level; “3–5 lines context” is approximated as **3–5 sentences**.\n"
562
  )
563
 
564
  idx_state = gr.State(empty_index())
@@ -573,7 +589,9 @@ def build_literature_explorer_tab():
573
  build_btn = gr.Button("Build Search Index", variant="primary")
574
  index_status = gr.Textbox(label="Index status", interactive=False)
575
  papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
576
- endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (pages per endpoint per paper)", interactive=False, wrap=True)
 
 
577
 
578
  with gr.Group():
579
  gr.Markdown("### Search across indexed papers")
@@ -589,7 +607,21 @@ def build_literature_explorer_tab():
589
  search_btn = gr.Button("Search", variant="secondary")
590
 
591
  mini_summary_md = gr.Markdown()
592
- results_df = gr.Dataframe(label="Search results (page-level)", interactive=False, wrap=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  evidence_md = gr.Markdown()
594
 
595
  build_btn.click(
@@ -602,4 +634,16 @@ def build_literature_explorer_tab():
602
  fn=search,
603
  inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
604
  outputs=[results_df, mini_summary_md, evidence_md]
 
 
 
 
 
 
 
 
 
 
 
 
605
  )
 
1
  import os
2
  import re
3
  import json
4
+ import textwrap
5
+ from typing import Any, Dict, List, Tuple
6
 
7
  import gradio as gr
8
  import numpy as np
 
58
  "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
59
  }
60
 
 
61
  def infer_organ_label(doc_text: str) -> str:
62
  t = (doc_text or "").lower()
63
  scores = {k: 0 for k in ORGAN_HINTS.keys()}
 
70
  if not best or best[0][1] == 0:
71
  return "unknown"
72
 
 
73
  top_org, top_score = best[0]
74
  if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
75
  return "mixed"
 
92
  "unknown": [],
93
  }
94
 
 
95
  ENZYME_REGEXES = [
96
  re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE),
97
  re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE),
 
113
  if e in up:
114
  out.append(e)
115
 
 
116
  for rx in ENZYME_REGEXES:
117
  for m in rx.finditer(t):
118
  g = (m.group(1) or "").upper()
 
138
  x = "P-gp"
139
  out2.append(x)
140
 
 
141
  seen = set()
142
  final = []
143
  for x in out2:
 
168
  "cytokine signaling",
169
  ]
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def detect_pathways(text: str) -> List[str]:
172
  t = text or ""
 
 
 
 
 
 
173
  tl = t.lower()
174
+ out = []
175
  for term in PATHWAY_TERMS:
176
  if term.lower() in tl:
177
  out.append(term)
 
178
  if re.search(r"\bNF[-\s]?κ?B\b", t, flags=re.IGNORECASE) and "NF-kB" not in out:
179
  out.append("NF-kB")
 
 
180
  seen = set()
181
  final = []
182
  for x in out:
 
212
  joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
213
  return len(joined) >= 200
214
 
215
def hard_wrap(s: str, width: int = 110) -> str:
    """Wrap every non-blank line of *s* to at most *width* characters.

    Blank lines are dropped entirely; overly long words and hyphenated
    words may be broken mid-token so no output line exceeds *width*.
    Returns "" for None / empty / whitespace-only input.
    """
    text = (s or "").strip()
    if not text:
        return ""
    wrapped = [
        textwrap.fill(ln, width=width, break_long_words=True, break_on_hyphens=True)
        for ln in text.splitlines()
        if ln.strip()
    ]
    return "\n".join(wrapped)
221
+
222
 
223
  # =============================
224
  # OpenAI helpers
 
258
 
259
 
260
  # =============================
261
+ # Expanded context = 3–5 sentences (PDF lines unreliable)
262
  # =============================
263
  def split_sentences(text: str) -> List[str]:
264
  t = re.sub(r"\s+", " ", (text or "")).strip()
 
307
 
308
  def build_index(files, api_key: str, embedding_model: str):
309
  if not files:
310
+ return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[""], value=""), gr.update(choices=[""], value="")
311
 
312
  if len(files) > MAX_PDFS:
313
+ return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[""], value=""), gr.update(choices=[""], value="")
314
 
315
  idx = empty_index()
316
  papers_rows: List[Dict[str, Any]] = []
 
362
 
363
  papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])
364
 
365
+ # ✅ Endpoint correlation: present/absent per paper (cleaner)
 
366
  endpoint_names = list(ENDPOINT_HINTS.keys())
367
+ matrix = []
368
  for p in papers_rows:
369
  if not p.get("text_based"):
370
  continue
371
  pid = p["paper_id"]
 
372
  p_pages = [r for r in page_rows if r["paper_id"] == pid]
373
+ row = {"file": p["file"], "organ": p["organ"]}
374
  for ep in endpoint_names:
375
+ present = any(ep in (r.get("endpoints") or []) for r in p_pages)
376
+ row[ep] = "present" if present else ""
377
  matrix.append(row)
378
  endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)
379
 
 
487
  pid = r["paper_id"]
488
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
489
  ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
490
+ ctx_wrapped = hard_wrap(ctx, width=110)
491
+
492
+ preview = ctx.strip()
493
+ preview = (preview[:220] + "…") if len(preview) > 220 else preview
494
 
495
  rows.append({
496
  "file": r.get("file",""),
 
500
  "endpoints": "; ".join(r.get("endpoints") or []),
501
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
502
  "pathways": "; ".join((r.get("pathways") or [])[:12]),
503
+ "preview": preview,
504
  })
505
 
506
+ snippet = (ctx_wrapped.replace("\n", " ")[:360] + "…") if len(ctx_wrapped) > 360 else ctx_wrapped.replace("\n", " ")
507
  evidence.append(f"- **{r.get('file','')}** (p.{r.get('page','')}): {snippet}")
508
 
509
+ # Compact table (no long context column)
510
+ results_df = pd.DataFrame(rows, columns=["file","page","score","organ","endpoints","enzymes","pathways","preview"])
511
  evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])
512
 
513
  # grounded mini-summary
514
  mini_summary = "(mini-summary unavailable)"
515
  try:
516
  client = get_client(api_key)
517
+ payload = [{"file": x["file"], "page": x["page"], "preview": x["preview"]} for x in rows[:8]]
518
 
519
  system_msg = (
520
  "You are a literature assistant for toxicology researchers. "
 
534
  return results_df, mini_md, evidence_md
535
 
536
 
537
def on_select_result(df: pd.DataFrame, idx: dict, query: str, evt: gr.SelectData):
    """Handle a row-click in the search-results Dataframe.

    Looks the clicked row's (file, page) pair up in the index state and
    returns a 4-tuple feeding the detail widgets:
    (meta markdown, citation string, 3–5-sentence context, full page text).
    Empty strings are returned when there is nothing selected to show.
    """
    if df is None or df.empty:
        return "", "", "", ""

    # evt.index may be (row, col) or int depending on gradio version
    row_i = evt.index[0] if isinstance(evt.index, (list, tuple)) else int(evt.index)

    r = df.iloc[int(row_i)]
    file = str(r.get("file", ""))
    page = int(r.get("page", 0))
    citation = f"{file} p.{page}"

    # Locate the indexed page record matching this row's (file, page).
    # NOTE(review): assumes idx["pages"] records carry "file"/"page"/"text"
    # keys — confirm against build_index, which is not fully visible here.
    rec = next((x for x in (idx.get("pages", []) or []) if x.get("file")==file and int(x.get("page",0))==page), None)
    if not rec:
        # Row exists in the table but its page text is missing from state.
        meta = f"**{citation}**"
        return meta, citation, "(page text not found)", ""

    # Re-derive the query-focused context from the stored page text,
    # then hard-wrap both context and full text for the read-only boxes.
    ctx = expanded_context(rec.get("text",""), query, n_sentences=5)
    ctx = hard_wrap(ctx, width=110)
    full_txt = hard_wrap(rec.get("text",""), width=110)

    meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
    return meta, citation, ctx, full_txt
560
+
561
+
562
def citation_ready(citation: str):
    """Return the status message shown after the copy-citation button.

    Prompts the user to select a row when *citation* is empty or
    whitespace-only; otherwise confirms the citation is ready to copy.
    """
    text = (citation or "").strip()
    if text:
        return f"✅ Citation ready: {text} (copy from the box above)"
    return "Select a result row first."
567
+
568
+
569
  # =============================
570
  # Tab plugin (Option A)
571
  # =============================
 
574
  "## Literature Explorer (Pilot)\n"
575
  f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
576
  "- Text-based PDFs only (not scanned/image PDFs).\n"
577
+ "- Search is **page-level**; “3–5 lines” is approximated as **3–5 sentences**.\n"
578
  )
579
 
580
  idx_state = gr.State(empty_index())
 
589
  build_btn = gr.Button("Build Search Index", variant="primary")
590
  index_status = gr.Textbox(label="Index status", interactive=False)
591
  papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
592
+
593
+ # ✅ Table 2 now present/absent per paper
594
+ endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (present/absent per paper)", interactive=False, wrap=True)
595
 
596
  with gr.Group():
597
  gr.Markdown("### Search across indexed papers")
 
607
  search_btn = gr.Button("Search", variant="secondary")
608
 
609
  mini_summary_md = gr.Markdown()
610
+
611
+ # ✅ Table 3 compact (no long context)
612
+ results_df = gr.Dataframe(label="Search results (compact, page-level)", interactive=False, wrap=True)
613
+
614
+ # ✅ Selected result viewer (context moved out of table)
615
+ selected_meta = gr.Markdown()
616
+ citation_box = gr.Textbox(label="Citation (copy/paste)", interactive=False)
617
+ copy_btn = gr.Button("Copy citation (fills box)", variant="secondary")
618
+ copy_status = gr.Textbox(label="Copy status", interactive=False)
619
+
620
+ selected_context = gr.Textbox(label="Selected result context (3–5 sentences)", lines=6, interactive=False)
621
+
622
+ with gr.Accordion("Full page text (optional)", open=False):
623
+ full_page_text = gr.Textbox(label="Full page text", lines=14, interactive=False)
624
+
625
  evidence_md = gr.Markdown()
626
 
627
  build_btn.click(
 
634
  fn=search,
635
  inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
636
  outputs=[results_df, mini_summary_md, evidence_md]
637
+ )
638
+
639
+ results_df.select(
640
+ fn=on_select_result,
641
+ inputs=[results_df, idx_state, query],
642
+ outputs=[selected_meta, citation_box, selected_context, full_page_text]
643
+ )
644
+
645
+ copy_btn.click(
646
+ fn=citation_ready,
647
+ inputs=[citation_box],
648
+ outputs=[copy_status]
649
  )