hchevva commited on
Commit
8f914d5
·
verified ·
1 Parent(s): f6221d9

Create literature_explorer.py

Browse files
Files changed (1) hide show
  1. literature_explorer.py +605 -0
literature_explorer.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+ from pypdf import PdfReader
10
+ from openai import OpenAI
11
+
12
+
13
+ # =============================
14
+ # Pilot limits
15
+ # =============================
16
# Maximum number of PDFs build_index accepts in one run; larger uploads are rejected.
MAX_PDFS = 5
# Only the first MAX_PAGES_PER_PDF pages of each PDF are extracted.
MAX_PAGES_PER_PDF = 20

# Each cleaned page is truncated to this many characters before tagging and
# embedding (cap for cost/stability).
MAX_CHARS_PER_PAGE_FOR_INDEX = 7000
# Fallback model names used when the caller passes an empty model value.
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_SUMMARY_MODEL = "gpt-4o-mini"
22
+
23
+
24
+ # =============================
25
+ # Endpoint fallback inference lexicon (Explorer-only)
26
+ # =============================
27
# Maps an endpoint label (also used as a filter choice and as a column of the
# endpoint-by-paper matrix) to lowercase hint strings. detect_endpoints tags a
# page with the label when any of its hints occurs in the lowercased page text.
ENDPOINT_HINTS: Dict[str, List[str]] = {
    "Genotoxicity (OECD TG)": [
        "genotoxic", "mutagen", "clastogen", "ames", "micronucleus", "comet assay",
        "chromosomal aberration", "dna damage", "oecd tg 471", "tg471", "oecd tg 473", "tg473",
        "oecd tg 476", "tg476", "oecd tg 487", "tg487", "oecd tg 490", "tg490",
        "oecd tg 474", "tg474", "oecd tg 475", "tg475", "oecd tg 488", "tg488",
        "oecd tg 489", "tg489"
    ],
    "NAMs / In Silico": ["in silico", "qsar", "read-across", "aop", "pbpk", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
    "Acute toxicity": ["acute toxicity", "ld50", "lc50", "single dose", "mortality", "lethality"],
    "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "noael", "loael", "28-day", "90-day", "target organ"],
    "Irritation / Sensitization": ["skin irritation", "eye irritation", "draize", "sensitization", "llna", "patch test"],
    "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogen", "prenatal", "postnatal"],
    "Carcinogenicity": ["carcinogenic", "tumor", "neoplasm", "cancer", "two-year", "bioassay"],
}
42
+
43
+
44
+ # =============================
45
+ # Organ inference (automatic only)
46
+ # =============================
47
# Organ labels offered by the organ filter; "mixed" and "unknown" are only
# ever produced by infer_organ_label, they have no hint list of their own.
ORGANS = ["liver", "lung", "kidney", "skin", "gi", "cns", "reproductive", "immune_blood", "mixed", "unknown"]

# Lowercase hint strings per organ. Longer entries are word stems and are
# matched as substrings; entries of four characters or fewer are matched as
# whole words only (see _organ_hint_present).
ORGAN_HINTS: Dict[str, List[str]] = {
    "liver": ["liver", "hepatic", "hepatocyte", "hepatotoxic", "bile", "cholest", "alt", "ast"],
    "lung": ["lung", "pulmonary", "bronch", "alveol", "airway", "inhalation", "respiratory"],
    "kidney": ["kidney", "renal", "nephro", "glomerul", "tubul", "creatinine", "bun"],
    "skin": ["skin", "dermal", "epiderm", "cutaneous", "topical"],
    "gi": ["gastro", "intestinal", "gut", "colon", "stomach", "oral", "ingestion"],
    "cns": ["brain", "cns", "neuro", "neuronal", "glia", "blood-brain", "dopamin", "seroton"],
    "reproductive": ["repro", "testis", "ovary", "uterus", "placent", "fetus", "embryo", "sperm", "oocyte"],
    "immune_blood": ["immune", "cytok", "inflamm", "blood", "plasma", "serum", "hemat", "lymph", "macrophage"],
}


def _organ_hint_present(hint: str, lowered_text: str) -> bool:
    """Return True when *hint* occurs in *lowered_text*.

    Hints of four characters or fewer ("alt", "ast", "bun", "oral", ...) must
    appear as a whole word (an optional plural "s" is tolerated); as plain
    substrings they fire inside unrelated words such as "although", "last",
    "abundant" or "temporal". Longer hints are stems and keep substring
    matching so that e.g. "hepatotoxic" still matches "hepatotoxicity".
    """
    if len(hint) <= 4:
        return re.search(rf"\b{re.escape(hint)}s?\b", lowered_text) is not None
    return hint in lowered_text


def infer_organ_label(doc_text: str) -> str:
    """Guess the organ a document is about from keyword hints.

    Each organ scores one point per distinct hint found in the lowercased
    text (presence, not frequency).

    Returns:
        "unknown" when no hint matches, "mixed" when the runner-up organ has
        a non-zero score within one point of the leader, otherwise the
        top-scoring organ key of ORGAN_HINTS.
    """
    t = (doc_text or "").lower()
    scores = {
        organ: sum(1 for h in hints if _organ_hint_present(h, t))
        for organ, hints in ORGAN_HINTS.items()
    }

    # sorted() is stable, so ties keep the declaration order of ORGAN_HINTS.
    best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    if not best or best[0][1] == 0:
        return "unknown"

    # if 2+ organs are close, label mixed
    top_org, top_score = best[0]
    if len(best) > 1 and best[1][1] > 0 and (top_score - best[1][1]) <= 1:
        return "mixed"
    return top_org
78
+
79
+
80
+ # =============================
81
+ # Curated enzymes by organ (starter list)
82
+ # =============================
83
# Curated enzyme/transporter symbols scanned per inferred organ label.
ENZYMES_BY_ORGAN: Dict[str, List[str]] = {
    "liver": ["CYP1A2","CYP2C9","CYP2C19","CYP2D6","CYP2E1","CYP3A4","CYP3A5","UGT1A1","UGT2B7","SULT1A1","GSTA1","GSTP1","ADH","ALDH","CES1","CES2"],
    "lung": ["CYP1A1","CYP1B1","CYP2F1","GSTP1","MPO","ALDH"],
    "kidney": ["OAT1","OAT3","OCT2","MATE1","MATE2","GSTP1","GSTA1"],
    "skin": ["CYP1A1","GSTP1","UGT1A1","SULT1A1","ESTERASE","CES1","CES2"],
    "gi": ["CYP3A4","UGT1A1","UGT2B7","SULT1A1","ABCB1","P-GP","CES1","CES2"],
    "cns": ["MAO-A","MAO-B","MAOA","MAOB","COMT","ALDH"],
    "reproductive": ["AROMATASE","CYP19A1","HSD17B","CYP17A1","UGT2B7"],
    "immune_blood": ["MPO","COX","PTGS1","PTGS2","LOX","ALOX5"],
    "mixed": [],
    "unknown": [],
}

# Panel scanned instead of the (empty) curated list when the organ label is
# "mixed" or "unknown".
_DEFAULT_ENZYME_PANEL = ["CYP3A4","CYP2D6","CYP2E1","UGT1A1","SULT1A1","GSTP1","ALDH","ADH"]

# conservative regex patterns, each paired with the prefix that is re-attached
# to the captured isoform. The GST pattern requires a class letter followed by
# digits (GSTA1, GSTM1, GSTP1, ...): with IGNORECASE a bare [A-Z0-9]+ capture
# turned ordinary prose such as "GST activity" into "GSTACTIVITY".
_ENZYME_PATTERNS = [
    ("CYP", re.compile(r"\bCYP\s?(\d[A-Z]?\d?[A-Z]?\d?)\b", re.IGNORECASE)),
    ("UGT", re.compile(r"\bUGT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE)),
    ("SULT", re.compile(r"\bSULT\s?(\d[A-Z0-9]+)\b", re.IGNORECASE)),
    ("GST", re.compile(r"\bGST\s?([AKMOPSTZ]\d+)\b", re.IGNORECASE)),
    ("EC ", re.compile(r"\bEC\s?(\d+\.\d+\.\d+\.\d+)\b", re.IGNORECASE)),
]

# Kept as a plain list of compiled patterns for any code that imports it.
ENZYME_REGEXES = [rx for _, rx in _ENZYME_PATTERNS]


def _mentions_symbol(symbol: str, upper_text: str) -> bool:
    """Return True when *symbol* appears in *upper_text* as its own token.

    The symbol must not be preceded by a letter or digit and must not run on
    into further letters (a trailing plural "S" and trailing digits are
    allowed, so "ADH1B" still counts as ADH). This stops "MPO" matching
    "COMPOUND" and "ADH" matching "NADH".
    """
    pattern = rf"(?<![A-Z0-9]){re.escape(symbol)}S?(?![A-Z])"
    return re.search(pattern, upper_text) is not None


def detect_enzymes(text: str, organ: str) -> List[str]:
    """List enzyme/transporter names mentioned in *text*.

    Args:
        text: Page text; None or empty yields an empty list.
        organ: Organ label selecting the curated panel from ENZYMES_BY_ORGAN;
            "mixed", "unknown" use the default panel, any other unrecognised
            label uses no curated panel.

    Returns:
        Curated-panel hits first (panel order), followed by regex-detected
        CYP/UGT/SULT/GST isoforms and EC numbers in order of appearance per
        pattern, de-duplicated case-insensitively.
    """
    t = text or ""
    up = t.upper()

    if organ in ("mixed", "unknown"):
        panel = _DEFAULT_ENZYME_PANEL
    else:
        panel = ENZYMES_BY_ORGAN.get(organ, [])

    found: List[str] = [sym for sym in panel if _mentions_symbol(sym, up)]

    # regex enrich
    for prefix, rx in _ENZYME_PATTERNS:
        for m in rx.finditer(t):
            isoform = (m.group(1) or "").upper()
            if isoform:
                found.append(f"{prefix}{isoform}")

    # normalize P-gp variants, then dedupe preserving first occurrence
    seen = set()
    final: List[str] = []
    for name in found:
        if name in ("P-GP", "PGP", "PGLYCO"):
            name = "P-gp"
        key = name.lower()
        if key not in seen:
            seen.add(key)
            final.append(name)
    return final
153
+
154
+
155
+ # =============================
156
+ # Named pathways (starter lexicon)
157
+ # =============================
158
# Friendly pathway labels reported by detect_pathways (and used as filter
# choices). Order matters: PATHWAY_REGEXES is index-aligned with this list.
PATHWAY_TERMS = [
    "oxidative stress",
    "Nrf2",
    "AhR",
    "NF-kB",
    "p53",
    "MAPK",
    "PPAR",
    "apoptosis",
    "DNA damage response",
    "mitochondrial dysfunction",
    "estrogen receptor",
    "androgen receptor",
    "inflammation",
    "cytokine signaling",
]

# One pattern per entry of PATHWAY_TERMS, in the same order.
PATHWAY_REGEXES = [
    re.compile(r"\boxidative\s+stress\b", re.IGNORECASE),
    re.compile(r"\bNrf2\b", re.IGNORECASE),
    re.compile(r"\bAhR\b", re.IGNORECASE),
    # accepts NF-κB, NF-kB, NFkB, NF-kappaB and the kappa-less "NF-B" that
    # text extraction can leave behind when the Greek letter is dropped
    re.compile(r"\bNF[-\s]?(?:κ|k|kappa[-\s]?)?B\b", re.IGNORECASE),
    re.compile(r"\bT?p53\b", re.IGNORECASE),
    # no trailing \b: family members are written MAPK14, MAPKs, PPARγ, ...
    re.compile(r"\bMAPK", re.IGNORECASE),
    re.compile(r"\bPPAR", re.IGNORECASE),
    re.compile(r"\bapopto(?:sis|ses|tic)\b", re.IGNORECASE),
    re.compile(r"\bDNA\s+damage\s+responses?\b", re.IGNORECASE),
    re.compile(r"\bmitochondrial\s+dysfunction\b", re.IGNORECASE),
    re.compile(r"\bo?estrogen\s+receptors?\b", re.IGNORECASE),
    re.compile(r"\bandrogen\s+receptors?\b", re.IGNORECASE),
    re.compile(r"\binflammat(?:ion|ory)\b", re.IGNORECASE),
    re.compile(r"\bcytokine\s+signall?ing\b", re.IGNORECASE),
]


def detect_pathways(text: str) -> List[str]:
    """Return the PATHWAY_TERMS labels whose pattern occurs in *text*.

    Each label is reported at most once, in PATHWAY_TERMS order. Matching is
    done with the index-aligned PATHWAY_REGEXES so that inflected forms
    ("apoptotic", "inflammatory") and spelling variants of NF-kB map onto the
    friendly label. None or empty text yields an empty list.
    """
    t = text or ""
    return [label for label, rx in zip(PATHWAY_TERMS, PATHWAY_REGEXES) if rx.search(t)]
217
+
218
+
219
+ # =============================
220
+ # PDF utils
221
+ # =============================
222
def extract_pages(pdf_path: str, max_pages: int) -> Tuple[List[Tuple[int, str]], int]:
    """Read the text of the leading pages of a PDF.

    Args:
        pdf_path: Path handed to PdfReader.
        max_pages: Upper bound on the number of pages read from the start.

    Returns:
        A pair ``(pages, total)`` where ``pages`` is a list of
        ``(1-based page number, extracted text)`` tuples and ``total`` is the
        page count reported by the reader. A page whose extraction raises is
        kept with empty text (best effort).
    """
    pdf = PdfReader(pdf_path)
    page_count = len(pdf.pages)
    extracted: List[Tuple[int, str]] = []
    for page_no in range(1, min(page_count, max_pages) + 1):
        try:
            content = pdf.pages[page_no - 1].extract_text() or ""
        except Exception:
            content = ""
        extracted.append((page_no, content))
    return extracted, page_count
234
+
235
def clean_text(t: str) -> str:
    """Replace NUL characters with spaces, collapse whitespace runs, and trim.

    None is treated as the empty string.
    """
    without_nuls = (t or "").replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nuls).strip()
239
+
240
def is_text_based(pages: List[Tuple[int, str]]) -> bool:
    """Tell whether the extracted pages carry enough text to index.

    The cleaned, non-empty page texts are joined with single spaces; the
    document counts as text-based when that string has at least 200
    characters.
    """
    cleaned_pages = (clean_text(raw) for _, raw in pages)
    combined = " ".join(piece for piece in cleaned_pages if piece)
    return len(combined) >= 200
243
+
244
+
245
+ # =============================
246
+ # OpenAI helpers
247
+ # =============================
248
def get_client(api_key: str) -> OpenAI:
    """Build an OpenAI client from an explicit key or the environment.

    The stripped *api_key* is used when non-empty; otherwise the stripped
    OPENAI_API_KEY environment variable is used.

    Raises:
        ValueError: when neither source provides a key.
    """
    resolved = (api_key or "").strip()
    if not resolved:
        resolved = os.getenv("OPENAI_API_KEY", "").strip()
    if not resolved:
        raise ValueError("Missing OpenAI API key. Provide it here or set OPENAI_API_KEY secret.")
    return OpenAI(api_key=resolved)
253
+
254
def batched(xs: List[Any], n: int) -> List[List[Any]]:
    """Split *xs* into consecutive slices of at most *n* items each."""
    chunks: List[List[Any]] = []
    for start in range(0, len(xs), n):
        chunks.append(xs[start:start + n])
    return chunks
256
+
257
def embed_texts(client: OpenAI, model: str, texts: List[str]) -> np.ndarray:
    """Embed *texts* and return L2-normalised float32 row vectors.

    Requests are sent in batches of 64 via ``client.embeddings.create``; the
    returned vectors are stacked in response order and each row is divided
    by its norm (plus a tiny epsilon to avoid division by zero), so a dot
    product between rows is a cosine similarity.

    Args:
        client: OpenAI client used for the embedding requests.
        model: Embedding model name passed through to the API.
        texts: Strings to embed.

    Returns:
        Array with one row per input text. For an empty *texts* list an
        empty ``(0, 0)`` array is returned without calling the API; the
        previous code produced a 1-D array there and failed in
        ``np.linalg.norm(..., axis=1)``.
    """
    if not texts:
        return np.zeros((0, 0), dtype=np.float32)

    vectors: List[List[float]] = []
    for chunk in batched(texts, 64):
        resp = client.embeddings.create(model=model, input=chunk)
        vectors.extend(item.embedding for item in resp.data)

    arr = np.asarray(vectors, dtype=np.float32)
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
    return arr / norms
266
+
267
+
268
+ # =============================
269
+ # Endpoint detection
270
+ # =============================
271
def _endpoint_hint_present(hint: str, lowered_text: str) -> bool:
    """Return True when *hint* occurs in *lowered_text*.

    Hints of four characters or fewer ("ames", "aop", "qsar", "ld50", ...)
    must appear as a whole word, optionally pluralised with "s"; as bare
    substrings they fire inside everyday words such as "names" or "frames".
    Longer hints keep substring matching so stems like "omics" or "tumor"
    still catch "transcriptomics" and "tumorigenic".
    """
    if len(hint) <= 4:
        return re.search(rf"\b{re.escape(hint)}s?\b", lowered_text) is not None
    return hint in lowered_text


def detect_endpoints(text: str, hints: Optional[Dict[str, List[str]]] = None) -> List[str]:
    """Return the endpoint labels whose hints occur in *text*.

    Args:
        text: Page text; matched case-insensitively. None or empty yields [].
        hints: Optional label -> hint-list lexicon. Defaults to the
            module-level ENDPOINT_HINTS.

    Returns:
        Labels in lexicon order, each at most once.
    """
    lexicon = ENDPOINT_HINTS if hints is None else hints
    t = (text or "").lower()
    return [
        label
        for label, terms in lexicon.items()
        if any(_endpoint_hint_present(term.lower(), t) for term in terms)
    ]
280
+
281
+
282
+ # =============================
283
+ # "3–5 lines" expanded context = 3–5 sentences (PDF lines unreliable)
284
+ # =============================
285
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences after collapsing whitespace.

    A sentence boundary is whitespace that follows ".", "?" or "!". Empty
    pieces are dropped; None or blank input yields [].
    """
    collapsed = re.sub(r"\s+", " ", (text or "")).strip()
    if not collapsed:
        return []
    pieces = re.split(r"(?<=[\.\?\!])\s+", collapsed)
    return [p.strip() for p in pieces if p.strip()]


def expanded_context(page_text: str, query: str, n_sentences: int = 5) -> str:
    """Return up to *n_sentences* sentences of *page_text* around the best query hit.

    Query words are the alphanumeric/hyphen tokens of the lowercased query
    that are at least three characters long. The anchor is the sentence
    containing the most distinct query words (earliest wins ties), so a
    common word such as "the" no longer anchors the window on an unrelated
    sentence. The window spans ``(n - 1) // 2`` sentences before the anchor
    and the remainder from the anchor onward, clipped to the page; with the
    default of 5 that is two before and two after.

    Returns:
        "" when the page has no sentences; the first *n_sentences* sentences
        when the query is empty or nothing matches; otherwise the window
        around the anchor, joined with single spaces. *n_sentences* below 1
        is treated as 1.
    """
    sents = split_sentences(page_text)
    if not sents:
        return ""

    window = max(1, int(n_sentences))
    qwords = {
        w for w in re.findall(r"[a-zA-Z0-9\-]+", (query or "").strip().lower())
        if len(w) >= 3
    }

    best_i, best_hits = 0, 0
    for i, sentence in enumerate(sents):
        lowered = sentence.lower()
        hits = sum(1 for w in qwords if w in lowered)
        if hits > best_hits:  # strict ">" keeps the earliest sentence on ties
            best_i, best_hits = i, hits

    if best_hits == 0:
        return " ".join(sents[:window])

    before = (window - 1) // 2
    start = max(0, best_i - before)
    end = min(len(sents), best_i + (window - before))
    return " ".join(sents[start:end])
313
+
314
+
315
+ # =============================
316
+ # Index state object (stored in gr.State)
317
+ # =============================
318
def empty_index() -> Dict[str, Any]:
    """Return a fresh, empty index state (new containers on every call).

    Keys:
        papers: list of {paper_id, file, organ, pages_indexed, text_based}
        pages: list of {paper_id, file, page, text, endpoints, enzymes, pathways}
        embeddings: normalized np.ndarray, or None when not built
        embedding_model: name of the model used for ``embeddings``, or None
        has_embeddings: whether ``embeddings`` is populated
        enzymes_vocab / pathways_vocab: filter vocabularies
    """
    state: Dict[str, Any] = dict(
        papers=[],
        pages=[],
        embeddings=None,
        embedding_model=None,
        has_embeddings=False,
        enzymes_vocab=[],
        pathways_vocab=[],
    )
    return state
328
+
329
+
330
def build_index(files, api_key: str, embedding_model: str):
    """Extract, tag and (optionally) embed the uploaded PDFs.

    Args:
        files: Uploaded files; each item is either an object exposing the
            path as ``.name`` or a plain path string.
        api_key: OpenAI key forwarded to get_client (which may fall back to
            the environment).
        embedding_model: Embedding model name; DEFAULT_EMBEDDING_MODEL when
            empty.

    Returns:
        A 6-tuple matching the click handler's outputs: the index state
        dict, the papers DataFrame, the endpoint-by-paper matrix DataFrame,
        a status string, and two ``gr.update`` objects refreshing the enzyme
        and pathway filter choices.

    A PDF that cannot be opened is listed as non-text-based and named in
    the status instead of aborting the whole build. Embedding failures are
    reported in the status and leave the index usable for keyword search.
    """
    if not files:
        return empty_index(), pd.DataFrame(), pd.DataFrame(), "Upload PDFs then click Build Search Index.", gr.update(choices=[]), gr.update(choices=[])

    if len(files) > MAX_PDFS:
        return empty_index(), pd.DataFrame(), pd.DataFrame(), f"Upload limit exceeded: max {MAX_PDFS} PDFs for pilot.", gr.update(choices=[]), gr.update(choices=[])

    idx = empty_index()
    papers_rows: List[Dict[str, Any]] = []
    page_rows: List[Dict[str, Any]] = []
    unreadable: List[str] = []

    for f in files:
        # Accept both file wrappers carrying ``.name`` and bare path strings.
        pdf_path = getattr(f, "name", None) or str(f)
        filename = os.path.basename(pdf_path)
        try:
            pages, total = extract_pages(pdf_path, MAX_PAGES_PER_PDF)
        except Exception:
            # Corrupt/encrypted/non-PDF upload: keep going with the others.
            pages, total = [], 0
            unreadable.append(filename)
        text_ok = is_text_based(pages)

        doc_text = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
        organ = infer_organ_label(doc_text) if text_ok else "unknown"

        paper_id = filename
        papers_rows.append({
            "paper_id": paper_id,
            "file": filename,
            "organ": organ,
            "pages_indexed": min(total, MAX_PAGES_PER_PDF),
            "text_based": bool(text_ok),
        })

        if not text_ok:
            continue

        for pno, raw in pages:
            txt = clean_text(raw)
            if not txt:
                continue
            txt = txt[:MAX_CHARS_PER_PAGE_FOR_INDEX]

            page_rows.append({
                "paper_id": paper_id,
                "file": filename,
                "page": pno,
                "text": txt,
                "endpoints": detect_endpoints(txt),
                "enzymes": detect_enzymes(txt, organ),
                "pathways": detect_pathways(txt),
            })

    idx["papers"] = papers_rows
    idx["pages"] = page_rows

    papers_df = pd.DataFrame(papers_rows, columns=["file","organ","pages_indexed","text_based"])

    # Endpoint × Paper matrix (counts of pages mentioning each endpoint)
    matrix = []
    endpoint_names = list(ENDPOINT_HINTS.keys())
    for p in papers_rows:
        if not p.get("text_based"):
            continue
        pid = p["paper_id"]
        row = {"file": p["file"], "organ": p["organ"]}
        p_pages = [r for r in page_rows if r["paper_id"] == pid]
        for ep in endpoint_names:
            row[ep] = sum(1 for r in p_pages if ep in (r.get("endpoints") or []))
        matrix.append(row)
    endpoint_matrix_df = pd.DataFrame(matrix) if matrix else pd.DataFrame(columns=["file","organ"] + endpoint_names)

    # vocab lists for filters (computed at indexing time)
    enzymes_vocab = sorted({e for r in page_rows for e in (r.get("enzymes") or [])})
    pathways_vocab = sorted({p for r in page_rows for p in (r.get("pathways") or [])})
    idx["enzymes_vocab"] = enzymes_vocab
    idx["pathways_vocab"] = pathways_vocab

    # embeddings
    texts = [r["text"] for r in page_rows]
    if not texts:
        # Decide this before touching the API so a missing key cannot mask it.
        status = "⚠️ No text pages found to index (text-based PDFs only)."
    else:
        try:
            client = get_client(api_key)
            model_name = embedding_model or DEFAULT_EMBEDDING_MODEL
            idx["embeddings"] = embed_texts(client, model_name, texts)
            idx["embedding_model"] = model_name
            idx["has_embeddings"] = True
            status = f"✅ Indexed {len(papers_rows)} paper(s), {len(texts)} page(s). Embeddings built ({idx['embedding_model']})."
        except Exception as e:
            status = f"⚠️ Indexed pages, but embeddings unavailable: {e}. You can still run search with fallback ranking."

    if unreadable:
        status += " Skipped unreadable PDF(s): " + ", ".join(unreadable) + "."

    return (
        idx,
        papers_df,
        endpoint_matrix_df,
        status,
        gr.update(choices=[""] + enzymes_vocab, value=""),
        gr.update(choices=[""] + pathways_vocab, value="")
    )
431
+
432
+
433
def search(
    query: str,
    idx: Dict[str, Any],
    api_key: str,
    embedding_model: str,
    summary_model: str,
    endpoint_filter: List[str],
    organ_filter: str,
    enzyme_filter: str,
    pathway_filter: str,
    top_k: int,
):
    """Rank indexed pages against *query* and draft a grounded mini-summary.

    Pages are first narrowed by the optional organ / endpoint / enzyme /
    pathway filters. Ranking uses the dot product between the stored
    normalized page embeddings and the query embedding when embeddings are
    available; if that path is unavailable or fails, pages are ranked by how
    many query words (3+ characters) occur in the page text.

    The query is embedded with the model recorded in the index
    (``idx["embedding_model"]``); *embedding_model* is only a fallback.
    Preferring the UI value meant that changing the dropdown after building
    the index embedded the query with a different model than the pages.

    Returns:
        ``(results_df, mini_summary_markdown, evidence_markdown)``. The
        summary is generated from at most the top 8 contexts; any failure
        there is reported inline in the markdown rather than raised.
    """
    query = (query or "").strip()
    if not query:
        return pd.DataFrame(), "### Grounded mini-summary\n(type a query)", "### Evidence used\n"

    if not idx or not idx.get("pages"):
        return pd.DataFrame(), "### Grounded mini-summary\n(Build the index first)", "### Evidence used\n"

    pages = idx["pages"]
    papers = {p["paper_id"]: p for p in (idx.get("papers") or [])}
    limit = max(1, int(top_k))

    def passes(r: Dict[str, Any]) -> bool:
        if organ_filter and organ_filter != "any":
            org = (papers.get(r["paper_id"], {}) or {}).get("organ", "unknown")
            if org != organ_filter:
                return False
        if endpoint_filter:
            eps = r.get("endpoints") or []
            if not any(e in eps for e in endpoint_filter):
                return False
        if enzyme_filter:
            enz = r.get("enzymes") or []
            if enzyme_filter not in enz:
                return False
        if pathway_filter:
            pws = r.get("pathways") or []
            if pathway_filter not in pws:
                return False
        return True

    filtered_idx = [i for i, r in enumerate(pages) if passes(r)]
    if not filtered_idx:
        return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"

    ranked: List[Tuple[float, Dict[str, Any]]] = []

    # embeddings path
    if idx.get("has_embeddings") and idx.get("embeddings") is not None:
        try:
            client = get_client(api_key)
            # Query and pages must live in the same embedding space.
            query_model = idx.get("embedding_model") or embedding_model or DEFAULT_EMBEDDING_MODEL
            qemb = embed_texts(client, query_model, [query])[0]
            mat = idx["embeddings"][filtered_idx, :]
            scores = mat @ qemb
            order = np.argsort(scores)[::-1][:limit]
            for j in order:
                page_i = filtered_idx[int(j)]
                ranked.append((float(scores[int(j)]), pages[page_i]))
        except Exception:
            # Best effort: any failure here drops to keyword ranking below.
            ranked = []

    # fallback ranking
    if not ranked:
        qwords = {w for w in re.findall(r"[a-zA-Z0-9\-]+", query.lower()) if len(w) >= 3}
        tmp = []
        for i in filtered_idx:
            t = (pages[i].get("text") or "").lower()
            hits = sum(1 for w in qwords if w in t)
            tmp.append((hits, pages[i]))
        tmp.sort(key=lambda x: x[0], reverse=True)
        ranked = [(float(h), r) for h, r in tmp[:limit]]

    rows = []
    evidence = []
    for score, r in ranked:
        pid = r["paper_id"]
        org = (papers.get(pid, {}) or {}).get("organ", "unknown")
        ctx = expanded_context(r.get("text", ""), query, n_sentences=5)

        rows.append({
            "file": r.get("file",""),
            "page": r.get("page",""),
            "score": round(score, 4),
            "organ": org,
            "endpoints": "; ".join(r.get("endpoints") or []),
            "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
            "pathways": "; ".join((r.get("pathways") or [])[:12]),
            "context": ctx
        })

        snippet = ctx[:360] + ("…" if len(ctx) > 360 else "")
        evidence.append(f"- **{r.get('file','')}** (p.{r.get('page','')}): {snippet}")

    results_df = pd.DataFrame(rows, columns=["file","page","score","organ","endpoints","enzymes","pathways","context"])
    evidence_md = "### Evidence used\n" + "\n".join(evidence[:8])

    # grounded mini-summary
    mini_summary = "(mini-summary unavailable)"
    try:
        client = get_client(api_key)
        payload = [{"file": x["file"], "page": x["page"], "context": x["context"]} for x in rows[:8]]

        system_msg = (
            "You are a literature assistant for toxicology researchers. "
            "Write ONE neutral paragraph that answers the user's query based ONLY on the evidence excerpts. "
            "Cite sources inline as (File p.X). Do not add outside facts."
        )
        user_msg = "USER QUERY:\n" + query + "\n\nEVIDENCE EXCERPTS:\n" + json.dumps(payload, indent=2)
        resp = client.responses.create(
            model=summary_model or DEFAULT_SUMMARY_MODEL,
            input=[{"role":"system","content":system_msg},{"role":"user","content":user_msg}]
        )
        mini_summary = resp.output_text.strip()
    except Exception as e:
        mini_summary = f"(mini-summary unavailable: {e})"

    mini_md = "### Grounded mini-summary\n" + mini_summary
    return results_df, mini_md, evidence_md
551
+
552
+
553
+ # =============================
554
+ # Tab plugin (Option A)
555
+ # =============================
556
def build_literature_explorer_tab():
    """Create the Literature Explorer components and wire their click handlers.

    The function only instantiates Gradio components and registers
    callbacks; it returns nothing. Presumably it is meant to be called
    inside an enclosing Blocks/Tab context — confirm against the caller.

    NOTE(review): the nesting of the ``with`` blocks below was reconstructed
    from a flattened diff view; verify the layout against the original file.
    """
    gr.Markdown(
        "## Literature Explorer (Pilot)\n"
        f"- Limits: **max {MAX_PDFS} PDFs**, **max {MAX_PAGES_PER_PDF} pages/PDF**\n"
        "- Text-based PDFs only (not scanned/image PDFs).\n"
        "- Semantic search is page-level; “3–5 lines context” is approximated as **3–5 sentences**.\n"
    )

    # Index state shared between the build and search callbacks.
    idx_state = gr.State(empty_index())

    # --- Upload + indexing controls ---
    with gr.Group():
        files = gr.File(label="Upload PDFs (Explorer only)", file_types=[".pdf"], file_count="multiple")
        with gr.Row():
            api_key = gr.Textbox(label="OpenAI API key (Explorer)", type="password")
            embedding_model = gr.Dropdown(label="Embedding model", choices=["text-embedding-3-small","text-embedding-3-large"], value=DEFAULT_EMBEDDING_MODEL)
            summary_model = gr.Dropdown(label="Mini-summary model", choices=["gpt-4o-mini","gpt-4o","gpt-4o-2024-08-06"], value=DEFAULT_SUMMARY_MODEL)

        build_btn = gr.Button("Build Search Index", variant="primary")
        index_status = gr.Textbox(label="Index status", interactive=False)
        papers_df = gr.Dataframe(label="Indexed papers", interactive=False, wrap=True)
        endpoint_matrix_df = gr.Dataframe(label="Endpoint correlation (pages per endpoint per paper)", interactive=False, wrap=True)

    # --- Search controls and result views ---
    with gr.Group():
        gr.Markdown("### Search across indexed papers")
        query = gr.Textbox(label="Search query", placeholder="e.g., CYP3A4 oxidative stress and genotoxicity", lines=2)

        with gr.Row():
            endpoint_filter = gr.Dropdown(label="Endpoint filter (optional)", choices=list(ENDPOINT_HINTS.keys()), multiselect=True, value=[])
            organ_filter = gr.Dropdown(label="Organ filter (optional)", choices=["any"] + ORGANS, value="any")
            # Enzyme/pathway choices start with only the blank entry; build_index
            # returns updates that fill them from the indexed pages.
            enzyme_filter = gr.Dropdown(label="Enzyme filter (optional)", choices=[""], value="")
            pathway_filter = gr.Dropdown(label="Pathway filter (optional)", choices=[""], value="")

        top_k = gr.Slider(5, 30, value=12, step=1, label="Top results")
        search_btn = gr.Button("Search", variant="secondary")

        mini_summary_md = gr.Markdown()
        results_df = gr.Dataframe(label="Search results (page-level)", interactive=False, wrap=True)
        evidence_md = gr.Markdown()

    # Output order must match the 6-tuple returned by build_index.
    build_btn.click(
        fn=build_index,
        inputs=[files, api_key, embedding_model],
        outputs=[idx_state, papers_df, endpoint_matrix_df, index_status, enzyme_filter, pathway_filter]
    )

    # Input order must match the positional parameters of search; outputs
    # match its 3-tuple (results table, mini-summary, evidence).
    search_btn.click(
        fn=search,
        inputs=[query, idx_state, api_key, embedding_model, summary_model, endpoint_filter, organ_filter, enzyme_filter, pathway_filter, top_k],
        outputs=[results_df, mini_summary_md, evidence_md]
    )