hchevva committed on
Commit
50467c5
·
verified ·
1 Parent(s): 4a6dfec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +512 -406
app.py CHANGED
@@ -1,511 +1,617 @@
1
  import os
2
  import re
 
3
  import math
4
  import tempfile
5
  from pathlib import Path
6
- from typing import Dict, List, Tuple
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import pandas as pd
11
 
12
- import nltk
13
- from nltk.sentiment import SentimentIntensityAnalyzer
14
-
15
  from pypdf import PdfReader
16
-
17
  from sklearn.feature_extraction.text import TfidfVectorizer
18
 
19
- import matplotlib.pyplot as plt
20
- import seaborn as sns
21
- from wordcloud import WordCloud
22
-
23
- from sumy.parsers.plaintext import PlaintextParser
24
- from sumy.nlp.tokenizers import Tokenizer
25
- from sumy.summarizers.text_rank import TextRankSummarizer
26
 
27
 
28
  # -----------------------------
29
- # NLTK setup (downloads once)
30
  # -----------------------------
31
- _NLTK_READY = False
32
-
33
- def ensure_nltk():
34
- global _NLTK_READY
35
- if _NLTK_READY:
36
- return
37
- nltk.download("punkt", quiet=True)
38
- nltk.download("punkt_tab", quiet=True) # some envs need this
39
- nltk.download("vader_lexicon", quiet=True)
40
- _NLTK_READY = True
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # -----------------------------
44
- # PDF extraction
45
  # -----------------------------
46
- def extract_text_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[str, int]:
47
- """
48
- Returns (text, page_count). max_pages=0 means all pages.
49
- Note: scanned-image PDFs may yield little/no text.
50
- """
51
  reader = PdfReader(pdf_path)
52
  page_count = len(reader.pages)
53
- pages_to_read = page_count if (max_pages is None or max_pages <= 0) else min(page_count, max_pages)
54
 
55
- parts = []
56
  for i in range(pages_to_read):
57
  try:
58
  t = reader.pages[i].extract_text() or ""
59
  except Exception:
60
  t = ""
61
- if t.strip():
62
- parts.append(t)
 
63
 
64
- return "\n".join(parts).strip(), page_count
65
 
 
 
 
 
 
66
 
67
- # -----------------------------
68
- # Utilities
69
- # -----------------------------
70
- def clean_whitespace(text: str) -> str:
71
- text = text or ""
72
- text = text.replace("\x00", " ")
73
- text = re.sub(r"\s+", " ", text).strip()
74
- return text
75
 
76
- def split_into_chunks(text: str, chunk_chars: int = 3000) -> List[str]:
77
  """
78
- Chunk by sentences into ~chunk_chars blocks.
79
  """
80
- text = text or ""
81
- if not text.strip():
82
- return []
83
-
84
- sentences = nltk.sent_tokenize(text)
85
  chunks = []
86
- cur = []
87
-
88
  cur_len = 0
89
- for s in sentences:
90
- s = s.strip()
91
- if not s:
 
92
  continue
93
- if cur_len + len(s) + 1 > chunk_chars and cur:
94
- chunks.append(" ".join(cur))
95
- cur = [s]
96
- cur_len = len(s)
 
 
 
 
 
 
 
 
97
  else:
98
- cur.append(s)
99
- cur_len += len(s) + 1
100
 
101
- if cur:
102
- chunks.append(" ".join(cur))
 
103
 
104
  return chunks
105
 
106
- def vader_doc_sentiment(text: str, chunk_chars: int = 3000) -> Tuple[float, str, List[float]]:
107
- """
108
- Returns: (avg_compound_score, label, chunk_scores)
109
- """
110
- ensure_nltk()
111
- sia = SentimentIntensityAnalyzer()
112
 
113
- chunks = split_into_chunks(text, chunk_chars=chunk_chars)
114
- if not chunks:
115
- return 0.0, "Neutral", []
 
 
 
 
116
 
117
- scores = [sia.polarity_scores(c).get("compound", 0.0) for c in chunks]
118
- avg = float(np.mean(scores))
119
 
120
- if avg >= 0.05:
121
- label = "Positive"
122
- elif avg <= -0.05:
123
- label = "Negative"
124
- else:
125
- label = "Neutral"
 
 
 
 
 
126
 
127
- return avg, label, scores
 
 
128
 
129
- def extract_keywords_tfidf(text: str, top_k: int = 20) -> List[Tuple[str, float]]:
130
- """
131
- TF-IDF keywords for a single document.
132
- Uses unigrams + bigrams; returns list of (term, score).
133
- """
134
- text = text or ""
135
- if not text.strip():
136
- return []
137
 
138
- vectorizer = TfidfVectorizer(
139
- stop_words="english",
140
- ngram_range=(1, 2),
141
- max_features=5000
142
- )
143
- X = vectorizer.fit_transform([text])
144
- feats = np.array(vectorizer.get_feature_names_out())
145
- scores = X.toarray().ravel()
146
 
147
- if scores.size == 0:
148
- return []
 
 
 
 
 
 
 
 
149
 
150
- idx = np.argsort(scores)[::-1]
151
- idx = idx[: max(1, int(top_k))]
152
- return [(feats[i], float(scores[i])) for i in idx if scores[i] > 0]
153
-
154
- def make_wordcloud_figure(text: str):
155
- text = text or ""
156
- if not text.strip():
157
- return None
158
- wc = WordCloud(width=1200, height=600, background_color="white").generate(text)
159
- fig = plt.figure(figsize=(10, 5))
160
- ax = fig.add_subplot(111)
161
- ax.imshow(wc, interpolation="bilinear")
162
- ax.axis("off")
163
- fig.tight_layout()
164
- return fig
165
-
166
- def textrank_summary(text: str, num_sentences: int = 6) -> str:
167
- text = (text or "").strip()
168
- if not text:
169
- return ""
170
- num_sentences = max(1, int(num_sentences))
171
-
172
- parser = PlaintextParser.from_string(text, Tokenizer("english"))
173
- summarizer = TextRankSummarizer()
174
- sents = summarizer(parser.document, num_sentences)
175
- return " ".join(str(s) for s in sents)
176
-
177
- def detect_title(text: str) -> str:
178
- """
179
- Heuristic: pick the first 'strong' line from the first ~30 lines.
180
- """
181
- raw = text or ""
182
- lines = [l.strip() for l in raw.splitlines() if l.strip()]
183
- lines = lines[:30]
184
- for l in lines:
185
- if 8 <= len(l) <= 200 and not l.lower().startswith(("abstract", "introduction")):
186
- # avoid obvious author lines
187
- if not re.search(r"\b(university|department|email|corresponding)\b", l.lower()):
188
- return l
189
- return lines[0] if lines else ""
190
-
191
- def extract_abstract(text: str) -> str:
192
- """
193
- Try: ABSTRACT ... INTRODUCTION
194
- """
195
- t = text or ""
196
- m = re.search(r"\babstract\b(.*?)(\bintroduction\b|\b1\.\s*introduction\b)", t, flags=re.IGNORECASE | re.DOTALL)
197
- if not m:
198
- return ""
199
- abs_text = clean_whitespace(m.group(1))
200
- # keep reasonable length
201
- return abs_text[:2000]
202
-
203
- def extract_section_headings(text: str, max_headings: int = 20) -> List[str]:
204
  """
205
- Simple heading heuristic:
206
- - Lines that look like: "1. Introduction", "2 Methods", "RESULTS", etc.
207
  """
208
- lines = [l.strip() for l in (text or "").splitlines()]
209
- headings = []
210
- for l in lines:
211
- if not l or len(l) > 120:
 
 
 
212
  continue
213
- if re.match(r"^\d+(\.\d+)*\s+[A-Z].{2,}$", l):
214
- headings.append(l)
215
- elif l.isupper() and 4 <= len(l) <= 60:
216
- headings.append(l)
217
- if len(headings) >= max_headings:
218
- break
219
- # dedupe while preserving order
220
- seen = set()
221
- out = []
222
- for h in headings:
223
- key = h.lower()
224
- if key not in seen:
225
- seen.add(key)
226
- out.append(h)
227
- return out
228
-
229
- def detect_cas_numbers(text: str) -> List[str]:
230
- """
231
- CAS format: 2-7 digits - 2 digits - 1 digit
232
- """
233
- cas = re.findall(r"\b\d{2,7}-\d{2}-\d\b", text or "")
234
- # unique preserve order
235
- seen = set()
236
- out = []
237
- for c in cas:
238
- if c not in seen:
239
- seen.add(c)
240
- out.append(c)
241
- return out
242
-
243
- TOX_TERMS = [
244
- "hazard", "risk", "exposure", "dose", "response", "toxicity",
245
- "adverse", "noael", "loael", "benchmark dose", "bmd", "bmdl",
246
- "carcinogenic", "mutagen", "genotoxic", "teratogenic",
247
- "lc50", "ld50", "in vitro", "in vivo", "metabolite"
248
- ]
249
-
250
- def tox_term_counts(text: str) -> List[Tuple[str, int]]:
251
- t = (text or "").lower()
252
- counts = []
253
- for term in TOX_TERMS:
254
- c = len(re.findall(r"\b" + re.escape(term) + r"\b", t))
255
- if c > 0:
256
- counts.append((term, c))
257
- counts.sort(key=lambda x: x[1], reverse=True)
258
- return counts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
 
261
  # -----------------------------
262
- # Batch pipeline + reporting
263
  # -----------------------------
264
- def build_context_report(
265
- filename: str,
266
- title: str,
267
- pages: int,
268
- word_count: int,
269
- sent_score: float,
270
- sent_label: str,
271
- keywords: List[Tuple[str, float]],
272
- abstract: str,
273
- headings: List[str],
274
- summary: str,
275
- cas: List[str],
276
- tox_counts: List[Tuple[str, int]]
277
- ) -> str:
278
- kw = ", ".join([k for k, _ in keywords[:15]]) if keywords else "(none)"
279
- cas_str = ", ".join(cas[:15]) + (" ..." if len(cas) > 15 else "") if cas else "(none)"
280
- headings_str = "\n".join([f"- {h}" for h in headings]) if headings else "- (none detected)"
281
- tox_str = "\n".join([f"- {t}: {c}" for t, c in tox_counts[:12]]) if tox_counts else "- (none detected)"
282
-
283
- abs_block = abstract if abstract else "(abstract not detected)"
284
- sum_block = summary if summary else "(summary unavailable)"
285
-
286
- return f"""## {filename}
287
-
288
- **Title (heuristic):** {title or "(not detected)"}
289
- **Pages:** {pages}
290
- **Approx. word count:** {word_count:,}
291
-
292
- ### Sentiment / Tone
293
- - **Average compound score:** {sent_score:.3f}
294
- - **Label:** **{sent_label}**
295
- > Interpretation note: for research papers, this is best read as *tone polarity* rather than emotion.
296
-
297
- ### Keywords (TF-IDF)
298
- {kw}
299
-
300
- ### Abstract (if detected)
301
- {abs_block}
302
-
303
- ### Extractive summary (TextRank)
304
- {sum_block}
305
-
306
- ### Section outline (heuristic)
307
- {headings_str}
308
-
309
- ### CAS numbers detected
310
- {cas_str}
311
-
312
- ### Toxicology concept coverage
313
- {tox_str}
314
- """
315
 
 
 
 
 
 
 
 
 
 
316
 
317
- def analyze_pdfs(files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud):
318
- ensure_nltk()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  if not files:
321
- return None, None, [], "", None, None, None, "Upload one or more PDFs."
322
 
323
- top_k_keywords = int(top_k_keywords)
324
- summary_sentences = int(summary_sentences)
325
- chunk_chars = int(chunk_chars)
326
- max_pages = int(max_pages)
327
 
328
- results_rows = []
329
- details: Dict[str, Dict] = {}
 
330
 
331
- tmpdir = Path(tempfile.mkdtemp(prefix="tox_paper_nlp_"))
 
 
 
 
 
 
 
 
 
 
332
 
333
  for f in files:
334
  pdf_path = f.name
335
  filename = os.path.basename(pdf_path)
336
 
337
- raw_text, pages = extract_text_from_pdf(pdf_path, max_pages=max_pages)
338
- raw_text = raw_text or ""
339
- word_count = len(clean_whitespace(raw_text).split())
340
-
341
- # sentiment
342
- sent_score, sent_label, chunk_scores = vader_doc_sentiment(raw_text, chunk_chars=chunk_chars)
343
-
344
- # keywords + summary + context
345
- keywords = extract_keywords_tfidf(raw_text, top_k=top_k_keywords)
346
- abstract = extract_abstract(raw_text)
347
- title = detect_title(raw_text)
348
- headings = extract_section_headings(raw_text)
349
- summary = textrank_summary(raw_text, num_sentences=summary_sentences)
350
- cas = detect_cas_numbers(raw_text)
351
- tox_counts = tox_term_counts(raw_text)
352
-
353
- report_md = build_context_report(
354
- filename=filename,
355
- title=title,
356
- pages=pages,
357
- word_count=word_count,
358
- sent_score=sent_score,
359
- sent_label=sent_label,
360
- keywords=keywords,
361
- abstract=abstract,
362
- headings=headings,
363
- summary=summary,
364
- cas=cas,
365
- tox_counts=tox_counts
366
- )
367
-
368
- # Save extracted text + per-doc JSON for portability
369
- txt_path = tmpdir / f"{Path(filename).stem}.txt"
370
- txt_path.write_text(raw_text, encoding="utf-8", errors="ignore")
371
-
372
- details[filename] = {
373
- "filename": filename,
374
- "pages": pages,
375
- "word_count": word_count,
376
- "sentiment_score": sent_score,
377
- "sentiment_label": sent_label,
378
- "chunk_scores": chunk_scores,
379
- "keywords": keywords,
380
- "abstract": abstract,
381
- "title": title,
382
- "headings": headings,
383
- "summary": summary,
384
- "cas_numbers": cas,
385
- "tox_term_counts": tox_counts,
386
- "report_md": report_md,
387
- "text_path": str(txt_path),
388
- "raw_text_preview": (raw_text[:6000] + " ...") if len(raw_text) > 6000 else raw_text
389
- }
390
-
391
- results_rows.append({
392
  "file": filename,
393
- "pages": pages,
394
- "word_count": word_count,
395
- "sentiment_score": round(sent_score, 4),
396
- "sentiment_label": sent_label,
397
- "top_keywords": ", ".join([k for k, _ in keywords[:10]]),
398
- "cas_count": len(cas),
399
- })
400
-
401
- df = pd.DataFrame(results_rows).sort_values(["sentiment_score", "word_count"], ascending=[False, False])
402
-
403
- # Save table as CSV for download
404
- csv_path = tmpdir / "pdf_nlp_results.csv"
 
 
 
 
 
405
  df.to_csv(csv_path, index=False)
 
406
 
407
- # Populate doc selector and default view
408
- doc_names = list(details.keys())
409
- first = doc_names[0]
410
 
411
- state = details
412
- report_md = details[first]["report_md"]
413
 
414
- # sentiment distribution plot for first doc
415
- fig_sent = None
416
- scores = details[first]["chunk_scores"]
417
- if scores:
418
- fig_sent = plt.figure()
419
- ax = fig_sent.add_subplot(111)
420
- sns.histplot(scores, kde=True, ax=ax)
421
- ax.set_title(f"Chunk Sentiment Distribution: {first}")
422
- ax.set_xlabel("VADER compound score")
423
- ax.set_ylabel("Chunk count")
424
- fig_sent.tight_layout()
425
 
426
- fig_wc = None
427
- if make_wordcloud:
428
- fig_wc = make_wordcloud_figure(details[first]["raw_text_preview"])
 
429
 
430
- return df, str(csv_path), doc_names, report_md, fig_sent, fig_wc, details[first]["raw_text_preview"], "Done."
 
 
431
 
432
 
433
- def render_doc(doc_name, state, make_wordcloud):
434
- if not state or not doc_name or doc_name not in state:
435
- return "", None, None, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
 
437
- d = state[doc_name]
438
- report_md = d["report_md"]
439
- preview = d["raw_text_preview"]
440
 
441
- fig_sent = None
442
- scores = d.get("chunk_scores", [])
443
- if scores:
444
- fig_sent = plt.figure()
445
- ax = fig_sent.add_subplot(111)
446
- sns.histplot(scores, kde=True, ax=ax)
447
- ax.set_title(f"Chunk Sentiment Distribution: {doc_name}")
448
- ax.set_xlabel("VADER compound score")
449
- ax.set_ylabel("Chunk count")
450
- fig_sent.tight_layout()
451
 
452
- fig_wc = None
453
- if make_wordcloud:
454
- fig_wc = make_wordcloud_figure(preview)
 
 
 
 
 
 
 
 
 
455
 
456
- return report_md, fig_sent, fig_wc, preview
457
 
458
 
459
  # -----------------------------
460
  # Gradio UI
461
  # -----------------------------
462
- with gr.Blocks(title="Toxicology PDF NLP Analyzer") as demo:
463
- gr.Markdown("# Toxicology PDF NLP Analyzer")
464
-
465
- state = gr.State({})
466
 
467
- with gr.Tab("Batch (Upload PDFs)"):
468
  files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
469
 
 
 
 
 
 
 
 
470
  with gr.Row():
471
- top_k_keywords = gr.Slider(5, 50, value=20, step=1, label="Top keywords (TF-IDF)")
472
- summary_sentences = gr.Slider(2, 12, value=6, step=1, label="Summary sentences (TextRank)")
473
- with gr.Row():
474
- chunk_chars = gr.Slider(800, 8000, value=3000, step=100, label="Chunk size for sentiment (chars)")
475
  max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
476
- make_wordcloud = gr.Checkbox(label="Generate word cloud", value=True)
 
477
 
478
- run_btn = gr.Button("Analyze PDFs")
 
479
 
 
 
 
480
  status = gr.Textbox(label="Status", interactive=False)
481
 
482
- results_df = gr.Dataframe(label="Batch Results", interactive=False)
483
- results_csv = gr.File(label="Download: results CSV")
 
484
 
485
- with gr.Row():
486
- doc_selector = gr.Dropdown(label="Select a document for details", choices=[], value=None)
487
-
488
- report_md = gr.Markdown()
489
- sent_plot = gr.Plot(label="Sentiment Distribution (by chunk)")
490
- wc_plot = gr.Plot(label="Word Cloud")
491
- raw_preview = gr.Textbox(label="Extracted text preview (first ~6k chars)", lines=10)
492
-
493
- run_btn.click(
494
- fn=analyze_pdfs,
495
- inputs=[files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud],
496
- outputs=[results_df, results_csv, doc_selector, report_md, sent_plot, wc_plot, raw_preview, status]
497
- ).then(
498
- fn=lambda d: d, inputs=None, outputs=state
499
  )
500
 
501
- # Update details view on selection change
502
- doc_selector.change(
503
- fn=render_doc,
504
- inputs=[doc_selector, state, make_wordcloud],
505
- outputs=[report_md, sent_plot, wc_plot, raw_preview]
506
  )
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
  if __name__ == "__main__":
510
  port = int(os.environ.get("PORT", "7860"))
511
- demo.launch(server_name="0.0.0.0", server_port=port)
 
1
  import os
2
  import re
3
+ import json
4
  import math
5
  import tempfile
6
  from pathlib import Path
7
+ from typing import Dict, List, Tuple, Any
8
 
9
  import gradio as gr
10
  import numpy as np
11
  import pandas as pd
12
 
 
 
 
13
  from pypdf import PdfReader
 
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
 
16
+ from openai import OpenAI # OpenAI Responses API client
 
 
 
 
 
 
17
 
18
 
19
  # -----------------------------
20
+ # Defaults
21
  # -----------------------------
22
# Default controlled vocabulary, kept as a JSON string so it can be parsed with
# json.loads (run_extraction falls back to it when no vocab JSON is supplied).
DEFAULT_CONTROLLED_VOCAB_JSON = """{
  "risk_stance_enum": ["high_concern","moderate_concern","low_concern","inconclusive","not_assessed"],
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
  "endpoint_terms": ["hepatotoxicity","nephrotoxicity","neurotoxicity","immunotoxicity","reproductive_toxicity","developmental_toxicity","genotoxicity","carcinogenicity","endocrine_activity","respiratory_toxicity","dermal_toxicity","hematotoxicity","cytotoxicity","oxidative_stress","inflammation"],
  "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
  "risk_language_terms": ["adverse_effect","no_adverse_effect_observed","increased_risk","safe_at_tested_dose","insufficient_evidence","uncertainty_high"]
}"""

# Default field spec consumed by parse_field_spec: one "name | type | instructions"
# line per field. Lines starting with "#" are skipped by the parser. The header
# line lists exactly the three columns the parser reads; enum values belong
# inside the type column as enum[a,b,c], not in a separate fourth column.
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
# types: str, num, bool, list[str], list[num], enum[a,b,c]
Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
CAS_numbers | list[str] | Extract any CAS numbers mentioned.
Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
Key_endpoints | list[str] | Extract endpoints; prefer controlled vocab terms if applicable.
Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
Conclusion | str | What does the paper conclude about safety/risk?
"""
44
 
45
 
46
  # -----------------------------
47
+ # PDF extraction (page-aware)
48
  # -----------------------------
49
def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
    """
    Read a PDF and return its text page by page.

    Returns (pages, page_count): pages is a list of (1-based page number,
    stripped text) for every page that was read, and page_count is the total
    number of pages in the PDF. A max_pages of None or <= 0 reads all pages;
    otherwise at most int(max_pages) pages are read.
    """
    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)

    if max_pages is None or max_pages <= 0:
        limit = page_count
    else:
        limit = min(page_count, int(max_pages))

    pages: List[Tuple[int, str]] = []
    for page_number in range(1, limit + 1):
        try:
            text = reader.pages[page_number - 1].extract_text() or ""
        except Exception:
            # Best effort: a page that fails to extract is recorded as empty
            # text so the remaining pages are still processed.
            text = ""
        pages.append((page_number, text.strip()))
    return pages, page_count
63
 
 
64
 
65
def clean_text(t: str) -> str:
    """
    Normalize extracted text: NUL characters become spaces, every run of
    whitespace collapses to a single space, and the ends are trimmed.
    A falsy input (None or "") yields "".
    """
    if not t:
        return ""
    without_nuls = t.replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nuls).strip()
70
 
 
 
 
 
 
 
 
 
71
 
72
def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
    """
    Build chunks with page ranges, roughly target_chars each.

    pages is a list of (page_number, text) tuples. Each page's text is passed
    through clean_text; pages that are empty after cleaning are skipped.
    Whole pages are never split, so a single page longer than target_chars
    becomes its own oversized chunk.

    Returns a list of {"pages": "<first>-<last>", "text": <joined text>} dicts.
    The range names the first and last page whose text is actually in the
    chunk, so skipped (empty) pages are never attributed to a chunk.
    """
    chunks: List[Dict[str, Any]] = []
    buf: List[str] = []
    start_page = None
    last_page = None  # last page whose text is currently in buf
    cur_len = 0

    for pno, txt in pages:
        txt = clean_text(txt)
        if not txt:
            continue
        if start_page is None:
            start_page = pno

        # If adding this page would exceed the target size, flush what we have
        # and start a new chunk with this page.
        if cur_len + len(txt) + 1 > target_chars and buf:
            chunks.append(
                {"pages": f"{start_page}-{last_page}", "text": " ".join(buf)}
            )
            buf = [txt]
            start_page = pno
            cur_len = len(txt)
        else:
            buf.append(txt)
            cur_len += len(txt) + 1
        last_page = pno

    # Emit whatever is left; a non-empty buf implies start_page/last_page are set.
    if buf:
        chunks.append({"pages": f"{start_page}-{last_page}", "text": " ".join(buf)})

    return chunks
106
 
 
 
 
 
 
 
107
 
108
+ # -----------------------------
109
+ # Lightweight retrieval (TF-IDF) to select relevant excerpts
110
+ # -----------------------------
111
def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top_per_query: int = 2, max_chunks: int = 10) -> List[Dict[str, Any]]:
    """
    Pick the chunks most relevant to the given queries using TF-IDF similarity.

    For each non-blank query, up to top_per_query chunks with a strictly
    positive similarity are selected (best first, ties in document order).
    Chunks are returned in selection order without duplicates, capped at
    max_chunks. If no chunk matches any query, or the chunk texts yield no
    usable vocabulary, the first max_chunks chunks are returned instead.
    """
    texts = [c["text"] for c in chunks]
    if not texts:
        return []

    fallback = chunks[:max_chunks]

    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised when the texts contain only stop words / no tokens
        # ("empty vocabulary"); ranking is impossible, so use the fallback.
        return fallback

    selected_idx: List[int] = []
    seen = set()
    for q in queries:
        q = (q or "").strip()
        if not q:
            continue
        qv = vectorizer.transform([q])
        # TfidfVectorizer L2-normalizes rows by default, so this dot product
        # is the cosine similarity between each chunk and the query.
        sims = (X @ qv.T).toarray().ravel()
        ranked = np.argsort(-sims, kind="stable")
        for i in ranked[:top_per_query]:
            if sims[i] <= 0:
                # No shared terms with the query: the rest of the ranking is
                # arbitrary, so do not let it pull in unrelated chunks.
                break
            i = int(i)
            if i not in seen:
                seen.add(i)
                selected_idx.append(i)

    # fallback: if nothing selected, take first few chunks
    if not selected_idx:
        return fallback

    return [chunks[i] for i in selected_idx[:max_chunks]]
 
 
 
 
 
 
137
 
 
 
 
 
 
 
 
 
138
 
139
def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
    """
    Concatenate chunks into one prompt context of roughly max_chars characters.

    Each chunk is rendered as a "[pages <range>]" header line followed by its
    text. Chunks are added in order until the next one would exceed the
    budget. If even the first chunk is too large, it is truncated to max_chars
    rather than dropped, so a document with extractable text never produces
    an empty context.
    """
    parts = []
    total = 0
    for c in selected_chunks:
        block = f"[pages {c['pages']}]\n{c['text']}\n"
        if total + len(block) > max_chars:
            if not parts and max_chars > 0:
                # Keep a truncated first block instead of returning nothing.
                parts.append(block[:max_chars])
            break
        parts.append(block)
        total += len(block)
    return "\n".join(parts).strip()
149
 
150
+
151
+ # -----------------------------
152
+ # User-defined extraction spec -> JSON Schema
153
+ # -----------------------------
154
def slugify_field(name: str) -> str:
    """
    Turn a human-readable field name into a lowercase key.

    Characters other than word characters, whitespace and hyphens are removed,
    runs of whitespace/hyphens become a single underscore, and the result is
    lowercased and cut to 60 characters. An empty result becomes "field".
    """
    kept = re.sub(r"[^\w\s-]", "", name.strip())
    slug = re.sub(r"[\s-]+", "_", kept).lower()
    if not slug:
        return "field"
    return slug[:60]
159
+
160
+
161
def _schema_for_type(ftype: str) -> Dict[str, Any]:
    """Map a spec type token (str, num, bool, list[str], list[num], enum[...]) to a JSON Schema fragment."""
    kind = ftype.strip()
    lowered = kind.lower()  # type keywords are matched case-insensitively

    if lowered == "num":
        return {"type": "number"}
    if lowered == "bool":
        return {"type": "boolean"}
    if lowered.startswith("list[str]"):
        return {"type": "array", "items": {"type": "string"}}
    if lowered.startswith("list[num]"):
        return {"type": "array", "items": {"type": "number"}}
    if lowered.startswith("enum[") and lowered.endswith("]"):
        inside = kind[len("enum["):-1]
        # Enum values keep their original case; duplicates are dropped.
        vals = list(dict.fromkeys(v.strip() for v in inside.split(",") if v.strip()))
        if vals:
            return {"type": "string", "enum": vals}
        # "enum[]" would give an empty enum, which is not a valid schema;
        # treat it as a plain string instead.
    # "str" and any unrecognized type fall back to string.
    return {"type": "string"}


def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, str]]:
    """
    Parse a user-written field spec into JSON Schema pieces.

    spec lines: Field Name | type | instructions
    Blank lines, lines starting with "#", and lines without a "|" are ignored.
    A field name prefixed with "*" is marked as required. Field names are
    converted to keys with slugify_field; if two names produce the same key,
    the later line replaces the earlier one.

    Returns: properties dict (key -> schema fragment), required list,
    instructions map (field_key -> instruction text, "" when none was given).
    """
    props: Dict[str, Any] = {}
    required: List[str] = []
    instr: Dict[str, str] = {}

    for raw_line in (spec or "").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 2:
            continue

        field_name = parts[0]
        ftype = parts[1]
        finstr = parts[2] if len(parts) >= 3 else ""

        is_required = field_name.startswith("*")
        if is_required:
            field_name = field_name[1:].strip()

        key = slugify_field(field_name)
        instr[key] = finstr
        props[key] = _schema_for_type(ftype)

        # Guard against listing the same key twice when names collide.
        if is_required and key not in required:
            required.append(key)

    # If user didn't mark required fields, keep it permissive
    return props, required, instr
216
+
217
+
218
def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[str], vocab: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build the JSON Schema for one paper's structured extraction result.

    field_props maps field keys to schema fragments and becomes the
    properties of the nested "extracted" object. vocab may supply
    "risk_stance_enum"; when it is missing or empty a built-in list is used.

    The schema is sent with strict Structured Outputs, which requires every
    property of an object to appear in its "required" list. The "extracted"
    object therefore lists all field keys: those named in required_fields
    first, then the rest in field_props order. Names in required_fields that
    are not in field_props are ignored.
    """
    default_risk_enum = ["high_concern", "moderate_concern", "low_concern", "inconclusive", "not_assessed"]
    # "or" also covers an explicitly empty list, which would be an invalid enum.
    risk_enum = vocab.get("risk_stance_enum") or default_risk_enum

    explicitly_required = [k for k in required_fields if k in field_props]
    all_required = list(dict.fromkeys(explicitly_required + list(field_props)))

    schema = {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "paper_title": {"type": "string"},
            "risk_stance": {"type": "string", "enum": risk_enum},
            "risk_confidence": {"type": "number", "minimum": 0, "maximum": 1},
            "risk_summary": {"type": "string"},
            "extracted": {
                "type": "object",
                "additionalProperties": False,
                "properties": field_props,
                "required": all_required
            },
            "evidence": {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "field": {"type": "string"},
                        "quote": {"type": "string"},
                        "pages": {"type": "string"}
                    },
                    "required": ["field", "quote", "pages"]
                }
            }
        },
        "required": ["paper_title", "risk_stance", "risk_confidence", "risk_summary", "extracted", "evidence"]
    }
    return schema
252
 
253
 
254
  # -----------------------------
255
+ # OpenAI call (Responses API + Structured Outputs)
256
  # -----------------------------
257
def get_openai_client(api_key: str) -> OpenAI:
    """
    Create an OpenAI client.

    The key passed in is used when it is non-blank; otherwise the
    OPENAI_API_KEY environment variable is used. Raises ValueError when
    neither provides a key.
    """
    key = (api_key or "").strip()
    if not key:
        key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY.")
    return OpenAI(api_key=key)
262
+
263
+
264
def openai_structured_extract(
    client: OpenAI,
    model: str,
    schema: Dict[str, Any],
    controlled_vocab: Dict[str, Any],
    field_instructions: Dict[str, str],
    context: str
) -> Dict[str, Any]:
    """
    Ask the model to extract structured data from the given excerpts.

    Builds a system message with the extraction rules and a user message
    containing the controlled vocabulary (as JSON), one instruction line per
    field, and the excerpts. The request goes through client.responses.create
    with a strict json_schema text format, and the response's output_text is
    parsed with json.loads and returned.
    """
    # One bullet per field; fields without instructions get a placeholder.
    instruction_lines = [
        f"- {key}: {text}" if text else f"- {key}: (no extra instructions)"
        for key, text in field_instructions.items()
    ]

    system_msg = "".join([
        "You are a toxicology research paper data-extraction assistant.\n",
        "Rules:\n",
        "1) Use ONLY the provided excerpts; do not invent details.\n",
        "2) If a value is not stated, use an empty string, empty list, or 'not_reported' if the enum allows it.\n",
        "3) Always include evidence quotes with page ranges (from excerpt headers).\n",
        "4) risk_stance reflects overall concern from the paper's findings (high/moderate/low/inconclusive/not_assessed).\n",
        "5) Prefer controlled vocabulary terms when applicable.\n",
    ])

    user_msg = "".join([
        "CONTROLLED VOCAB (JSON):\n",
        json.dumps(controlled_vocab, indent=2),
        "\n\n",
        "FIELD INSTRUCTIONS:\n",
        "\n".join(instruction_lines),
        "\n\n",
        "EXCERPTS:\n",
        f"{context}\n",
    ])

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    output_format = {
        "type": "json_schema",
        "name": "tox_extraction",
        "schema": schema,
        "strict": True,
    }

    response = client.responses.create(
        model=model,
        input=messages,
        text={"format": output_format},
    )

    # With a json_schema format, the JSON document is the response's output_text.
    return json.loads(response.output_text)
322
 
323
+
324
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
    """
    Ask the model for a free-text synthesis across several papers.

    rows (the per-paper extraction results) are serialized as indented JSON
    and sent after a system message describing the desired synthesis. Returns
    the response's output_text.
    """
    system_msg = "".join([
        "You are a senior toxicology scientist summarizing multiple papers.\n",
        "Produce a concise synthesis for researchers: consensus, disagreements, data gaps, and next steps.\n",
        "Base your synthesis strictly on the provided extracted JSON (which itself is evidence-backed).\n",
    ])
    rows_json = json.dumps(rows, indent=2)

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": f"EXTRACTED_ROWS_JSON:\n{rows_json}"},
    ]
    response = client.responses.create(model=model, input=messages)
    return response.output_text
340
+
341
+
342
def openai_suggest_vocab_additions(client: OpenAI, model: str, current_vocab: Dict[str, Any], context: str) -> Dict[str, Any]:
    """
    Ask the model to propose new controlled-vocabulary terms from excerpts.

    Returns {"additions": {vocab_key: [terms, ...]}, "notes": str}.

    Strict Structured Outputs needs "additionalProperties": false on every
    object, so an open-ended key -> terms map cannot be requested directly.
    The model is instead asked for a list of {vocab_key, terms} entries,
    which is regrouped here into the map callers expect (entries with the
    same key are merged and repeated terms dropped).
    """
    schema = {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "additions": {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "vocab_key": {"type": "string"},
                        "terms": {"type": "array", "items": {"type": "string"}}
                    },
                    "required": ["vocab_key", "terms"]
                }
            },
            "notes": {"type": "string"}
        },
        "required": ["additions", "notes"]
    }

    system_msg = (
        "You propose controlled-vocabulary additions for toxicology paper extraction.\n"
        "Return only new candidate terms grouped under keys that already exist or new keys if needed.\n"
        "Avoid duplicates already in current vocab.\n"
    )
    user_msg = (
        "CURRENT_VOCAB_JSON:\n"
        + json.dumps(current_vocab, indent=2)
        + "\n\n"
        "EXCERPTS:\n"
        + context
    )

    resp = client.responses.create(
        model=model,
        input=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "vocab_additions",
                "schema": schema,
                "strict": True
            }
        }
    )
    payload = json.loads(resp.output_text)

    # Regroup the list of entries into {vocab_key: [terms]}.
    grouped: Dict[str, List[str]] = {}
    for entry in payload.get("additions", []):
        terms = grouped.setdefault(entry["vocab_key"], [])
        for term in entry["terms"]:
            if term not in terms:
                terms.append(term)

    return {"additions": grouped, "notes": payload.get("notes", "")}
388
+
389
+
390
+ # -----------------------------
391
+ # Gradio handlers
392
+ # -----------------------------
393
def _blank_extraction(field_props, summary):
    """Build a placeholder record for a paper that yielded no model output.

    Array-typed fields (``"type" == "array"`` in *field_props*) default to an
    empty list, every other field to an empty string.
    """
    return {
        "paper_title": "",
        "risk_stance": "not_assessed",
        "risk_confidence": 0.0,
        "risk_summary": summary,
        "extracted": {
            name: ([] if spec.get("type") == "array" else "")
            for name, spec in field_props.items()
        },
        "evidence": [],
    }


def _table_row(filename, extracted, field_props):
    """Flatten one extraction record into a single table row (dict)."""
    row = {
        "file": filename,
        "paper_title": extracted.get("paper_title", ""),
        "risk_stance": extracted.get("risk_stance", ""),
        "risk_confidence": extracted.get("risk_confidence", ""),
        "risk_summary": extracted.get("risk_summary", ""),
    }
    values = extracted.get("extracted") or {}
    for name in field_props:
        value = values.get(name, "")
        # List-valued fields are joined so each paper stays on one CSV row.
        row[name] = "; ".join(str(x) for x in value) if isinstance(value, list) else value
    return row


def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars):
    """Extract structured fields from each uploaded PDF into a table.

    Args:
        files: Uploaded PDFs; each item is either a path string or an object
            whose ``.name`` attribute is the path on disk.
        api_key: Key handed to ``get_openai_client``.
        model: Model identifier forwarded to ``openai_structured_extract``.
        field_spec: Field specification text; ``DEFAULT_FIELD_SPEC`` is used
            when empty.
        vocab_json: Controlled vocabulary as JSON text;
            ``DEFAULT_CONTROLLED_VOCAB_JSON`` is used when empty.
        max_pages: Page limit forwarded to ``extract_pages_from_pdf``.
        chunk_chars: Target chunk size forwarded to ``chunk_pages``.
        max_context_chars: Character budget forwarded to ``build_context``.

    Returns:
        ``(dataframe, csv_path, json_path, status)``. On a setup problem
        (no files, invalid vocab JSON, empty field spec, client error) the
        first three items are ``None`` and ``status`` explains why.

    A failure while processing one paper (unreadable PDF, failed model call)
    no longer aborts the batch: that paper gets a placeholder row carrying the
    error text, and the status message lists the affected files.
    """
    if not files:
        return None, None, None, "Upload one or more PDFs."

    try:
        vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
    except (TypeError, ValueError) as e:
        return None, None, None, f"Controlled vocab JSON is invalid: {e}"

    field_props, required_fields, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
    if not field_props:
        return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions"

    schema = build_extraction_schema(field_props, required_fields, vocab)

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return None, None, None, str(e)

    # Retrieval queries depend only on the field spec, so build them once:
    # one risk-stance query plus one per field (its instruction, or its name
    # when no instruction was given).
    base_queries = [
        "risk stance hazard risk conclusion adverse effect noael loael bmd bmdl ld50 lc50 safety concern",
    ]
    base_queries.extend(ins or name for name, ins in field_instr.items())

    page_limit = int(max_pages)
    chunk_size = int(chunk_chars)
    context_limit = int(max_context_chars)

    results = []
    flat_rows = []
    failed = []

    tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))

    for f in files:
        pdf_path = f if isinstance(f, str) else f.name
        filename = os.path.basename(pdf_path)
        page_count = 0

        try:
            pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=page_limit)
            chunks = chunk_pages(pages, target_chars=chunk_size)
            # Pass a copy so every file starts from the same query list.
            selected = select_relevant_chunks(chunks, list(base_queries), top_per_query=2, max_chunks=12)
            context = build_context(selected, max_chars=context_limit)

            if context.strip():
                extracted = openai_structured_extract(
                    client=client,
                    model=model,
                    schema=schema,
                    controlled_vocab=vocab,
                    field_instructions=field_instr,
                    context=context,
                )
            else:
                # Nothing extractable (scanned or empty PDF).
                extracted = _blank_extraction(field_props, "No text extracted from PDF (may be scanned).")
        except Exception as e:
            # UI boundary: keep the results already gathered for other papers
            # and surface this paper's error in its own row.
            failed.append(filename)
            extracted = _blank_extraction(field_props, f"Extraction failed: {e}")
            extracted["_error"] = str(e)

        extracted["_file"] = filename
        extracted["_pages_in_pdf"] = page_count
        results.append(extracted)
        flat_rows.append(_table_row(filename, extracted, field_props))

    df = pd.DataFrame(flat_rows)

    csv_path = tmpdir / "extraction_table.csv"
    json_path = tmpdir / "extraction_details.json"
    df.to_csv(csv_path, index=False)
    json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")

    status = "Done. Download the CSV table (productivity output) and JSON details (evidence + structure)."
    if failed:
        status += f" {len(failed)} file(s) failed and are marked in the table: {', '.join(failed)}."
    return df, str(csv_path), str(json_path), status
 
487
 
 
 
488
 
489
def run_synthesis(api_key, model, extraction_json_file):
    """Synthesize findings across papers from an uploaded extraction JSON.

    Args:
        api_key: Key handed to ``get_openai_client``.
        model: Model identifier forwarded to ``openai_synthesize_across_papers``.
        extraction_json_file: The uploaded ``extraction_details.json``; either
            a path (``str`` / ``os.PathLike``) or an object whose ``.name``
            attribute is the path on disk. ``None`` when nothing was uploaded.

    Returns:
        The text produced by ``openai_synthesize_across_papers``, or a
        human-readable message when the upload is missing or unreadable, the
        client cannot be created, or the synthesis call fails.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json first (from the extraction step)."

    source = extraction_json_file
    if not isinstance(source, (str, os.PathLike)):
        source = source.name

    # Validate the upload before building a client, so a bad file is reported
    # as a message instead of surfacing as an unhandled exception.
    try:
        rows = json.loads(Path(source).read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return f"Could not read extraction_details.json: {e}"

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)

    try:
        return openai_synthesize_across_papers(client, model, rows)
    except Exception as e:
        # UI boundary: report the failure in the output pane.
        return f"Synthesis failed: {e}"
501
 
502
 
503
def suggest_vocab(api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars):
    """Propose controlled-vocabulary additions from the uploaded PDFs.

    Only the first two files are sampled to build the context sent to the
    model.

    Args:
        api_key: Key handed to ``get_openai_client``.
        model: Model identifier forwarded to ``openai_suggest_vocab_additions``.
        vocab_json: Current vocabulary as JSON text;
            ``DEFAULT_CONTROLLED_VOCAB_JSON`` is used when empty.
        files: Uploaded PDFs; each item is either a path string or an object
            whose ``.name`` attribute is the path on disk.
        max_pages: Page limit forwarded to ``extract_pages_from_pdf``.
        chunk_chars: Target chunk size forwarded to ``chunk_pages``.
        max_context_chars: Character budget per document and for the combined
            context.

    Returns:
        ``(vocab_json_text, status)``. On any problem the incoming
        ``vocab_json`` is returned unchanged together with an explanation.
    """
    if not files:
        return vocab_json, "Upload PDFs so I can propose vocab additions from their content."

    # Validate the vocabulary before creating a client (same order as
    # run_extraction), so bad input is reported without needing an API key.
    try:
        vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
    except (TypeError, ValueError) as e:
        return vocab_json, f"Controlled vocab JSON is invalid: {e}"
    if not isinstance(vocab, dict):
        return vocab_json, "Controlled vocab JSON is invalid: expected a JSON object mapping categories to term lists."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return vocab_json, str(e)

    context_limit = int(max_context_chars)

    try:
        # Build a small context from the first 1-2 docs.
        contexts = []
        for f in files[:2]:
            pdf_path = f if isinstance(f, str) else f.name
            pages, _ = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
            chunks = chunk_pages(pages, target_chars=int(chunk_chars))
            selected = select_relevant_chunks(
                chunks,
                queries=["toxicology endpoints noael loael bmd genotoxicity carcinogenicity endocrine exposure route species"],
                top_per_query=2,
                max_chunks=8,
            )
            ctx = build_context(selected, max_chars=context_limit)
            if ctx:
                contexts.append(ctx)

        combined = "\n\n---\n\n".join(contexts)[:context_limit]
        if not combined.strip():
            # Nothing extractable (scanned or empty PDFs): skip the model call.
            return vocab_json, "No text extracted from the uploaded PDFs (may be scanned); no vocab additions proposed."

        additions = openai_suggest_vocab_additions(client, model, vocab, combined)
    except Exception as e:
        # UI boundary: report the failure and leave the vocabulary untouched.
        return vocab_json, f"Vocab suggestion failed: {e}"

    # Merge additions (simple): append unseen terms to list-valued categories,
    # creating categories that do not exist yet.
    merged = {k: (list(v) if isinstance(v, list) else v) for k, v in vocab.items()}
    add_obj = additions.get("additions", {}) if isinstance(additions, dict) else {}
    if not isinstance(add_obj, dict):
        add_obj = {}
    for category, terms in add_obj.items():
        if not isinstance(terms, list):
            continue
        bucket = merged.setdefault(category, [])
        if isinstance(bucket, list):
            for term in terms:
                if term not in bucket:
                    bucket.append(term)

    return json.dumps(merged, indent=2), "Vocab updated with suggested additions. Review/edit before extracting."
550
 
551
 
552
  # -----------------------------
553
  # Gradio UI
554
  # -----------------------------
555
# Gradio UI definition: builds the `demo` Blocks app with two tabs
# ("Extract to Table" and "Cross-paper Synthesis") and wires each button to
# its handler (suggest_vocab, run_extraction, run_synthesis).
# NOTE(review): indentation is stripped in this rendering, so which components
# sit inside each gr.Row() cannot be determined from here; confirm the nesting
# against app.py before restructuring the layout.
+ with gr.Blocks(title="Toxicology PDF Table Extractor (GPT-4o)") as demo:
556
+ gr.Markdown("# Toxicology PDF Table Extractor (GPT-4o)")
 
 
557
 
558
+ with gr.Tab("Extract to Table"):
559
  files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
560
 
561
+ api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
562
+ model = gr.Dropdown(
563
+ label="Model",
564
+ choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
565
+ value="gpt-4o-2024-08-06"
566
+ )
567
+
568
  with gr.Row():
 
 
 
 
569
  max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
570
+ chunk_chars = gr.Slider(1200, 8000, value=3000, step=100, label="Chunk size (chars)")
571
+ max_context_chars = gr.Slider(5000, 40000, value=20000, step=1000, label="Max context sent to GPT (chars)")
572
 
573
+ vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=12)
574
+ field_spec = gr.Textbox(label="Extraction spec (you control what fields to extract)", value=DEFAULT_FIELD_SPEC, lines=10)
575
 
576
+ with gr.Row():
577
+ vocab_btn = gr.Button("Suggest vocab additions from PDFs")
578
+ extract_btn = gr.Button("Run Extraction (Table)")
579
  status = gr.Textbox(label="Status", interactive=False)
580
 
581
+ table = gr.Dataframe(label="Extracted Table (one row per paper)", interactive=False)
582
+ out_csv = gr.File(label="Download: extraction_table.csv")
583
+ out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
584
 
# The vocab button writes its result back into the vocab_json textbox and
# reports progress in the shared status box.
+ vocab_btn.click(
586
+ fn=suggest_vocab,
587
+ inputs=[api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars],
588
+ outputs=[vocab_json, status]
 
 
 
 
 
 
 
 
 
 
589
  )
590
 
# Extraction fills the table, the two download slots (CSV, JSON) and the
# status box; the input order matches run_extraction's parameter order.
+ extract_btn.click(
592
+ fn=run_extraction,
593
+ inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
594
+ outputs=[table, out_csv, out_json, status]
 
595
  )
596
 
# Second tab: it declares its own API-key and model widgets (api_key2, model2)
# rather than reusing the ones from the first tab.
+ with gr.Tab("Cross-paper Synthesis"):
598
+ gr.Markdown("Upload the `extraction_details.json` produced by the Extract tab, then synthesize across papers.")
599
+ api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
600
+ model2 = gr.Dropdown(
601
+ label="Model",
602
+ choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
603
+ value="gpt-4o-2024-08-06"
604
+ )
605
+ extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
606
+ synth_btn = gr.Button("Synthesize Across Papers")
607
+ synth_md = gr.Markdown()
608
+
609
+ synth_btn.click(
610
+ fn=run_synthesis,
611
+ inputs=[api_key2, model2, extraction_json_file],
612
+ outputs=[synth_md]
613
+ )
614
 
# Script entry point: read the port from $PORT (default 7860), enable the
# request queue, and bind to 0.0.0.0.
615
  if __name__ == "__main__":
616
  port = int(os.environ.get("PORT", "7860"))
617
+ demo.queue().launch(server_name="0.0.0.0", server_port=port)