hchevva committed on
Commit
79b39e6
·
verified ·
1 Parent(s): 5b07a9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -243
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import re
3
  import json
4
- import math
5
  import tempfile
6
  from pathlib import Path
7
  from typing import Dict, List, Tuple, Any
@@ -13,39 +12,52 @@ import pandas as pd
13
  from pypdf import PdfReader
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
 
16
- from openai import OpenAI # OpenAI Responses API client
17
 
18
 
19
- # -----------------------------
20
  # Defaults
21
- # -----------------------------
22
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
23
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
 
24
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
25
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
26
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
27
- "endpoint_terms": ["hepatotoxicity","nephrotoxicity","neurotoxicity","immunotoxicity","reproductive_toxicity","developmental_toxicity","genotoxicity","carcinogenicity","endocrine_activity","respiratory_toxicity","dermal_toxicity","hematotoxicity","cytotoxicity","oxidative_stress","inflammation"],
28
- "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
29
- "risk_language_terms": ["adverse_effect","no_adverse_effect_observed","increased_risk","safe_at_tested_dose","insufficient_evidence","uncertainty_high"]
 
 
 
 
 
 
 
 
 
 
 
30
  }"""
31
 
32
- DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions | optional: enum values
33
  # types: str, num, bool, list[str], list[num], enum[a,b,c]
34
  Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
35
  CAS_numbers | list[str] | Extract any CAS numbers mentioned.
36
  Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
37
  Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
38
  Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
39
- Key_endpoints | list[str] | Extract endpoints; prefer controlled vocab terms if applicable.
40
- Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
41
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
 
42
  Conclusion | str | What does the paper conclude about safety/risk?
43
  """
44
 
45
 
46
- # -----------------------------
47
- # PDF extraction (page-aware)
48
- # -----------------------------
49
  def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
50
  reader = PdfReader(pdf_path)
51
  page_count = len(reader.pages)
@@ -57,8 +69,7 @@ def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tupl
57
  t = reader.pages[i].extract_text() or ""
58
  except Exception:
59
  t = ""
60
- t = (t or "").strip()
61
- pages.append((i + 1, t))
62
  return pages, page_count
63
 
64
 
@@ -70,9 +81,6 @@ def clean_text(t: str) -> str:
70
 
71
 
72
  def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
73
- """
74
- Build chunks with page ranges, roughly target_chars each.
75
- """
76
  chunks = []
77
  buf = []
78
  start_page = None
@@ -85,12 +93,10 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
85
  if start_page is None:
86
  start_page = pno
87
 
88
- # If adding this page exceeds chunk size, flush
89
  if cur_len + len(txt) + 1 > target_chars and buf:
90
- end_page = (pno - 1) if (pno - 1) >= start_page else start_page
91
- chunks.append(
92
- {"pages": f"{start_page}-{end_page}", "text": " ".join(buf)}
93
- )
94
  buf = [txt]
95
  start_page = pno
96
  cur_len = len(txt)
@@ -99,16 +105,21 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
99
  cur_len += len(txt) + 1
100
 
101
  if buf and start_page is not None:
102
- end_page = pages[-1][0]
103
  chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
104
 
105
  return chunks
106
 
107
 
108
- # -----------------------------
109
  # Lightweight retrieval (TF-IDF) to select relevant excerpts
110
- # -----------------------------
111
- def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top_per_query: int = 2, max_chunks: int = 10) -> List[Dict[str, Any]]:
 
 
 
 
 
112
  texts = [c["text"] for c in chunks]
113
  if not texts:
114
  return []
@@ -116,24 +127,22 @@ def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top
116
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
117
  X = vectorizer.fit_transform(texts)
118
 
119
- selected_idx = []
120
  for q in queries:
121
  q = (q or "").strip()
122
  if not q:
123
  continue
124
  qv = vectorizer.transform([q])
125
- sims = (X @ qv.T).toarray().ravel() # cosine-like (not normalized), good enough for ranking
126
  idx = np.argsort(sims)[::-1]
127
  for i in idx[:top_per_query]:
128
  if i not in selected_idx:
129
  selected_idx.append(i)
130
 
131
- # fallback: if nothing selected, take first few chunks
132
  if not selected_idx:
133
  selected_idx = list(range(min(len(chunks), max_chunks)))
134
 
135
- selected = [chunks[i] for i in selected_idx[:max_chunks]]
136
- return selected
137
 
138
 
139
  def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
@@ -148,9 +157,9 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
148
  return "\n".join(parts).strip()
149
 
150
 
151
- # -----------------------------
152
  # User-defined extraction spec -> JSON Schema
153
- # -----------------------------
154
  def slugify_field(name: str) -> str:
155
  name = name.strip()
156
  name = re.sub(r"[^\w\s-]", "", name)
@@ -158,14 +167,13 @@ def slugify_field(name: str) -> str:
158
  return name[:60] if name else "field"
159
 
160
 
161
- def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, str]]:
162
  """
163
  spec lines: Field Name | type | instructions
164
- Returns: properties dict, required list, instructions map (field_key -> instruction)
165
  """
166
- props = {}
167
- required = []
168
- instr = {}
169
 
170
  for raw_line in (spec or "").splitlines():
171
  line = raw_line.strip()
@@ -180,15 +188,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
180
  ftype = parts[1]
181
  finstr = parts[2] if len(parts) >= 3 else ""
182
 
183
- is_required = False
184
- if field_name.startswith("*"):
185
- is_required = True
186
- field_name = field_name[1:].strip()
187
-
188
  key = slugify_field(field_name)
189
  instr[key] = finstr
190
 
191
- schema = {"type": "string"}
192
 
193
  if ftype == "str":
194
  schema = {"type": "string"}
@@ -208,20 +211,20 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
208
  schema = {"type": "string"}
209
 
210
  props[key] = schema
211
- if is_required:
212
- required.append(key)
213
 
214
- # If user didn’t mark required fields, keep it permissive
215
- return props, required, instr
216
 
217
 
218
- def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[str], vocab: Dict[str, Any]) -> Dict[str, Any]:
 
 
 
 
219
  risk_enum = vocab.get(
220
  "risk_stance_enum",
221
  ["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
222
  )
223
 
224
- # IMPORTANT: strict schema requires required == all property keys
225
  all_field_keys = list(field_props.keys())
226
 
227
  schema = {
@@ -236,7 +239,7 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
236
  "type": "object",
237
  "additionalProperties": False,
238
  "properties": field_props,
239
- "required": all_field_keys # <-- FIX
240
  },
241
  "evidence": {
242
  "type": "array",
@@ -257,13 +260,13 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
257
  return schema
258
 
259
 
260
- # -----------------------------
261
- # OpenAI call (Responses API + Structured Outputs)
262
- # -----------------------------
263
  def get_openai_client(api_key: str) -> OpenAI:
264
  key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
265
  if not key:
266
- raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY.")
267
  return OpenAI(api_key=key)
268
 
269
 
@@ -275,25 +278,20 @@ def openai_structured_extract(
275
  field_instructions: Dict[str, str],
276
  context: str
277
  ) -> Dict[str, Any]:
278
-
279
- # Build instruction text for the model
280
  field_instr_lines = []
281
  for k, v in field_instructions.items():
282
- if v:
283
- field_instr_lines.append(f"- {k}: {v}")
284
- else:
285
- field_instr_lines.append(f"- {k}: (no extra instructions)")
286
 
287
  vocab_text = json.dumps(controlled_vocab, indent=2)
288
 
289
  system_msg = (
290
  "You are a toxicology research paper data-extraction assistant.\n"
291
- "Rules:\n"
292
- "1) Use ONLY the provided excerpts; do not invent details.\n"
293
- "2) If a value is not stated, use an empty string, empty list, or 'not_reported' if the enum allows it.\n"
294
- "3) Always include evidence quotes with page ranges (from excerpt headers).\n"
295
- "4) risk_stance reflects overall concern from the paper's findings (high/moderate/low/inconclusive/not_assessed).\n"
296
- "5) Prefer controlled vocabulary terms when applicable.\n"
297
  )
298
 
299
  user_msg = (
@@ -302,7 +300,7 @@ def openai_structured_extract(
302
  "FIELD INSTRUCTIONS:\n"
303
  + "\n".join(field_instr_lines)
304
  + "\n\n"
305
- "EXCERPTS:\n"
306
  f"{context}\n"
307
  )
308
 
@@ -321,103 +319,104 @@ def openai_structured_extract(
321
  }
322
  }
323
  )
324
-
325
- # Structured outputs: JSON is in output_text
326
- out = resp.output_text
327
- return json.loads(out)
328
 
329
 
330
  def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
331
  system_msg = (
332
  "You are a senior toxicology scientist summarizing multiple papers.\n"
333
- "Produce a concise synthesis for researchers: consensus, disagreements, data gaps, and next steps.\n"
334
- "Base your synthesis strictly on the provided extracted JSON (which itself is evidence-backed).\n"
335
  )
336
  user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
337
 
338
- resp = client.responses.create(
339
- model=model,
340
- input=[
341
- {"role": "system", "content": system_msg},
342
- {"role": "user", "content": user_msg}
343
- ]
344
- )
345
- return resp.output_text
346
-
347
-
348
- def openai_suggest_vocab_additions(client: OpenAI, model: str, current_vocab: Dict[str, Any], context: str) -> Dict[str, Any]:
349
- schema = {
350
- "type": "object",
351
- "additionalProperties": False,
352
- "properties": {
353
- "additions": {
354
- "type": "object",
355
- "additionalProperties": {
356
- "type": "array",
357
- "items": {"type": "string"}
358
- }
359
- },
360
- "notes": {"type": "string"}
361
- },
362
- "required": ["additions", "notes"]
363
- }
364
-
365
- system_msg = (
366
- "You propose controlled-vocabulary additions for toxicology paper extraction.\n"
367
- "Return only new candidate terms grouped under keys that already exist or new keys if needed.\n"
368
- "Avoid duplicates already in current vocab.\n"
369
- )
370
- user_msg = (
371
- "CURRENT_VOCAB_JSON:\n"
372
- + json.dumps(current_vocab, indent=2)
373
- + "\n\n"
374
- "EXCERPTS:\n"
375
- + context
376
- )
377
-
378
  resp = client.responses.create(
379
  model=model,
380
  input=[
381
  {"role": "system", "content": system_msg},
382
  {"role": "user", "content": user_msg}
383
  ],
384
- text={
385
- "format": {
386
- "type": "json_schema",
387
- "name": "vocab_additions",
388
- "schema": schema,
389
- "strict": True
390
- }
391
- }
392
  )
393
- return json.loads(resp.output_text)
394
 
395
 
396
- # -----------------------------
397
- # Gradio handlers
398
- # -----------------------------
399
- def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  if not files:
401
- return None, None, None, "Upload one or more PDFs."
402
 
 
403
  try:
404
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
405
  except Exception as e:
406
- return None, None, None, f"Controlled vocab JSON is invalid: {e}"
407
 
408
- field_props, required_fields, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
 
409
  if not field_props:
410
- return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions"
411
 
412
- schema = build_extraction_schema(field_props, required_fields, vocab)
413
 
 
414
  try:
415
  client = get_openai_client(api_key)
416
  except Exception as e:
417
- return None, None, None, str(e)
418
 
419
- results = []
420
- flat_rows = []
421
 
422
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
423
 
@@ -426,46 +425,51 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
426
  filename = os.path.basename(pdf_path)
427
 
428
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
430
 
431
- # Build queries: risk stance + each field instruction
432
- queries = [
433
- "risk stance hazard risk conclusion adverse effect noael loael bmd bmdl ld50 lc50 safety concern",
434
- ]
435
  for k, ins in field_instr.items():
436
- if ins:
437
- queries.append(ins)
438
- else:
439
- queries.append(k)
440
 
441
  selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
442
  context = build_context(selected, max_chars=int(max_context_chars))
443
 
444
- if not context.strip():
445
- # nothing extractable (scanned or empty)
446
- extracted = {
447
- "paper_title": "",
448
- "risk_stance": "not_assessed",
449
- "risk_confidence": 0.0,
450
- "risk_summary": "No text extracted from PDF (may be scanned).",
451
- "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
452
- "evidence": []
453
- }
454
- else:
455
- extracted = openai_structured_extract(
456
- client=client,
457
- model=model,
458
- schema=schema,
459
- controlled_vocab=vocab,
460
- field_instructions=field_instr,
461
- context=context
462
- )
463
 
464
  extracted["_file"] = filename
465
  extracted["_pages_in_pdf"] = page_count
466
  results.append(extracted)
467
 
468
- # Flatten to table row
469
  row = {
470
  "file": filename,
471
  "paper_title": extracted.get("paper_title", ""),
@@ -473,12 +477,15 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
473
  "risk_confidence": extracted.get("risk_confidence", ""),
474
  "risk_summary": extracted.get("risk_summary", "")
475
  }
 
 
476
  for k in field_props.keys():
477
- v = (extracted.get("extracted") or {}).get(k, "")
478
  if isinstance(v, list):
479
  row[k] = "; ".join([str(x) for x in v])
480
  else:
481
  row[k] = v
 
482
  flat_rows.append(row)
483
 
484
  df = pd.DataFrame(flat_rows)
@@ -488,120 +495,208 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
488
  df.to_csv(csv_path, index=False)
489
  json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
490
 
491
- status = "Done. Download the CSV table (productivity output) and JSON details (evidence + structure)."
492
- return df, str(csv_path), str(json_path), status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
494
 
495
- def run_synthesis(api_key, model, extraction_json_file):
496
- if extraction_json_file is None:
497
- return "Upload the extraction_details.json first (from the extraction step)."
 
 
498
 
499
- try:
500
- client = get_openai_client(api_key)
501
- except Exception as e:
502
- return str(e)
503
 
504
- rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
505
- md = openai_synthesize_across_papers(client, model, rows)
506
- return md
507
 
508
 
509
- def suggest_vocab(api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars):
510
- if not files:
511
- return vocab_json, "Upload PDFs so I can propose vocab additions from their content."
 
 
 
 
512
 
 
513
  try:
514
- client = get_openai_client(api_key)
515
- except Exception as e:
516
- return vocab_json, str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
- try:
519
- vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
520
- except Exception as e:
521
- return vocab_json, f"Controlled vocab JSON is invalid: {e}"
522
 
523
- # Build a small context from the first 1-2 docs
524
- contexts = []
525
- for f in files[:2]:
526
- pages, _ = extract_pages_from_pdf(f.name, max_pages=int(max_pages))
527
- chunks = chunk_pages(pages, target_chars=int(chunk_chars))
528
- selected = select_relevant_chunks(
529
- chunks,
530
- queries=["toxicology endpoints noael loael bmd genotoxicity carcinogenicity endocrine exposure route species"],
531
- top_per_query=2,
532
- max_chunks=8
533
- )
534
- ctx = build_context(selected, max_chars=int(max_context_chars))
535
- if ctx:
536
- contexts.append(ctx)
537
 
538
- combined = "\n\n---\n\n".join(contexts)[:int(max_context_chars)]
 
 
 
 
 
 
539
 
540
- additions = openai_suggest_vocab_additions(client, model, vocab, combined)
541
 
542
- # Merge additions (simple)
543
- merged = dict(vocab)
544
- add_obj = additions.get("additions", {})
545
- for k, arr in add_obj.items():
546
- if not isinstance(arr, list):
547
- continue
548
- if k not in merged:
549
- merged[k] = []
550
- if isinstance(merged[k], list):
551
- for term in arr:
552
- if term not in merged[k]:
553
- merged[k].append(term)
554
 
555
- return json.dumps(merged, indent=2), "Vocab updated with suggested additions. Review/edit before extracting."
 
 
 
 
 
 
556
 
557
 
558
- # -----------------------------
559
  # Gradio UI
560
- # -----------------------------
561
- with gr.Blocks(title="Toxicology PDF → Table Extractor (GPT-4o)") as demo:
562
- gr.Markdown("# Toxicology PDF → Table Extractor (GPT-4o)")
 
 
 
 
 
 
 
 
563
 
564
  with gr.Tab("Extract to Table"):
565
  files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
566
 
567
- api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
568
- model = gr.Dropdown(
569
- label="Model",
570
- choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
571
- value="gpt-4o-2024-08-06"
572
- )
 
573
 
574
  with gr.Row():
575
- max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
576
- chunk_chars = gr.Slider(1200, 8000, value=3000, step=100, label="Chunk size (chars)")
577
- max_context_chars = gr.Slider(5000, 40000, value=20000, step=1000, label="Max context sent to GPT (chars)")
578
 
579
- vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=12)
580
- field_spec = gr.Textbox(label="Extraction spec (you control what fields to extract)", value=DEFAULT_FIELD_SPEC, lines=10)
581
 
582
- with gr.Row():
583
- vocab_btn = gr.Button("Suggest vocab additions from PDFs")
584
- extract_btn = gr.Button("Run Extraction (Table)")
585
  status = gr.Textbox(label="Status", interactive=False)
586
 
587
- table = gr.Dataframe(label="Extracted Table (one row per paper)", interactive=False)
588
- out_csv = gr.File(label="Download: extraction_table.csv")
589
- out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
 
 
 
 
 
 
 
590
 
591
- vocab_btn.click(
592
- fn=suggest_vocab,
593
- inputs=[api_key, model, vocab_json, files, max_pages, chunk_chars, max_context_chars],
594
- outputs=[vocab_json, status]
 
 
 
 
 
 
 
 
 
 
 
 
595
  )
 
596
 
 
 
 
597
  extract_btn.click(
598
  fn=run_extraction,
599
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
600
- outputs=[table, out_csv, out_json, status]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  )
602
 
603
  with gr.Tab("Cross-paper Synthesis"):
604
- gr.Markdown("Upload the `extraction_details.json` produced by the Extract tab, then synthesize across papers.")
605
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
606
  model2 = gr.Dropdown(
607
  label="Model",
@@ -618,6 +713,32 @@ with gr.Blocks(title="Toxicology PDF → Table Extractor (GPT-4o)") as demo:
618
  outputs=[synth_md]
619
  )
620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  if __name__ == "__main__":
622
  port = int(os.environ.get("PORT", "7860"))
623
  demo.queue().launch(server_name="0.0.0.0", server_port=port)
 
1
  import os
2
  import re
3
  import json
 
4
  import tempfile
5
  from pathlib import Path
6
  from typing import Dict, List, Tuple, Any
 
12
  from pypdf import PdfReader
13
  from sklearn.feature_extraction.text import TfidfVectorizer
14
 
15
+ from openai import OpenAI
16
 
17
 
18
+ # =============================
19
  # Defaults
20
+ # =============================
21
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
22
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
23
+
24
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
25
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
26
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
27
+
28
+ "oecd_endpoints": [
29
+ "acute_toxicity","subacute_toxicity","subchronic_toxicity","chronic_toxicity",
30
+ "carcinogenicity","genotoxicity","reproductive_toxicity","developmental_toxicity",
31
+ "neurotoxicity","immunotoxicity","endocrine_activity","sensitization","irritation_corrosion"
32
+ ],
33
+
34
+ "meddra_like_terms": [
35
+ "hepatic_disorder","renal_disorder","nervous_system_disorder","respiratory_disorder",
36
+ "skin_and_subcutaneous_tissue_disorder","reproductive_system_disorder",
37
+ "immune_system_disorder","blood_and_lymphatic_system_disorder"
38
+ ],
39
+
40
+ "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"]
41
  }"""
42
 
43
+ DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
44
  # types: str, num, bool, list[str], list[num], enum[a,b,c]
45
  Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
46
  CAS_numbers | list[str] | Extract any CAS numbers mentioned.
47
  Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
48
  Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
49
  Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
50
+ OECD_endpoints | list[str] | Extract endpoints; prefer controlled vocab 'oecd_endpoints' when applicable.
51
+ MedDRA_like_terms | list[str] | Extract effects; prefer controlled vocab 'meddra_like_terms' when applicable.
52
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
53
+ Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
54
  Conclusion | str | What does the paper conclude about safety/risk?
55
  """
56
 
57
 
58
+ # =============================
59
+ # PDF extraction (text-based PDFs only)
60
+ # =============================
61
  def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
62
  reader = PdfReader(pdf_path)
63
  page_count = len(reader.pages)
 
69
  t = reader.pages[i].extract_text() or ""
70
  except Exception:
71
  t = ""
72
+ pages.append((i + 1, t or ""))
 
73
  return pages, page_count
74
 
75
 
 
81
 
82
 
83
  def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
 
 
 
84
  chunks = []
85
  buf = []
86
  start_page = None
 
93
  if start_page is None:
94
  start_page = pno
95
 
 
96
  if cur_len + len(txt) + 1 > target_chars and buf:
97
+ end_page = pno - 1
98
+ end_page = end_page if end_page >= start_page else start_page
99
+ chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
 
100
  buf = [txt]
101
  start_page = pno
102
  cur_len = len(txt)
 
105
  cur_len += len(txt) + 1
106
 
107
  if buf and start_page is not None:
108
+ end_page = pages[-1][0] if pages else start_page
109
  chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
110
 
111
  return chunks
112
 
113
 
114
+ # =============================
115
  # Lightweight retrieval (TF-IDF) to select relevant excerpts
116
+ # =============================
117
+ def select_relevant_chunks(
118
+ chunks: List[Dict[str, Any]],
119
+ queries: List[str],
120
+ top_per_query: int = 2,
121
+ max_chunks: int = 12
122
+ ) -> List[Dict[str, Any]]:
123
  texts = [c["text"] for c in chunks]
124
  if not texts:
125
  return []
 
127
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
128
  X = vectorizer.fit_transform(texts)
129
 
130
+ selected_idx: List[int] = []
131
  for q in queries:
132
  q = (q or "").strip()
133
  if not q:
134
  continue
135
  qv = vectorizer.transform([q])
136
+ sims = (X @ qv.T).toarray().ravel()
137
  idx = np.argsort(sims)[::-1]
138
  for i in idx[:top_per_query]:
139
  if i not in selected_idx:
140
  selected_idx.append(i)
141
 
 
142
  if not selected_idx:
143
  selected_idx = list(range(min(len(chunks), max_chunks)))
144
 
145
+ return [chunks[i] for i in selected_idx[:max_chunks]]
 
146
 
147
 
148
  def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
 
157
  return "\n".join(parts).strip()
158
 
159
 
160
+ # =============================
161
  # User-defined extraction spec -> JSON Schema
162
+ # =============================
163
  def slugify_field(name: str) -> str:
164
  name = name.strip()
165
  name = re.sub(r"[^\w\s-]", "", name)
 
167
  return name[:60] if name else "field"
168
 
169
 
170
+ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
171
  """
172
  spec lines: Field Name | type | instructions
173
+ Returns: properties dict, instructions map (field_key -> instruction)
174
  """
175
+ props: Dict[str, Any] = {}
176
+ instr: Dict[str, str] = {}
 
177
 
178
  for raw_line in (spec or "").splitlines():
179
  line = raw_line.strip()
 
188
  ftype = parts[1]
189
  finstr = parts[2] if len(parts) >= 3 else ""
190
 
 
 
 
 
 
191
  key = slugify_field(field_name)
192
  instr[key] = finstr
193
 
194
+ schema: Dict[str, Any] = {"type": "string"}
195
 
196
  if ftype == "str":
197
  schema = {"type": "string"}
 
211
  schema = {"type": "string"}
212
 
213
  props[key] = schema
 
 
214
 
215
+ return props, instr
 
216
 
217
 
218
+ def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
219
+ """
220
+ IMPORTANT: Structured Outputs (strict=True) requires that for every object:
221
+ required must exist and include every key in properties.
222
+ """
223
  risk_enum = vocab.get(
224
  "risk_stance_enum",
225
  ["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
226
  )
227
 
 
228
  all_field_keys = list(field_props.keys())
229
 
230
  schema = {
 
239
  "type": "object",
240
  "additionalProperties": False,
241
  "properties": field_props,
242
+ "required": all_field_keys # strict requirement
243
  },
244
  "evidence": {
245
  "type": "array",
 
260
  return schema
261
 
262
 
263
+ # =============================
264
+ # OpenAI client + extraction
265
+ # =============================
266
def get_openai_client(api_key: str) -> OpenAI:
    """Build an OpenAI client from the UI-supplied key or the environment.

    Raises:
        ValueError: when neither the argument nor OPENAI_API_KEY provides a key.
    """
    candidate = (api_key or "").strip()
    if not candidate:
        candidate = os.getenv("OPENAI_API_KEY", "").strip()
    if not candidate:
        raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY secret in Hugging Face.")
    return OpenAI(api_key=candidate)
271
 
272
 
 
278
  field_instructions: Dict[str, str],
279
  context: str
280
  ) -> Dict[str, Any]:
 
 
281
  field_instr_lines = []
282
  for k, v in field_instructions.items():
283
+ field_instr_lines.append(f"- {k}: {v if v else '(no extra instructions)'}")
 
 
 
284
 
285
  vocab_text = json.dumps(controlled_vocab, indent=2)
286
 
287
  system_msg = (
288
  "You are a toxicology research paper data-extraction assistant.\n"
289
+ "Grounding rules (must follow):\n"
290
+ "1) Use ONLY the provided excerpts; do NOT invent details.\n"
291
+ "2) If a value is not explicitly stated, output empty string or empty list (or an allowed enum like 'not_reported').\n"
292
+ "3) Provide evidence quotes + page ranges for extracted fields.\n"
293
+ "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
294
+ "5) Prefer controlled vocab terms when applicable.\n"
295
  )
296
 
297
  user_msg = (
 
300
  "FIELD INSTRUCTIONS:\n"
301
  + "\n".join(field_instr_lines)
302
  + "\n\n"
303
+ "EXCERPTS (with page ranges):\n"
304
  f"{context}\n"
305
  )
306
 
 
319
  }
320
  }
321
  )
322
+ return json.loads(resp.output_text)
 
 
 
323
 
324
 
325
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
    """Ask the model for a cross-paper synthesis of the extracted rows.

    The extracted rows are sent as pretty-printed JSON; the model's plain
    markdown answer is returned unchanged.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are a senior toxicology scientist summarizing multiple papers.\n"
                "Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
                "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
            ),
        },
        {
            "role": "user",
            "content": "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2),
        },
    ]
    response = client.responses.create(model=model, input=messages)
    return response.output_text
341
 
342
 
343
+ # =============================
344
+ # Grounding helpers (UI)
345
+ # =============================
346
+ def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
347
+ if not records or not file_name:
348
+ return pd.DataFrame(columns=["Field", "Value"])
349
+ row = next((r for r in records if r.get("file") == file_name), None)
350
+ if not row:
351
+ return pd.DataFrame(columns=["Field", "Value"])
352
+ return pd.DataFrame({"Field": list(row.keys()), "Value": [row[k] for k in row.keys()]})
353
+
354
+
355
+ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: int = 80) -> str:
356
+ if not details or not file_name:
357
+ return ""
358
+ d = next((x for x in details if x.get("_file") == file_name), None)
359
+ if not d:
360
+ return ""
361
+ ev = d.get("evidence", []) or []
362
+ lines = []
363
+ for e in ev[:max_items]:
364
+ quote = (e.get("quote", "") or "").strip()
365
+ pages = (e.get("pages", "") or "").strip()
366
+ field = (e.get("field", "") or "").strip()
367
+ if quote:
368
+ if len(quote) > 280:
369
+ quote = quote[:280] + "…"
370
+ lines.append(f"- **{field}** (pages {pages}): “{quote}”")
371
+ header = "### Evidence (grounding)\n"
372
+ if not lines:
373
+ lines = ["- (no evidence returned)"]
374
+ return header + "\n".join(lines) + "\n\n> Review note: evidence reflects the original extraction. If you change values, re-run extraction to refresh evidence."
375
+
376
+
377
def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
    """Return True when the PDF yields almost no extractable text.

    Used to flag scanned/image-only PDFs, which this app does not support.
    """
    chunks = []
    for _, raw in pages:
        cleaned = clean_text(raw)
        if cleaned:
            chunks.append(cleaned)
    # Under ~200 chars across all pages is treated as "no real text" (heuristic).
    return len(" ".join(chunks).strip()) < 200
381
+
382
+
383
+ # =============================
384
+ # Main extraction handler
385
+ # =============================
386
+ def run_extraction(
387
+ files,
388
+ api_key,
389
+ model,
390
+ field_spec,
391
+ vocab_json,
392
+ max_pages,
393
+ chunk_chars,
394
+ max_context_chars
395
+ ):
396
  if not files:
397
+ return None, None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
398
 
399
+ # vocab
400
  try:
401
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
402
  except Exception as e:
403
+ return None, None, None, f"Controlled vocab JSON is invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
404
 
405
+ # field spec
406
+ field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
407
  if not field_props:
408
+ return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
409
 
410
+ schema = build_extraction_schema(field_props, vocab)
411
 
412
+ # OpenAI
413
  try:
414
  client = get_openai_client(api_key)
415
  except Exception as e:
416
+ return None, None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
417
 
418
+ results: List[Dict[str, Any]] = []
419
+ flat_rows: List[Dict[str, Any]] = []
420
 
421
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
422
 
 
425
  filename = os.path.basename(pdf_path)
426
 
427
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
428
+
429
+ # enforce text-based PDFs note
430
+ if _text_based_pdf_warning(pages):
431
+ # create an "empty" record with warning
432
+ row = {"file": filename, "paper_title": "", "risk_stance": "insufficient_data", "risk_confidence": 0.0, "risk_summary": "No extractable text found. This app supports text-based PDFs only."}
433
+ for k, sch in field_props.items():
434
+ row[k] = "" if sch.get("type") != "array" else ""
435
+ flat_rows.append(row)
436
+
437
+ results.append({
438
+ "_file": filename,
439
+ "_pages_in_pdf": page_count,
440
+ "paper_title": "",
441
+ "risk_stance": "insufficient_data",
442
+ "risk_confidence": 0.0,
443
+ "risk_summary": "No extractable text found. This app supports text-based PDFs only.",
444
+ "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
445
+ "evidence": []
446
+ })
447
+ continue
448
+
449
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
450
 
451
+ # Queries: risk stance + each field instruction (or field key)
452
+ queries = ["regulatory acceptability risk hazard concern conclusion noael loael bmd bmdl adverse effect uncertainty"]
 
 
453
  for k, ins in field_instr.items():
454
+ queries.append(ins if ins else k)
 
 
 
455
 
456
  selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
457
  context = build_context(selected, max_chars=int(max_context_chars))
458
 
459
+ extracted = openai_structured_extract(
460
+ client=client,
461
+ model=model,
462
+ schema=schema,
463
+ controlled_vocab=vocab,
464
+ field_instructions=field_instr,
465
+ context=context
466
+ )
 
 
 
 
 
 
 
 
 
 
 
467
 
468
  extracted["_file"] = filename
469
  extracted["_pages_in_pdf"] = page_count
470
  results.append(extracted)
471
 
472
+ # flatten to table (wide)
473
  row = {
474
  "file": filename,
475
  "paper_title": extracted.get("paper_title", ""),
 
477
  "risk_confidence": extracted.get("risk_confidence", ""),
478
  "risk_summary": extracted.get("risk_summary", "")
479
  }
480
+
481
+ ext = extracted.get("extracted") or {}
482
  for k in field_props.keys():
483
+ v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
484
  if isinstance(v, list):
485
  row[k] = "; ".join([str(x) for x in v])
486
  else:
487
  row[k] = v
488
+
489
  flat_rows.append(row)
490
 
491
  df = pd.DataFrame(flat_rows)
 
495
  df.to_csv(csv_path, index=False)
496
  json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
497
 
498
+ records = df.to_dict("records")
499
+ choices = [r["file"] for r in records if "file" in r]
500
+ default = choices[0] if choices else None
501
+ vertical = _make_vertical(records, default)
502
+ evidence = _render_evidence(results, default)
503
+
504
+ status = "Done. Use the vertical view to read cleanly. Enable Review Mode to edit and export a reviewed CSV."
505
+
506
+ return (
507
+ df,
508
+ str(csv_path),
509
+ str(json_path),
510
+ status,
511
+ gr.update(choices=choices, value=default),
512
+ records,
513
+ results,
514
+ vertical,
515
+ evidence
516
+ )
517
 
518
 
519
+ # =============================
520
+ # Review mode handlers
521
+ # =============================
522
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the vertical view and evidence panel for the selected record."""
    vertical = _make_vertical(records, file_name)
    evidence = _render_evidence(details, file_name)
    return vertical, evidence
524
 
 
 
 
 
525
 
526
def toggle_review_mode(is_on: bool):
    """Switch the vertical table between read-only and editable (Review Mode)."""
    editable = bool(is_on)
    return gr.update(interactive=editable)
529
 
530
 
531
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
    """Merge edits from the vertical (Field, Value) table back into session records.

    Args:
        file_name: "file" value identifying which record to update.
        vertical_df: Edited table from gr.Dataframe — a pandas DataFrame or a
            list-of-lists with two columns (Field, Value).
        records: Current wide-table rows (list of dicts), one per file.

    Returns:
        (wide_df, new_records, message): the rebuilt wide DataFrame (None on
        failure), the updated records list, and a status string.
    """
    if not file_name or not records:
        return None, records, "Nothing to save."

    # Normalize whatever gradio handed us into a (Field, Value) frame.
    # The column check and dropna stay inside the try so a frame with
    # unexpected headers yields the error message instead of a traceback.
    try:
        if isinstance(vertical_df, pd.DataFrame):
            dfv = vertical_df
        else:
            # gradio may pass list-of-lists
            dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
        if "Field" not in dfv.columns or "Value" not in dfv.columns:
            return None, records, "Could not parse edited vertical table."
        dfv = dfv.dropna(subset=["Field"])
    except Exception:
        return None, records, "Could not parse edited vertical table."

    # Strip field names so edited rows match record keys exactly (previously
    # the filter stripped but the stored key did not, so padded names never
    # matched any record key).
    updates = {
        str(r["Field"]).strip(): r["Value"]
        for _, r in dfv.iterrows()
        if str(r["Field"]).strip()
    }

    # Rebuild the records list, applying the edits to the matching record only.
    new_records = []
    updated = False
    for r in records:
        if r.get("file") == file_name:
            rr = dict(r)
            rr.update(updates)
            new_records.append(rr)
            updated = True
        else:
            new_records.append(r)

    df_wide = pd.DataFrame(new_records) if new_records else pd.DataFrame()
    msg = "Saved changes into session table. Export reviewed CSV to download." if updated else "Record not found."
    return df_wide, new_records, msg
 
568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
 
570
def export_reviewed_csv(records: List[Dict[str, Any]]):
    """Write the session records to a fresh temp CSV; return (path, message)."""
    if not records:
        return None, "No reviewed data to export."
    out_dir = Path(tempfile.mkdtemp(prefix="tox_review_"))
    out_path = out_dir / "reviewed_extraction_table.csv"
    frame = pd.DataFrame(records)
    frame.to_csv(out_path, index=False)
    return str(out_path), "Reviewed CSV ready to download."
577
 
 
578
 
579
+ # =============================
580
+ # Synthesis tab handler
581
+ # =============================
582
def run_synthesis(api_key, model, extraction_json_file):
    """Cross-paper synthesis from an uploaded extraction_details.json.

    Args:
        api_key: OpenAI key (may be empty if the env secret is set).
        model: Model name for the synthesis call.
        extraction_json_file: Gradio file object for extraction_details.json.

    Returns:
        Markdown string — either the synthesis or a human-readable error.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json produced by the Extract tab first."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)

    # Guard the upload read/parse: a malformed or unreadable JSON file should
    # surface as a message in the UI, not crash the handler.
    try:
        rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
    except Exception as e:
        return f"Could not read extraction JSON: {e}"

    return openai_synthesize_across_papers(client, model, rows)
593
 
594
 
595
+ # =============================
596
  # Gradio UI
597
+ # =============================
598
+ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
599
+ gr.Markdown(
600
+ "# Toxicology PDF → Grounded Table Extractor (GPT-4o)\n\n"
601
+ "**Important:** This app supports **text-based PDFs only** (not scanned/image PDFs). If a PDF has no extractable text, it will be flagged as insufficient_data.\n\n"
602
+ "You control *what* to extract using the **Extraction spec**. Outputs are grounded by evidence quotes + page ranges."
603
+ )
604
+
605
+ # State stores for review mode
606
+ state_records = gr.State([]) # wide table rows: list[dict]
607
+ state_details = gr.State([]) # extraction details JSON: list[dict]
608
 
609
  with gr.Tab("Extract to Table"):
610
  files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
611
 
612
+ with gr.Row():
613
+ api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
614
+ model = gr.Dropdown(
615
+ label="Model",
616
+ choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
617
+ value="gpt-4o-2024-08-06"
618
+ )
619
 
620
  with gr.Row():
621
+ max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
622
+ chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
623
+ max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
624
 
625
+ vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=10)
626
+ field_spec = gr.Textbox(label="Extraction spec (you control the columns)", value=DEFAULT_FIELD_SPEC, lines=10)
627
 
628
+ extract_btn = gr.Button("Run Extraction (Grounded)")
 
 
629
  status = gr.Textbox(label="Status", interactive=False)
630
 
631
+ table = gr.Dataframe(
632
+ label="Wide Table (download-friendly)",
633
+ interactive=False,
634
+ wrap=True,
635
+ show_row_numbers=True,
636
+ buttons=["fullscreen", "copy"]
637
+ )
638
+ with gr.Row():
639
+ out_csv = gr.File(label="Download: extraction_table.csv")
640
+ out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
641
 
642
+ gr.Markdown("## Readable view (vertical) + evidence")
643
+ record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
644
+
645
+ with gr.Row():
646
+ review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
647
+ save_btn = gr.Button("Save changes to session table")
648
+ export_btn = gr.Button("Export reviewed CSV")
649
+
650
+ review_status = gr.Textbox(label="Review status", interactive=False)
651
+
652
+ vertical_view = gr.Dataframe(
653
+ headers=["Field", "Value"],
654
+ interactive=False,
655
+ wrap=True,
656
+ show_row_numbers=False,
657
+ label="Vertical record view (Field → Value)"
658
  )
659
+ evidence_md = gr.Markdown()
660
 
661
+ reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
662
+
663
+ # Run extraction
664
  extract_btn.click(
665
  fn=run_extraction,
666
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
667
+ outputs=[table, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
668
+ )
669
+
670
+ # On select record
671
+ record_pick.change(
672
+ fn=on_pick,
673
+ inputs=[record_pick, state_records, state_details],
674
+ outputs=[vertical_view, evidence_md]
675
+ )
676
+
677
+ # Toggle review mode editing
678
+ review_mode.change(
679
+ fn=toggle_review_mode,
680
+ inputs=[review_mode],
681
+ outputs=[vertical_view]
682
+ )
683
+
684
+ # Save edits back to wide table + state
685
+ save_btn.click(
686
+ fn=save_review_changes,
687
+ inputs=[record_pick, vertical_view, state_records],
688
+ outputs=[table, state_records, review_status]
689
+ )
690
+
691
+ # Export reviewed CSV
692
+ export_btn.click(
693
+ fn=export_reviewed_csv,
694
+ inputs=[state_records],
695
+ outputs=[reviewed_csv, review_status]
696
  )
697
 
698
  with gr.Tab("Cross-paper Synthesis"):
699
+ gr.Markdown("Upload the `extraction_details.json` from the Extract tab. Synthesis is based strictly on those grounded extractions.")
700
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
701
  model2 = gr.Dropdown(
702
  label="Model",
 
713
  outputs=[synth_md]
714
  )
715
 
716
+ with gr.Tab("Pending tasks"):
717
+ gr.Markdown(
718
+ "## Product roadmap (pending tasks)\n\n"
719
+ "### 1) Granular data model (one row per chemical–endpoint pair)\n"
720
+ "- Change schema to return `records: [ {chemical, endpoint, ...} ]`\n"
721
+ "- Flatten into wide table; vertical viewer targets a single record\n\n"
722
+ "### 2) Stronger grounding & verification\n"
723
+ "- Require evidence per field (already)\n"
724
+ "- Add automatic evidence verification (quote must exist in excerpt)\n"
725
+ "- Add `UNVERIFIED` flags + force empty values when evidence fails\n\n"
726
+ "### 3) Controlled vocab expansion & mapping\n"
727
+ "- Add synonym lists and preferred terms\n"
728
+ "- Map extracted terms into: FDA taxonomy / OECD endpoints / MedDRA-like groupings\n"
729
+ "- Add a vocab editor + import/export vocab JSON\n\n"
730
+ "### 4) Column transforms (structured parsing)\n"
731
+ "- Parse dose metrics into `{metric, value, unit, route, duration}`\n"
732
+ "- Normalize units (e.g., mg/kg/day)\n"
733
+ "- Auto-split multi-chemical text into canonical list\n\n"
734
+ "### 5) Multi-document compare mode\n"
735
+ "- Compare by chemical or endpoint\n"
736
+ "- Create a consensus + disagreements table\n\n"
737
+ "### 6) PDF limitations\n"
738
+ "- Current: **text-based PDFs only**\n"
739
+ "- Optional future: OCR for scanned PDFs (adds heavy dependencies)\n"
740
+ )
741
+
742
if __name__ == "__main__":
    # Hosting platforms inject PORT; fall back to Gradio's default 7860.
    serve_port = int(os.environ.get("PORT", "7860"))
    demo.queue().launch(server_name="0.0.0.0", server_port=serve_port)