hchevva committed on
Commit
640a01a
·
verified ·
1 Parent(s): 594d25a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +545 -206
app.py CHANGED
@@ -25,28 +25,12 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
25
 
26
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
27
  "in_silico_method_enum": [
28
- "qsar",
29
- "read_across",
30
- "molecular_docking",
31
- "molecular_dynamics",
32
- "pbpk_pbtK",
33
- "aop_based",
34
- "ml_model",
35
- "other",
36
- "not_reported"
37
  ],
38
  "nams_method_enum": [
39
- "high_throughput_screening_hts",
40
- "omics_transcriptomics",
41
- "omics_proteomics",
42
- "omics_metabolomics",
43
- "organ_on_chip",
44
- "microphysiological_system_mps",
45
- "3d_tissue_model",
46
- "in_chemico_assay",
47
- "in_silico_as_nams",
48
- "other",
49
- "not_reported"
50
  ],
51
 
52
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
@@ -70,22 +54,9 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
70
  "not_reported"
71
  ],
72
 
73
- "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"],
74
-
75
- "genotoxicity_result_keywords": {
76
- "positive": [
77
- "genotoxic","mutagenic","clastogenic","statistically_significant_increase",
78
- "significant_increase_in_mutations","induced_dna_damage","dose_dependent_increase"
79
- ],
80
- "negative": [
81
- "non_genotoxic","not_genotoxic","not_mutagenic","no_evidence_of_genotoxicity",
82
- "no_statistically_significant_increase","negative_result"
83
- ],
84
- "equivocal": ["equivocal","inconclusive"]
85
- }
86
  }"""
87
 
88
-
89
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
90
  # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
91
 
@@ -117,7 +88,7 @@ Genotox_OECD_TG_in_vivo | list[enum[
117
  not_reported
118
  ]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
119
 
120
- Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results language (see genotoxicity_result_keywords in vocab).
121
  Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
122
 
123
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
@@ -126,6 +97,39 @@ Conclusion | str | What does the paper conclude about safety/risk?
126
  """
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  # =============================
131
  # PDF extraction (text-based PDFs only)
@@ -183,8 +187,13 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
183
  return chunks
184
 
185
 
 
 
 
 
 
186
  # =============================
187
- # Lightweight retrieval (TF-IDF) to select relevant excerpts
188
  # =============================
189
  def select_relevant_chunks(
190
  chunks: List[Dict[str, Any]],
@@ -230,7 +239,7 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
230
 
231
 
232
  # =============================
233
- # User-defined extraction spec -> JSON Schema
234
  # =============================
235
  def slugify_field(name: str) -> str:
236
  name = name.strip()
@@ -242,7 +251,7 @@ def slugify_field(name: str) -> str:
242
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
243
  """
244
  spec lines: Field Name | type | instructions
245
- Returns: properties dict, instructions map (field_key -> instruction)
246
  """
247
  props: Dict[str, Any] = {}
248
  instr: Dict[str, str] = {}
@@ -292,18 +301,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
292
 
293
 
294
  def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
295
- """
296
- IMPORTANT: Structured Outputs (strict=True) requires that for every object:
297
- required must exist and include every key in properties.
298
- """
299
- risk_enum = vocab.get(
300
- "risk_stance_enum",
301
- ["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
302
- )
303
-
304
  all_field_keys = list(field_props.keys())
305
 
306
- schema = {
307
  "type": "object",
308
  "additionalProperties": False,
309
  "properties": {
@@ -331,9 +332,8 @@ def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any])
331
  }
332
  }
333
  },
334
- "required": ["paper_title", "risk_stance", "risk_confidence", "risk_summary", "extracted", "evidence"]
335
  }
336
- return schema
337
 
338
 
339
  # =============================
@@ -354,10 +354,7 @@ def openai_structured_extract(
354
  field_instructions: Dict[str, str],
355
  context: str
356
  ) -> Dict[str, Any]:
357
- field_instr_lines = []
358
- for k, v in field_instructions.items():
359
- field_instr_lines.append(f"- {k}: {v if v else '(no extra instructions)'}")
360
-
361
  vocab_text = json.dumps(controlled_vocab, indent=2)
362
 
363
  system_msg = (
@@ -368,10 +365,8 @@ def openai_structured_extract(
368
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
369
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
370
  "5) Prefer controlled vocab terms when applicable.\n"
371
- "6) For Genotoxicity_result, use genotoxicity_result_keywords (positive/negative/equivocal) as guidance, but ONLY if the paper explicitly reports results.\n"
372
- "7) For OECD TG fields, only populate if the TG is explicitly stated or clearly described; otherwise use not_reported.\n"
373
- "8) For NAMs/in_silico fields, only populate if methods are explicitly described; otherwise not_reported.\n"
374
-
375
  )
376
 
377
  user_msg = (
@@ -409,19 +404,12 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
409
  "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
410
  )
411
  user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
412
-
413
- resp = client.responses.create(
414
- model=model,
415
- input=[
416
- {"role": "system", "content": system_msg},
417
- {"role": "user", "content": user_msg}
418
- ],
419
- )
420
  return resp.output_text
421
 
422
 
423
  # =============================
424
- # Grounding helpers (UI)
425
  # =============================
426
  def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
427
  if not records or not file_name:
@@ -451,17 +439,282 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
451
  header = "### Evidence (grounding)\n"
452
  if not lines:
453
  lines = ["- (no evidence returned)"]
454
- return header + "\n".join(lines) + "\n\n> Review note: evidence reflects the original extraction. If you change values, re-run extraction to refresh evidence."
455
 
456
 
457
- def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
458
- # If almost no text exists across pages, treat as non-text PDF.
459
- joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
460
- return len(joined.strip()) < 200 # heuristic threshold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
 
463
  # =============================
464
- # Main extraction handler
465
  # =============================
466
  def run_extraction(
467
  files,
@@ -474,26 +727,23 @@ def run_extraction(
474
  max_context_chars
475
  ):
476
  if not files:
477
- return None, None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
478
 
479
- # vocab
480
  try:
481
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
482
  except Exception as e:
483
- return None, None, None, f"Controlled vocab JSON is invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
484
 
485
- # field spec
486
  field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
487
  if not field_props:
488
- return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
489
 
490
  schema = build_extraction_schema(field_props, vocab)
491
 
492
- # OpenAI
493
  try:
494
  client = get_openai_client(api_key)
495
  except Exception as e:
496
- return None, None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
497
 
498
  results: List[Dict[str, Any]] = []
499
  flat_rows: List[Dict[str, Any]] = []
@@ -506,14 +756,7 @@ def run_extraction(
506
 
507
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
508
 
509
- # enforce text-based PDFs note
510
  if _text_based_pdf_warning(pages):
511
- # create an "empty" record with warning
512
- row = {"file": filename, "paper_title": "", "risk_stance": "insufficient_data", "risk_confidence": 0.0, "risk_summary": "No extractable text found. This app supports text-based PDFs only."}
513
- for k, sch in field_props.items():
514
- row[k] = "" if sch.get("type") != "array" else ""
515
- flat_rows.append(row)
516
-
517
  results.append({
518
  "_file": filename,
519
  "_pages_in_pdf": page_count,
@@ -524,67 +767,64 @@ def run_extraction(
524
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
525
  "evidence": []
526
  })
527
- continue
528
-
529
- chunks = chunk_pages(pages, target_chars=int(chunk_chars))
530
-
531
- # Queries: risk stance + each field instruction (or field key)
532
- queries = ["regulatory acceptability risk hazard concern conclusion noael loael bmd bmdl adverse effect uncertainty"]
533
- for k, ins in field_instr.items():
534
- queries.append(ins if ins else k)
535
-
536
- selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
537
- context = build_context(selected, max_chars=int(max_context_chars))
538
-
539
- extracted = openai_structured_extract(
540
- client=client,
541
- model=model,
542
- schema=schema,
543
- controlled_vocab=vocab,
544
- field_instructions=field_instr,
545
- context=context
546
- )
547
-
548
- extracted["_file"] = filename
549
- extracted["_pages_in_pdf"] = page_count
550
- results.append(extracted)
551
 
552
- # flatten to table (wide)
 
553
  row = {
554
  "file": filename,
555
- "paper_title": extracted.get("paper_title", ""),
556
- "risk_stance": extracted.get("risk_stance", ""),
557
- "risk_confidence": extracted.get("risk_confidence", ""),
558
- "risk_summary": extracted.get("risk_summary", "")
559
  }
560
-
561
- ext = extracted.get("extracted") or {}
562
  for k in field_props.keys():
563
  v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
564
  if isinstance(v, list):
565
  row[k] = "; ".join([str(x) for x in v])
566
  else:
567
  row[k] = v
568
-
569
  flat_rows.append(row)
570
 
571
  df = pd.DataFrame(flat_rows)
 
572
 
573
  csv_path = tmpdir / "extraction_table.csv"
574
  json_path = tmpdir / "extraction_details.json"
575
  df.to_csv(csv_path, index=False)
576
  json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
577
 
578
- records = df.to_dict("records")
579
  choices = [r["file"] for r in records if "file" in r]
580
  default = choices[0] if choices else None
581
  vertical = _make_vertical(records, default)
582
  evidence = _render_evidence(results, default)
583
 
584
- status = "Done. Use the vertical view to read cleanly. Enable Review Mode to edit and export a reviewed CSV."
585
 
 
586
  return (
587
- df,
588
  str(csv_path),
589
  str(json_path),
590
  status,
@@ -602,34 +842,24 @@ def run_extraction(
602
  def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
603
  return _make_vertical(records, file_name), _render_evidence(details, file_name)
604
 
605
-
606
  def toggle_review_mode(is_on: bool):
607
- # make vertical table editable when review mode is on
608
  return gr.update(interactive=bool(is_on))
609
 
610
-
611
  def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
612
- """
613
- vertical_df comes from gr.Dataframe: typically list[list] or pandas df-like.
614
- Expect two columns: Field, Value
615
- """
616
  if not file_name or not records:
617
- return None, records, "Nothing to save."
618
 
619
- # Convert vertical_df into dict
620
  try:
621
  if isinstance(vertical_df, pd.DataFrame):
622
  dfv = vertical_df
623
  else:
624
- # gradio may pass list-of-lists
625
  dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
626
  except Exception:
627
- return None, records, "Could not parse edited vertical table."
628
 
629
  dfv = dfv.dropna(subset=["Field"])
630
  updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
631
 
632
- # Update matching record
633
  new_records = []
634
  updated = False
635
  for r in records:
@@ -642,10 +872,8 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
642
  else:
643
  new_records.append(r)
644
 
645
- df_wide = pd.DataFrame(new_records) if new_records else pd.DataFrame()
646
- msg = "Saved changes into session table. Export reviewed CSV to download." if updated else "Record not found."
647
- return df_wide, new_records, msg
648
-
649
 
650
  def export_reviewed_csv(records: List[Dict[str, Any]]):
651
  if not records:
@@ -657,11 +885,11 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
657
 
658
 
659
  # =============================
660
- # Synthesis tab handler
661
  # =============================
662
  def run_synthesis(api_key, model, extraction_json_file):
663
  if extraction_json_file is None:
664
- return "Upload the extraction_details.json produced by the Extract tab first."
665
 
666
  try:
667
  client = get_openai_client(api_key)
@@ -675,46 +903,92 @@ def run_synthesis(api_key, model, extraction_json_file):
675
  # =============================
676
  # Gradio UI
677
  # =============================
678
- with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
679
  gr.Markdown(
680
- "# Toxicology PDF → Grounded Table Extractor (GPT-4o)\n\n"
681
- "**Important:** This app supports **text-based PDFs only** (not scanned/image PDFs). If a PDF has no extractable text, it will be flagged as insufficient_data.\n\n"
682
- "You control *what* to extract using the **Extraction spec**. Outputs are grounded by evidence quotes + page ranges."
 
683
  )
684
 
685
- # State stores for review mode
686
- state_records = gr.State([]) # wide table rows: list[dict]
687
- state_details = gr.State([]) # extraction details JSON: list[dict]
 
 
688
 
689
- with gr.Tab("Extract to Table"):
690
- files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
691
 
692
  with gr.Row():
693
  api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
694
- model = gr.Dropdown(
695
- label="Model",
696
- choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
697
- value="gpt-4o-2024-08-06"
698
- )
699
 
700
  with gr.Row():
701
  max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
702
  chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
703
  max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
704
 
705
- vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=10)
706
- field_spec = gr.Textbox(label="Extraction spec (you control the columns)", value=DEFAULT_FIELD_SPEC, lines=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
 
708
  extract_btn = gr.Button("Run Extraction (Grounded)")
709
  status = gr.Textbox(label="Status", interactive=False)
710
 
711
- table = gr.Dataframe(
712
- label="Wide Table (download-friendly)",
 
713
  interactive=False,
714
  wrap=True,
715
  show_row_numbers=True,
716
  buttons=["fullscreen", "copy"]
717
  )
 
718
  with gr.Row():
719
  out_csv = gr.File(label="Download: extraction_table.csv")
720
  out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
@@ -724,7 +998,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
724
 
725
  with gr.Row():
726
  review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
727
- save_btn = gr.Button("Save changes to session table")
728
  export_btn = gr.Button("Export reviewed CSV")
729
 
730
  review_status = gr.Textbox(label="Review status", interactive=False)
@@ -737,38 +1011,121 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
737
  label="Vertical record view (Field → Value)"
738
  )
739
  evidence_md = gr.Markdown()
740
-
741
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
742
 
743
- # Run extraction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
  extract_btn.click(
745
  fn=run_extraction,
746
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
747
- outputs=[table, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
748
  )
749
 
750
- # On select record
751
  record_pick.change(
752
  fn=on_pick,
753
  inputs=[record_pick, state_records, state_details],
754
  outputs=[vertical_view, evidence_md]
755
  )
756
 
757
- # Toggle review mode editing
758
- review_mode.change(
759
- fn=toggle_review_mode,
760
- inputs=[review_mode],
761
- outputs=[vertical_view]
762
- )
763
 
764
- # Save edits back to wide table + state
765
  save_btn.click(
766
  fn=save_review_changes,
767
  inputs=[record_pick, vertical_view, state_records],
768
- outputs=[table, state_records, review_status]
769
  )
770
 
771
- # Export reviewed CSV
772
  export_btn.click(
773
  fn=export_reviewed_csv,
774
  inputs=[state_records],
@@ -776,47 +1133,29 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
776
  )
777
 
778
  with gr.Tab("Cross-paper Synthesis"):
779
- gr.Markdown("Upload the `extraction_details.json` from the Extract tab. Synthesis is based strictly on those grounded extractions.")
780
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
781
- model2 = gr.Dropdown(
782
- label="Model",
783
- choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
784
- value="gpt-4o-2024-08-06"
785
- )
786
  extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
787
  synth_btn = gr.Button("Synthesize Across Papers")
788
  synth_md = gr.Markdown()
789
-
790
- synth_btn.click(
791
- fn=run_synthesis,
792
- inputs=[api_key2, model2, extraction_json_file],
793
- outputs=[synth_md]
794
- )
795
 
796
  with gr.Tab("Pending tasks"):
797
  gr.Markdown(
798
- "## Product roadmap (pending tasks)\n\n"
799
- "### 1) Granular data model (one row per chemical–endpoint pair)\n"
800
- "- Change schema to return `records: [ {chemical, endpoint, ...} ]`\n"
801
- "- Flatten into wide table; vertical viewer targets a single record\n\n"
802
- "### 2) Stronger grounding & verification\n"
803
- "- Require evidence per field (already)\n"
804
- "- Add automatic evidence verification (quote must exist in excerpt)\n"
805
- "- Add `UNVERIFIED` flags + force empty values when evidence fails\n\n"
806
- "### 3) Controlled vocab expansion & mapping\n"
807
- "- Add synonym lists and preferred terms\n"
808
- "- Map extracted terms into: FDA taxonomy / OECD endpoints / MedDRA-like groupings\n"
809
- "- Add a vocab editor + import/export vocab JSON\n\n"
810
- "### 4) Column transforms (structured parsing)\n"
811
- "- Parse dose metrics into `{metric, value, unit, route, duration}`\n"
812
- "- Normalize units (e.g., mg/kg/day)\n"
813
- "- Auto-split multi-chemical text into canonical list\n\n"
814
- "### 5) Multi-document compare mode\n"
815
- "- Compare by chemical or endpoint\n"
816
- "- Create a consensus + disagreements table\n\n"
817
- "### 6) PDF limitations\n"
818
- "- Current: **text-based PDFs only**\n"
819
- "- Optional future: OCR for scanned PDFs (adds heavy dependencies)\n"
820
  )
821
 
822
  if __name__ == "__main__":
 
25
 
26
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
27
  "in_silico_method_enum": [
28
+ "qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
 
 
 
 
 
 
 
 
29
  ],
30
  "nams_method_enum": [
31
+ "high_throughput_screening_hts","omics_transcriptomics","omics_proteomics","omics_metabolomics",
32
+ "organ_on_chip","microphysiological_system_mps","3d_tissue_model","in_chemico_assay",
33
+ "in_silico_as_nams","other","not_reported"
 
 
 
 
 
 
 
 
34
  ],
35
 
36
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
 
54
  "not_reported"
55
  ],
56
 
57
+ "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
 
 
 
 
 
 
 
 
 
 
 
 
58
  }"""
59
 
 
60
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
61
  # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
62
 
 
88
  not_reported
89
  ]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
90
 
91
+ Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results. If unclear, not_reported.
92
  Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
93
 
94
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
 
97
  """
98
 
99
 
100
# =============================
# Field presets (UI)
# =============================
# Each preset is a list of row dicts for the guided field editor. Keys:
#   field        - field/column name shown to the user
#   type         - spec type (str, list[str], enum, list[enum], ...)
#   enum_values  - comma-separated allowed values ("" when not an enum)
#   instructions - per-field extraction guidance passed to the model
PRESET_CORE = [
    {"field": "Chemical(s)", "type": "list[str]", "enum_values": "", "instructions": "Primary chemical(s) studied; include common name + abbreviation if present."},
    {"field": "CAS_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract any CAS numbers mentioned."},
    {"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
    {"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
    {"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
    {"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
    {"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
    {"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
]

# Preset for papers using New Approach Methodologies / computational methods.
PRESET_NAMS_INSILICO = [
    {"field": "Approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use 'mixed' if multiple."},
    {"field": "In_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (can be multiple)."},
    {"field": "NAMs_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (can be multiple)."},
]

# Preset for genotoxicity studies keyed to OECD Test Guideline numbers.
PRESET_GENOTOX_OECD = [
    {"field": "Genotox_OECD_TG_in_vitro", "type": "list[enum]", "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported", "instructions": "If in vitro genotox tests are reported, select TGs. Otherwise not_reported."},
    {"field": "Genotox_OECD_TG_in_vivo", "type": "list[enum]", "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported", "instructions": "If in vivo genotox tests are reported, select TGs. Otherwise not_reported."},
    {"field": "Genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify based on reported results. If unclear, not_reported."},
    {"field": "Genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to the paper’s wording + test context."},
]

# Display label -> preset rows, used to populate the preset dropdown in the UI.
PRESET_MAP = {
    "Core (recommended)": PRESET_CORE,
    "NAMs + In Silico": PRESET_NAMS_INSILICO,
    "Genotox (OECD TGs)": PRESET_GENOTOX_OECD,
}
132
+
133
 
134
  # =============================
135
  # PDF extraction (text-based PDFs only)
 
187
  return chunks
188
 
189
 
190
def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
    """Return True when the pages yield almost no extractable text.

    Used to flag scanned/image-only PDFs: this app supports text-based PDFs
    only, so files failing this check are recorded as insufficient_data
    upstream instead of being sent to the model.

    Args:
        pages: (page_number, raw_page_text) tuples from the PDF extractor.

    Returns:
        True if the combined cleaned text is below the heuristic threshold.
    """
    # Clean each page exactly once (the previous version called clean_text
    # twice per page: once in the filter and again inside the join).
    cleaned = (clean_text(text) for _, text in pages)
    joined = " ".join(c for c in cleaned if c)
    return len(joined.strip()) < 200  # heuristic threshold
193
+
194
+
195
  # =============================
196
+ # Lightweight retrieval (TF-IDF)
197
  # =============================
198
  def select_relevant_chunks(
199
  chunks: List[Dict[str, Any]],
 
239
 
240
 
241
  # =============================
242
+ # Spec -> JSON schema
243
  # =============================
244
  def slugify_field(name: str) -> str:
245
  name = name.strip()
 
251
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
252
  """
253
  spec lines: Field Name | type | instructions
254
+ types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
255
  """
256
  props: Dict[str, Any] = {}
257
  instr: Dict[str, str] = {}
 
301
 
302
 
303
  def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
304
+ risk_enum = vocab.get("risk_stance_enum", ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"])
 
 
 
 
 
 
 
 
305
  all_field_keys = list(field_props.keys())
306
 
307
+ return {
308
  "type": "object",
309
  "additionalProperties": False,
310
  "properties": {
 
332
  }
333
  }
334
  },
335
+ "required": ["paper_title","risk_stance","risk_confidence","risk_summary","extracted","evidence"]
336
  }
 
337
 
338
 
339
  # =============================
 
354
  field_instructions: Dict[str, str],
355
  context: str
356
  ) -> Dict[str, Any]:
357
+ field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
 
 
 
358
  vocab_text = json.dumps(controlled_vocab, indent=2)
359
 
360
  system_msg = (
 
365
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
366
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
367
  "5) Prefer controlled vocab terms when applicable.\n"
368
+ "6) For OECD TG fields, only populate if explicitly stated or clearly described; otherwise use not_reported.\n"
369
+ "7) For NAMs/in_silico fields, only populate if explicitly described; otherwise not_reported.\n"
 
 
370
  )
371
 
372
  user_msg = (
 
404
  "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
405
  )
406
  user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
407
+ resp = client.responses.create(model=model, input=[{"role":"system","content":system_msg},{"role":"user","content":user_msg}])
 
 
 
 
 
 
 
408
  return resp.output_text
409
 
410
 
411
  # =============================
412
+ # UI helpers: vertical view + evidence
413
  # =============================
414
  def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
415
  if not records or not file_name:
 
439
  header = "### Evidence (grounding)\n"
440
  if not lines:
441
  lines = ["- (no evidence returned)"]
442
+ return header + "\n".join(lines)
443
 
444
 
445
+ def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
446
+ if not records:
447
+ return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
448
+ df = pd.DataFrame(records)
449
+ cols = ["file","paper_title","risk_stance","risk_confidence"]
450
+ # Include chemicals if present
451
+ for c in ["chemicals", "chemical_s", "chemical", "chemical_s_"]:
452
+ if c in df.columns and c not in cols:
453
+ cols.append(c)
454
+ break
455
+ cols = [c for c in cols if c in df.columns]
456
+ return df[cols].copy() if cols else df.head(50)
457
+
458
+ def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
459
+ if df is None or df.empty:
460
+ return pd.DataFrame(columns=["term"])
461
+ q = (query or "").strip().lower()
462
+ if not q:
463
+ return df[["term"]].copy()
464
+ mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
465
+ out = df.loc[mask, ["term"]].copy()
466
+ return out
467
+
468
+ # =============================
469
+ # Controlled vocab guided editor (lists only)
470
+ # =============================
471
+ vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
472
+
473
+ vocab_terms_filtered = gr.Dataframe(
474
+ headers=["term"],
475
+ label="Filtered preview (read-only)",
476
+ interactive=False,
477
+ wrap=True
478
+ )
479
+
480
def vocab_init_state(vocab_json: str):
    """Parse the vocab JSON and seed the guided-editor state.

    Falls back to DEFAULT_CONTROLLED_VOCAB_JSON when the supplied text is
    empty or not valid JSON. Returns:
        (vocab dict, sorted list-valued keys, default category,
         terms DataFrame for the default category, pretty-printed JSON,
         status message)
    """
    try:
        vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
    except Exception:
        # Invalid user JSON: silently recover with the built-in default.
        vocab = json.loads(DEFAULT_CONTROLLED_VOCAB_JSON)

    list_keys = sorted(key for key, value in vocab.items() if isinstance(value, list))
    default_key = list_keys[0] if list_keys else None
    default_terms = vocab.get(default_key, []) if default_key else []
    terms_df = pd.DataFrame({"term": default_terms})
    pretty_json = json.dumps(vocab, indent=2)
    return vocab, list_keys, default_key, terms_df, pretty_json, "✅ Vocab loaded."
491
+
492
+
493
def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
    """Load one vocab category into the editor tables.

    Returns (full terms DataFrame, search-filtered DataFrame, status message).
    Both tables are empty when no valid list-valued category is selected.
    """
    blank = pd.DataFrame(columns=["term"])
    if not category or category not in vocab_state:
        return blank, blank, "Select a category."

    terms = vocab_state.get(category, [])
    if not isinstance(terms, list):
        # Non-list entries (e.g. nested keyword maps) are not editable here.
        return blank, blank, "This category is not a list."

    table = pd.DataFrame({"term": terms})
    return table, _filter_terms_df(table, search), f"Editing: {category}"
504
+
505
+
506
def vocab_add_term(vocab_state: Dict[str, Any], category: str, term: str, search: str):
    """Append *term* to the selected list category (no-op if already present).

    Returns (full_df, filtered_df, cleared_textbox_value, status_message);
    on validation failure the table outputs are left untouched via gr.update().
    """
    cleaned = (term or "").strip()
    if not cleaned:
        return gr.update(), gr.update(), "", "Enter a term to add."
    terms = vocab_state.get(category) if category else None
    if not isinstance(terms, list):
        return gr.update(), gr.update(), "", "Pick a list category first."

    # `terms` aliases vocab_state[category]; append mutates the state in place.
    if cleaned not in terms:
        terms.append(cleaned)

    full = pd.DataFrame({"term": terms})
    return full, _filter_terms_df(full, search), "", f"Added: {cleaned}"
519
+
520
+
521
def vocab_remove_term(vocab_state: Dict[str, Any], category: str, term: str, search: str):
    """Drop every occurrence of *term* from the selected list category.

    Returns (full_df, filtered_df, cleared_textbox_value, status_message);
    validation failures leave the tables untouched via gr.update().
    """
    cleaned = (term or "").strip()
    if not cleaned:
        return gr.update(), gr.update(), "", "Enter a term to remove."
    if not category or category not in vocab_state or not isinstance(vocab_state.get(category), list):
        return gr.update(), gr.update(), "", "Pick a list category first."

    kept = [t for t in vocab_state[category] if t != cleaned]
    vocab_state[category] = kept
    full = pd.DataFrame({"term": kept})
    return full, _filter_terms_df(full, search), "", f"Removed: {cleaned}"
532
+
533
+
534
def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, search: str):
    """Replace the selected category's terms with the (hand-edited) table contents.

    Blank rows and duplicates are dropped, preserving first-seen order.
    Returns (vocab_json, filtered_terms_df, status_message).
    """
    if not category or category not in vocab_state or not isinstance(vocab_state.get(category), list):
        return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."

    try:
        df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
    except Exception:
        return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Could not parse terms table."

    # BUG FIX: the original used df.get("term", []).tolist(); DataFrame.get
    # returns a plain list default when the column is missing, and a list has
    # no .tolist() -> AttributeError. Guard on the column explicitly instead.
    raw_terms = df["term"].tolist() if "term" in df.columns else []

    terms: List[str] = []
    seen = set()  # O(1) membership instead of re-scanning `terms` per row
    for t in raw_terms:
        t = (str(t) if t is not None else "").strip()
        if t and t not in seen:
            seen.add(t)
            terms.append(t)

    vocab_state[category] = terms
    vjson = json.dumps(vocab_state, indent=2)
    filtered = _filter_terms_df(pd.DataFrame({"term": terms}), search)
    return vjson, filtered, f"✅ Applied {len(terms)} terms to {category}."
557
+
558
+
559
def vocab_reset_defaults():
    """Rebuild the entire vocab editor state from the shipped default JSON."""
    state = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
    return state
561
+
562
def vocab_filter_preview(terms_df, search):
    """Coerce the editable terms widget payload to a DataFrame and filter it by *search*."""
    if isinstance(terms_df, pd.DataFrame):
        df = terms_df
    else:
        try:
            df = pd.DataFrame(terms_df, columns=["term"])
        except Exception:
            # Unparseable widget payload -> behave like an empty table.
            df = pd.DataFrame(columns=["term"])
    return _filter_terms_df(df, search)
568
+
569
# Live-filter the read-only preview table as the user types in the search box.
# NOTE(review): `vocab_terms_df` appears to be assigned much further down
# (inside the gr.Blocks context); if this module-level wiring executes before
# that assignment it will raise NameError at import — verify the ordering.
vocab_search.change(
    fn=vocab_filter_preview,
    inputs=[vocab_terms_df, vocab_search],
    outputs=[vocab_terms_filtered]
)
574
+
575
+
576
# =============================
# Field builder (type dropdown + presets)
# =============================
# Type options offered in the builder UI. The pseudo-types "enum" and
# "list[enum]" are expanded into enum[a,b,c] / list[enum[a,b,c]] spec syntax
# when the spec text is generated.
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
580
+
581
def fields_init_state():
    """Seed the field builder with the Core + NAMs/in-silico + Genotox presets.

    Returns (rows_as_dicts, rows_df, generated_spec_text, status_message).
    """
    fields = [dict(row) for row in PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD]
    df = pd.DataFrame(fields, columns=["field", "type", "enum_values", "instructions"])
    return fields, df, build_spec_from_field_df(df), "✅ Field builder loaded."
589
+
590
def build_spec_from_field_df(df: pd.DataFrame) -> str:
    """Serialize the builder table into the pipe-delimited extraction spec text.

    Each row becomes ``Field | type | instructions``. For the builder's
    "enum" / "list[enum]" pseudo-types the comma-separated enum_values are
    expanded into ``enum[a,b,c]`` / ``list[enum[a,b,c]]``; with no values
    given they degrade to ``str`` / ``list[str]``. Rows missing a field name
    or a type are skipped. The result always ends with a single newline.

    (Cleanup vs. original: removed a dead ``if ftype not in TYPE_CHOICES:
    pass`` branch — unknown type strings simply pass through unchanged so
    the user can spot and fix them in the generated spec.)
    """
    lines = [
        "# One field per line: Field Name | type | instructions",
        "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
        ""
    ]
    for _, r in df.iterrows():
        field = str(r.get("field", "")).strip()
        ftype = str(r.get("type", "")).strip()
        enums = str(r.get("enum_values", "")).strip()
        instr = str(r.get("instructions", "")).strip()

        if not field or not ftype:
            continue

        if ftype == "enum":
            vals = [v.strip() for v in enums.split(",") if v.strip()]
            type_str = f"enum[{','.join(vals)}]" if vals else "str"
        elif ftype == "list[enum]":
            vals = [v.strip() for v in enums.split(",") if v.strip()]
            type_str = f"list[enum[{','.join(vals)}]]" if vals else "list[str]"
        else:
            type_str = ftype

        lines.append(f"{field} | {type_str} | {instr}")

    return "\n".join(lines).strip() + "\n"
622
+
623
def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
    """Insert a builder row, or overwrite the row whose name matches (case-insensitive).

    Mutates *field_rows* in place and returns (rows, rows_df,
    regenerated_spec_text, status_message).
    """
    field_name = (field_name or "").strip()
    ftype = (ftype or "").strip()
    enum_values = (enum_values or "").strip()
    instructions = (instructions or "").strip()

    columns = ["field", "type", "enum_values", "instructions"]

    if not field_name or not ftype:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."

    target = field_name.lower()
    existing = next(
        (r for r in field_rows if str(r.get("field", "")).strip().lower() == target),
        None,
    )
    if existing is not None:
        existing["type"] = ftype
        existing["enum_values"] = enum_values
        existing["instructions"] = instructions
        msg = "Updated field."
    else:
        field_rows.append({
            "field": field_name,
            "type": ftype,
            "enum_values": enum_values,
            "instructions": instructions,
        })
        msg = "Added field."

    df = pd.DataFrame(field_rows, columns=columns)
    return field_rows, df, build_spec_from_field_df(df), msg
649
+
650
def fields_remove(field_to_remove: str, field_rows: List[Dict[str, Any]]):
    """Delete the named field (case-insensitive) from the builder rows.

    Returns (rows, rows_df, regenerated_spec_text, status_message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    key = (field_to_remove or "").strip().lower()
    if not key:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Pick a field to remove."

    kept = [r for r in field_rows if str(r.get("field", "")).strip().lower() != key]
    df = pd.DataFrame(kept, columns=columns)
    return kept, df, build_spec_from_field_df(df), "Removed."
659
+
660
def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
    """Adopt the hand-edited builder table as the new field list.

    Rows missing a name or type are dropped; duplicate names
    (case-insensitive) keep only their first occurrence. Returns
    (rows, rows_df, regenerated_spec_text, status_message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    try:
        df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=columns)
    except Exception:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."

    cleaned: List[Dict[str, Any]] = []
    seen = set()
    for _, row in df.iterrows():
        name = str(row.get("field", "")).strip()
        ftype = str(row.get("type", "")).strip()
        if not name or not ftype:
            continue
        key = name.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({
            "field": name,
            "type": ftype,
            "enum_values": str(row.get("enum_values", "")).strip(),
            "instructions": str(row.get("instructions", "")).strip(),
        })

    df2 = pd.DataFrame(cleaned, columns=columns)
    return cleaned, df2, build_spec_from_field_df(df2), f"✅ Applied builder table ({len(cleaned)} fields)."
689
+
690
def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
    """Load a named preset, either replacing the rows or merging into them.

    In "Append" mode, preset entries overwrite same-named rows
    (case-insensitive, first occurrence wins) and new names are appended.
    Returns (rows, rows_df, regenerated_spec_text, status_message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    preset = PRESET_MAP.get(preset_name)
    if not preset:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Unknown preset."

    if mode == "Replace":
        new_rows = [dict(r) for r in preset]
    else:
        new_rows = [dict(r) for r in field_rows]
        # Index the first occurrence of each name, matching the original
        # first-match-wins update semantics.
        index: Dict[str, Dict[str, Any]] = {}
        for r in new_rows:
            index.setdefault(str(r.get("field", "")).strip().lower(), r)
        for p in preset:
            key = str(p.get("field", "")).strip().lower()
            if key in index:
                index[key].update(p)
            else:
                row = dict(p)
                new_rows.append(row)
                index[key] = row

    df = pd.DataFrame(new_rows, columns=columns)
    return new_rows, df, build_spec_from_field_df(df), f"✅ Loaded preset: {preset_name} ({mode})."
714
 
715
 
716
  # =============================
717
+ # Extraction handler
718
  # =============================
719
  def run_extraction(
720
  files,
 
727
  max_context_chars
728
  ):
729
  if not files:
730
+ return pd.DataFrame(), None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
731
 
 
732
  try:
733
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
734
  except Exception as e:
735
+ return pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
736
 
 
737
  field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
738
  if not field_props:
739
+ return pd.DataFrame(), None, None, "Extraction spec produced no fields.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
740
 
741
  schema = build_extraction_schema(field_props, vocab)
742
 
 
743
  try:
744
  client = get_openai_client(api_key)
745
  except Exception as e:
746
+ return pd.DataFrame(), None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
747
 
748
  results: List[Dict[str, Any]] = []
749
  flat_rows: List[Dict[str, Any]] = []
 
756
 
757
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
758
 
 
759
  if _text_based_pdf_warning(pages):
 
 
 
 
 
 
760
  results.append({
761
  "_file": filename,
762
  "_pages_in_pdf": page_count,
 
767
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
768
  "evidence": []
769
  })
770
+ else:
771
+ chunks = chunk_pages(pages, target_chars=int(chunk_chars))
772
+
773
+ queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
774
+ for k, ins in field_instr.items():
775
+ queries.append(ins if ins else k)
776
+
777
+ selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
778
+ context = build_context(selected, max_chars=int(max_context_chars))
779
+
780
+ extracted = openai_structured_extract(
781
+ client=client,
782
+ model=model,
783
+ schema=schema,
784
+ controlled_vocab=vocab,
785
+ field_instructions=field_instr,
786
+ context=context
787
+ )
788
+ extracted["_file"] = filename
789
+ extracted["_pages_in_pdf"] = page_count
790
+ results.append(extracted)
 
 
 
791
 
792
+ # flatten to internal records for vertical view + review/export
793
+ ex = results[-1]
794
  row = {
795
  "file": filename,
796
+ "paper_title": ex.get("paper_title",""),
797
+ "risk_stance": ex.get("risk_stance",""),
798
+ "risk_confidence": ex.get("risk_confidence",""),
799
+ "risk_summary": ex.get("risk_summary","")
800
  }
801
+ ext = ex.get("extracted") or {}
 
802
  for k in field_props.keys():
803
  v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
804
  if isinstance(v, list):
805
  row[k] = "; ".join([str(x) for x in v])
806
  else:
807
  row[k] = v
 
808
  flat_rows.append(row)
809
 
810
  df = pd.DataFrame(flat_rows)
811
+ records = df.to_dict("records")
812
 
813
  csv_path = tmpdir / "extraction_table.csv"
814
  json_path = tmpdir / "extraction_details.json"
815
  df.to_csv(csv_path, index=False)
816
  json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
817
 
 
818
  choices = [r["file"] for r in records if "file" in r]
819
  default = choices[0] if choices else None
820
  vertical = _make_vertical(records, default)
821
  evidence = _render_evidence(results, default)
822
 
823
+ overview = _overview_df_from_records(records)
824
 
825
+ status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
826
  return (
827
+ overview,
828
  str(csv_path),
829
  str(json_path),
830
  status,
 
842
  def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
843
  return _make_vertical(records, file_name), _render_evidence(details, file_name)
844
 
 
845
  def toggle_review_mode(is_on: bool):
 
846
  return gr.update(interactive=bool(is_on))
847
 
 
848
  def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
 
 
 
 
849
  if not file_name or not records:
850
+ return pd.DataFrame(), records, "Nothing to save."
851
 
 
852
  try:
853
  if isinstance(vertical_df, pd.DataFrame):
854
  dfv = vertical_df
855
  else:
 
856
  dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
857
  except Exception:
858
+ return _overview_df_from_records(records), records, "Could not parse edited vertical table."
859
 
860
  dfv = dfv.dropna(subset=["Field"])
861
  updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
862
 
 
863
  new_records = []
864
  updated = False
865
  for r in records:
 
872
  else:
873
  new_records.append(r)
874
 
875
+ msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
876
+ return _overview_df_from_records(new_records), new_records, msg
 
 
877
 
878
  def export_reviewed_csv(records: List[Dict[str, Any]]):
879
  if not records:
 
885
 
886
 
887
  # =============================
888
+ # Synthesis
889
  # =============================
890
  def run_synthesis(api_key, model, extraction_json_file):
891
  if extraction_json_file is None:
892
+ return "Upload the extraction_details.json from the Extract tab first."
893
 
894
  try:
895
  client = get_openai_client(api_key)
 
903
  # =============================
904
  # Gradio UI
905
  # =============================
906
+ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
907
  gr.Markdown(
908
+ "# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
909
+ "**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
910
+ "This UI is optimized for non-JSON users: **Controlled vocab editor** + **Field Builder**.\n"
911
+ "Raw JSON/spec are available under **Advanced**."
912
  )
913
 
914
+ # State
915
+ state_records = gr.State([]) # list[dict]
916
+ state_details = gr.State([]) # list[dict]
917
+ vocab_state = gr.State({}) # dict
918
+ field_rows_state = gr.State([]) # list[dict]
919
 
920
+ with gr.Tab("Extract"):
921
+ files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
922
 
923
  with gr.Row():
924
  api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
925
+ model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
 
 
 
 
926
 
927
  with gr.Row():
928
  max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
929
  chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
930
  max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
931
 
932
+ gr.Markdown("## Controlled Vocabulary (guided editor)")
933
+ vocab_mode = gr.Radio(choices=["Guided", "Advanced (Raw JSON)"], value="Guided", label="Vocab editor mode")
934
+
935
+ with gr.Row():
936
+ vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
937
+ vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
938
+ vocab_add_btn = gr.Button("Add")
939
+ with gr.Row():
940
+ vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
941
+ vocab_remove_btn = gr.Button("Remove")
942
+ vocab_apply_btn = gr.Button("Apply table changes to category")
943
+ vocab_reset_btn = gr.Button("Reset vocab to defaults")
944
+
945
+ vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (edit directly)", interactive=True, wrap=True)
946
+ vocab_status = gr.Textbox(label="Vocab status", interactive=False)
947
+
948
+ with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
949
+ vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
950
+
951
+ gr.Markdown("## Extraction Spec (Field Builder)")
952
+ with gr.Row():
953
+ preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
954
+ preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
955
+ preset_btn = gr.Button("Load preset")
956
+
957
+ with gr.Row():
958
+ field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
959
+ field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
960
+ enum_values_in = gr.Textbox(label="Enum values (comma-separated; used for enum/list[enum])", placeholder="a,b,c", lines=2)
961
+ instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
962
+
963
+ with gr.Row():
964
+ add_update_field_btn = gr.Button("Add/Update field")
965
+ remove_field_name = gr.Dropdown(label="Remove field", choices=[], value=None)
966
+ remove_field_btn = gr.Button("Remove")
967
+
968
+ fields_df = gr.Dataframe(
969
+ label="Fields (edit if needed, then click Apply)",
970
+ headers=["field","type","enum_values","instructions"],
971
+ interactive=True,
972
+ wrap=True
973
+ )
974
+ fields_apply_btn = gr.Button("Apply builder table")
975
+ fields_status = gr.Textbox(label="Field builder status", interactive=False)
976
+
977
+ with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
978
+ field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
979
 
980
  extract_btn = gr.Button("Run Extraction (Grounded)")
981
  status = gr.Textbox(label="Status", interactive=False)
982
 
983
+ # Replace wide table with a compact overview (not duplicate)
984
+ overview_df = gr.Dataframe(
985
+ label="Batch Overview (compact)",
986
  interactive=False,
987
  wrap=True,
988
  show_row_numbers=True,
989
  buttons=["fullscreen", "copy"]
990
  )
991
+
992
  with gr.Row():
993
  out_csv = gr.File(label="Download: extraction_table.csv")
994
  out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
 
998
 
999
  with gr.Row():
1000
  review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1001
+ save_btn = gr.Button("Save edits")
1002
  export_btn = gr.Button("Export reviewed CSV")
1003
 
1004
  review_status = gr.Textbox(label="Review status", interactive=False)
 
1011
  label="Vertical record view (Field → Value)"
1012
  )
1013
  evidence_md = gr.Markdown()
 
1014
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1015
 
1016
+ # -------------------------
1017
+ # INIT vocab + fields on load (via a button-less trick: use .load)
1018
+ # -------------------------
1019
+ def _init_all():
1020
+ v, keys, k0, df_terms, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
1021
+ frows, fdf, fspec, fmsg = fields_init_state()
1022
+ remove_choices = [r["field"] for r in frows]
1023
+ return (
1024
+ v, gr.update(choices=keys, value=k0), df_terms, vjson, vmsg,
1025
+ frows, fdf, fspec, fmsg, gr.update(choices=remove_choices, value=(remove_choices[0] if remove_choices else None))
1026
+ )
1027
+
1028
+ demo.load(
1029
+ _init_all,
1030
+ inputs=None,
1031
+ outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status,
1032
+ field_rows_state, fields_df, field_spec, fields_status, remove_field_name]
1033
+ )
1034
+
1035
+ # Vocab events
1036
+ vocab_category.change(
1037
+ fn=vocab_load_category,
1038
+ inputs=[vocab_state, vocab_category, vocab_search],
1039
+ outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
1040
+ )
1041
+ vocab_add_btn.click(
1042
+ fn=vocab_add_term,
1043
+ inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
1044
+ outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
1045
+ )
1046
+
1047
+ vocab_remove_btn.click(
1048
+ fn=vocab_remove_term,
1049
+ inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
1050
+ outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
1051
+ )
1052
+
1053
+ vocab_apply_btn.click(
1054
+ fn=vocab_apply_df,
1055
+ inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
1056
+ outputs=[vocab_json, vocab_terms_filtered, vocab_status]
1057
+ )
1058
+
1059
+ vocab_reset_btn.click(
1060
+ fn=vocab_reset_defaults,
1061
+ inputs=None,
1062
+ outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
1063
+ )
1064
+
1065
+ # Field builder events
1066
+ preset_btn.click(
1067
+ fn=fields_load_preset,
1068
+ inputs=[preset_name, preset_mode, field_rows_state],
1069
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1070
+ ).then(
1071
+ fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1072
+ inputs=[field_rows_state],
1073
+ outputs=[remove_field_name]
1074
+ )
1075
+
1076
+ add_update_field_btn.click(
1077
+ fn=fields_add_or_update,
1078
+ inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
1079
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1080
+ ).then(
1081
+ fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1082
+ inputs=[field_rows_state],
1083
+ outputs=[remove_field_name]
1084
+ )
1085
+
1086
+ remove_field_btn.click(
1087
+ fn=fields_remove,
1088
+ inputs=[remove_field_name, field_rows_state],
1089
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1090
+ ).then(
1091
+ fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1092
+ inputs=[field_rows_state],
1093
+ outputs=[remove_field_name]
1094
+ )
1095
+
1096
+ fields_apply_btn.click(
1097
+ fn=fields_apply_df,
1098
+ inputs=[field_rows_state, fields_df],
1099
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1100
+ ).then(
1101
+ fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1102
+ inputs=[field_rows_state],
1103
+ outputs=[remove_field_name]
1104
+ )
1105
+
1106
+ # Extraction
1107
  extract_btn.click(
1108
  fn=run_extraction,
1109
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
1110
+ outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1111
  )
1112
 
1113
+ # Vertical view selection
1114
  record_pick.change(
1115
  fn=on_pick,
1116
  inputs=[record_pick, state_records, state_details],
1117
  outputs=[vertical_view, evidence_md]
1118
  )
1119
 
1120
+ # Review mode
1121
+ review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
 
 
 
 
1122
 
 
1123
  save_btn.click(
1124
  fn=save_review_changes,
1125
  inputs=[record_pick, vertical_view, state_records],
1126
+ outputs=[overview_df, state_records, review_status]
1127
  )
1128
 
 
1129
  export_btn.click(
1130
  fn=export_reviewed_csv,
1131
  inputs=[state_records],
 
1133
  )
1134
 
1135
  with gr.Tab("Cross-paper Synthesis"):
1136
+ gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
1137
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1138
+ model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
 
 
 
 
1139
  extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1140
  synth_btn = gr.Button("Synthesize Across Papers")
1141
  synth_md = gr.Markdown()
1142
+ synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
 
 
 
 
 
1143
 
1144
  with gr.Tab("Pending tasks"):
1145
  gr.Markdown(
1146
+ "## Pending tasks\n\n"
1147
+ "1) One row per chemical–endpoint pair\n"
1148
+ "- Change schema to output `records[]` and flatten into multiple rows per paper\n\n"
1149
+ "2) Evidence verification\n"
1150
+ "- If evidence quote not found in context → blank value + flag UNVERIFIED\n\n"
1151
+ "3) Taxonomy mapping\n"
1152
+ "- Synonyms + preferred terms for FDA / OECD / MedDRA-like structure\n\n"
1153
+ "4) Column transforms\n"
1154
+ "- Parse NOAEL/LOAEL etc into structured {metric,value,unit,route,duration}\n\n"
1155
+ "5) Compare mode\n"
1156
+ "- Compare across papers by chemical/endpoint, output consensus + disagreements table\n\n"
1157
+ "6) OCR (optional)\n"
1158
+ "- Currently: text-based PDFs only; OCR adds heavy deps"
 
 
 
 
 
 
 
 
 
1159
  )
1160
 
1161
  if __name__ == "__main__":