hchevva commited on
Commit
ddb431d
·
verified ·
1 Parent(s): 6766619

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +480 -347
app.py CHANGED
@@ -3,7 +3,7 @@ import re
3
  import json
4
  import tempfile
5
  from pathlib import Path
6
- from typing import Dict, List, Tuple, Any
7
 
8
  import gradio as gr
9
  import numpy as np
@@ -23,7 +23,6 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
23
 
24
  "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
25
 
26
- "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
27
  "in_silico_method_enum": [
28
  "qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
29
  ],
@@ -36,8 +35,6 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
36
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
37
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
38
 
39
- "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
40
-
41
  "genotoxicity_oecd_tg_in_vitro_enum": [
42
  "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
43
  "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
@@ -54,81 +51,98 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
54
  "not_reported"
55
  ],
56
 
57
- "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
 
 
58
  }"""
59
 
60
- # (Used only as a fallback / advanced preview)
61
- DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
62
- # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
63
-
64
- Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
65
- CAS_numbers | list[str] | Extract any CAS numbers mentioned.
66
-
67
- Approach | enum[in_vivo,in_vitro,in_silico,nams,mixed,not_reported] | Identify if results are in silico or NAMs; use 'mixed' if multiple.
68
- In_silico_methods | list[enum[qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported]] | If in_silico, list methods used (can be multiple).
69
- NAMs_methods | list[enum[high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported]] | If NAMs, list methods used (can be multiple).
70
-
71
- Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
72
- Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
73
- Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
74
-
75
- Genotox_OECD_TG_in_vitro | list[enum[
76
- OECD_TG_471_Bacterial Reverse mutation test(AMES test),
77
- OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,
78
- OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),
79
- OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,
80
- OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),
81
- not_reported
82
- ]] | If genotoxicity in vitro tests are reported, select all applicable TGs. Otherwise not_reported.
83
-
84
- Genotox_OECD_TG_in_vivo | list[enum[
85
- OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,
86
- OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,
87
- OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,
88
- OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,
89
- not_reported
90
- ]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
91
-
92
- Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results. If unclear, not_reported.
93
- Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
94
-
95
- Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
96
- Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
97
- Conclusion | str | What does the paper conclude about safety/risk?
98
- """
99
-
100
 
101
  # =============================
102
- # Field presets (UI)
103
  # =============================
104
  PRESET_CORE = [
105
- {"field": "Chemical(s)", "type": "list[str]", "enum_values": "", "instructions": "Primary chemical(s) studied; include common name + abbreviation if present."},
106
- {"field": "CAS_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract any CAS numbers mentioned."},
107
- {"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
108
- {"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
109
- {"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
110
- {"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
111
- {"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
112
- {"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
113
  ]
114
 
115
  PRESET_NAMS_INSILICO = [
116
- {"field": "Approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use 'mixed' if multiple."},
117
- {"field": "In_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (can be multiple)."},
118
- {"field": "NAMs_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (can be multiple)."},
 
119
  ]
120
 
121
  PRESET_GENOTOX_OECD = [
122
- {"field": "Genotox_OECD_TG_in_vitro", "type": "list[enum]", "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported", "instructions": "If in vitro genotox tests are reported, select TGs. Otherwise not_reported."},
123
- {"field": "Genotox_OECD_TG_in_vivo", "type": "list[enum]", "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported", "instructions": "If in vivo genotox tests are reported, select TGs. Otherwise not_reported."},
124
- {"field": "Genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify based on reported results. If unclear, not_reported."},
125
- {"field": "Genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to the paper’s wording + test context."},
 
 
 
 
 
 
 
 
 
 
126
  ]
127
 
128
- PRESET_MAP = {
129
- "Core (recommended)": PRESET_CORE,
130
- "NAMs + In Silico": PRESET_NAMS_INSILICO,
131
- "Genotox (OECD TGs)": PRESET_GENOTOX_OECD,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  }
133
 
134
 
@@ -243,10 +257,10 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
243
  # Spec -> JSON schema
244
  # =============================
245
  def slugify_field(name: str) -> str:
246
- name = name.strip()
247
  name = re.sub(r"[^\w\s-]", "", name)
248
  name = re.sub(r"[\s-]+", "_", name).lower()
249
- return name[:60] if name else "field"
250
 
251
 
252
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
@@ -317,7 +331,7 @@ def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any])
317
  "type": "object",
318
  "additionalProperties": False,
319
  "properties": field_props,
320
- "required": all_field_keys # strict requirement
321
  },
322
  "evidence": {
323
  "type": "array",
@@ -359,15 +373,13 @@ def openai_structured_extract(
359
  vocab_text = json.dumps(controlled_vocab, indent=2)
360
 
361
  system_msg = (
362
- "You are a toxicology research paper data-extraction assistant.\n"
363
  "Grounding rules (must follow):\n"
364
  "1) Use ONLY the provided excerpts; do NOT invent details.\n"
365
- "2) If a value is not explicitly stated, output empty string or empty list (or an allowed enum like 'not_reported').\n"
366
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
367
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
368
  "5) Prefer controlled vocab terms when applicable.\n"
369
- "6) For OECD TG fields, only populate if explicitly stated or clearly described; otherwise use not_reported.\n"
370
- "7) For NAMs/in_silico fields, only populate if explicitly described; otherwise not_reported.\n"
371
  )
372
 
373
  user_msg = (
@@ -400,7 +412,7 @@ def openai_structured_extract(
400
 
401
  def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
402
  system_msg = (
403
- "You are a senior toxicology scientist summarizing multiple papers.\n"
404
  "Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
405
  "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
406
  )
@@ -412,16 +424,19 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
412
  # =============================
413
  # UI helpers: vertical view + evidence + overview
414
  # =============================
415
- def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
416
- if not records or not file_name:
417
  return pd.DataFrame(columns=["Field", "Value"])
418
- row = next((r for r in records if r.get("file") == file_name), None)
419
  if not row:
420
  return pd.DataFrame(columns=["Field", "Value"])
421
- return pd.DataFrame({"Field": list(row.keys()), "Value": [row[k] for k in row.keys()]})
422
 
 
 
 
423
 
424
- def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: int = 80) -> str:
 
425
  if not details or not file_name:
426
  return ""
427
  d = next((x for x in details if x.get("_file") == file_name), None)
@@ -429,29 +444,33 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
429
  return ""
430
  ev = d.get("evidence", []) or []
431
  lines = []
432
- for e in ev[:max_items]:
 
 
 
433
  quote = (e.get("quote", "") or "").strip()
434
  pages = (e.get("pages", "") or "").strip()
435
- field = (e.get("field", "") or "").strip()
436
  if quote:
437
- if len(quote) > 280:
438
- quote = quote[:280] + "…"
439
  lines.append(f"- **{field}** (pages {pages}): “{quote}”")
 
 
440
  header = "### Evidence (grounding)\n"
441
  return header + ("\n".join(lines) if lines else "- (no evidence returned)")
442
 
443
 
444
  def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
445
  if not records:
446
- return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
447
  df = pd.DataFrame(records)
448
- cols = ["file","paper_title","risk_stance","risk_confidence"]
449
  cols = [c for c in cols if c in df.columns]
450
  return df[cols].copy() if cols else df.head(50)
451
 
452
 
453
  # =============================
454
- # Controlled vocab guided editor (lists only) + SEARCH FILTER
455
  # =============================
456
  def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
457
  if df is None or df.empty:
@@ -473,7 +492,13 @@ def vocab_init_state(vocab_json: str):
473
  default_key = list_keys[0] if list_keys else None
474
  terms = vocab.get(default_key, []) if default_key else []
475
  full_df = pd.DataFrame({"term": terms})
476
- return vocab, list_keys, default_key, full_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
 
 
 
 
 
 
477
 
478
 
479
  def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
@@ -538,10 +563,6 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
538
  return vjson, filtered, f"✅ Applied {len(terms)} terms to {category}."
539
 
540
 
541
- def vocab_reset_defaults():
542
- return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
543
-
544
-
545
  def vocab_filter_preview(terms_df, search):
546
  try:
547
  df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
@@ -551,18 +572,18 @@ def vocab_filter_preview(terms_df, search):
551
 
552
 
553
  # =============================
554
- # Field builder (type dropdown + presets)
555
  # =============================
556
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
557
 
558
 
559
- def build_spec_from_field_df(df: pd.DataFrame) -> str:
560
  lines = [
561
  "# One field per line: Field Name | type | instructions",
562
  "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
563
  ""
564
  ]
565
- for _, r in df.iterrows():
566
  field = str(r.get("field","")).strip()
567
  ftype = str(r.get("type","")).strip()
568
  enums = str(r.get("enum_values","")).strip()
@@ -585,38 +606,50 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
585
  return "\n".join(lines).strip() + "\n"
586
 
587
 
588
- def fields_init_state():
589
- fields = []
590
- for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD):
591
- fields.append(dict(row))
592
- df = pd.DataFrame(fields, columns=["field","type","enum_values","instructions"])
593
- spec = build_spec_from_field_df(df)
594
- return fields, df, spec, "✅ Field builder loaded."
595
 
 
 
 
596
 
597
- def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
598
- preset = PRESET_MAP.get(preset_name)
599
- if not preset:
600
- df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
601
- return field_rows, df, build_spec_from_field_df(df), "Unknown preset."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
 
603
- if mode == "Replace":
604
- new_rows = [dict(r) for r in preset]
605
- else:
606
- new_rows = [dict(r) for r in field_rows]
607
- for p in preset:
608
- found = False
609
- for r in new_rows:
610
- if str(r.get("field","")).strip().lower() == str(p.get("field","")).strip().lower():
611
- r.update(p)
612
- found = True
613
- break
614
- if not found:
615
- new_rows.append(dict(p))
616
-
617
- df = pd.DataFrame(new_rows, columns=["field","type","enum_values","instructions"])
618
- spec = build_spec_from_field_df(df)
619
- return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
620
 
621
 
622
  def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
@@ -627,7 +660,7 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
627
 
628
  if not field_name or not ftype:
629
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
630
- return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
631
 
632
  updated = False
633
  for r in field_rows:
@@ -642,8 +675,7 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
642
  field_rows.append({"field": field_name, "type": ftype, "enum_values": enum_values, "instructions": instructions})
643
 
644
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
645
- spec = build_spec_from_field_df(df)
646
- return field_rows, df, spec, ("Updated field." if updated else "Added field.")
647
 
648
 
649
  def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
@@ -651,7 +683,7 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
651
  df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
652
  except Exception:
653
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
654
- return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
655
 
656
  cleaned = []
657
  seen = set()
@@ -669,10 +701,39 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
669
  cleaned.append({"field": field, "type": ftype, "enum_values": enums, "instructions": instr})
670
 
671
  df2 = pd.DataFrame(cleaned, columns=["field","type","enum_values","instructions"])
672
- spec = build_spec_from_field_df(df2)
673
  return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
674
 
675
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
  # =============================
677
  # Main extraction handler
678
  # =============================
@@ -680,33 +741,58 @@ def run_extraction(
680
  files,
681
  api_key,
682
  model,
 
683
  field_spec,
684
  vocab_json,
685
  max_pages,
686
  chunk_chars,
687
- max_context_chars
 
688
  ):
689
  if not files:
690
- return pd.DataFrame(), None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
 
 
 
691
 
692
  try:
693
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
694
  except Exception as e:
695
- return pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
 
 
 
696
 
697
- field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
698
  if not field_props:
699
- return pd.DataFrame(), None, None, "Extraction spec produced no fields.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
 
 
 
700
 
701
  schema = build_extraction_schema(field_props, vocab)
702
 
 
 
 
 
 
 
 
703
  try:
704
  client = get_openai_client(api_key)
705
  except Exception as e:
706
- return pd.DataFrame(), None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
 
 
 
707
 
708
- results: List[Dict[str, Any]] = []
709
- flat_rows: List[Dict[str, Any]] = []
710
 
711
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
712
 
@@ -723,21 +809,26 @@ def run_extraction(
723
  "paper_title": "",
724
  "risk_stance": "insufficient_data",
725
  "risk_confidence": 0.0,
726
- "risk_summary": "No extractable text found. This app supports text-based PDFs only.",
727
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
728
  "evidence": []
729
  }
730
- results.append(ex)
731
  else:
732
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
733
- queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
 
 
 
 
 
 
734
  for k, ins in field_instr.items():
735
  queries.append(ins if ins else k)
736
 
737
  selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
738
  context = build_context(selected, max_chars=int(max_context_chars))
739
 
740
- extracted = openai_structured_extract(
741
  client=client,
742
  model=model,
743
  schema=schema,
@@ -745,42 +836,76 @@ def run_extraction(
745
  field_instructions=field_instr,
746
  context=context
747
  )
748
- extracted["_file"] = filename
749
- extracted["_pages_in_pdf"] = page_count
750
- results.append(extracted)
 
751
 
752
- ex = results[-1]
753
- row = {
754
  "file": filename,
755
- "paper_title": ex.get("paper_title",""),
756
- "risk_stance": ex.get("risk_stance",""),
757
- "risk_confidence": ex.get("risk_confidence",""),
758
- "risk_summary": ex.get("risk_summary","")
759
  }
 
760
  ext = ex.get("extracted") or {}
761
- for k in field_props.keys():
762
- v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
763
- if isinstance(v, list):
764
- row[k] = "; ".join([str(x) for x in v])
765
- else:
766
- row[k] = v
767
- flat_rows.append(row)
768
-
769
- df = pd.DataFrame(flat_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
770
  records = df.to_dict("records")
771
 
772
  csv_path = tmpdir / "extraction_table.csv"
773
  json_path = tmpdir / "extraction_details.json"
774
  df.to_csv(csv_path, index=False)
775
- json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
776
 
777
- choices = [r["file"] for r in records if "file" in r]
778
  default = choices[0] if choices else None
779
- vertical = _make_vertical(records, default)
780
- evidence = _render_evidence(results, default)
 
 
 
 
 
 
 
 
 
 
781
  overview = _overview_df_from_records(records)
 
782
 
783
- status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
784
  return (
785
  overview,
786
  str(csv_path),
@@ -788,7 +913,7 @@ def run_extraction(
788
  status,
789
  gr.update(choices=choices, value=default),
790
  records,
791
- results,
792
  vertical,
793
  evidence
794
  )
@@ -797,16 +922,21 @@ def run_extraction(
797
  # =============================
798
  # Review mode handlers
799
  # =============================
800
- def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
801
- return _make_vertical(records, file_name), _render_evidence(details, file_name)
 
 
 
 
 
802
 
803
 
804
  def toggle_review_mode(is_on: bool):
805
  return gr.update(interactive=bool(is_on))
806
 
807
 
808
- def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
809
- if not file_name or not records:
810
  return pd.DataFrame(), records, "Nothing to save."
811
 
812
  try:
@@ -820,7 +950,7 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
820
  new_records = []
821
  updated = False
822
  for r in records:
823
- if r.get("file") == file_name:
824
  rr = dict(r)
825
  for k, v in updates.items():
826
  rr[k] = v
@@ -858,77 +988,189 @@ def run_synthesis(api_key, model, extraction_json_file):
858
  return openai_synthesize_across_papers(client, model, rows)
859
 
860
 
 
 
 
 
 
 
 
 
 
 
 
861
  # =============================
862
  # Gradio UI
863
  # =============================
864
  with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
865
  gr.Markdown(
866
- "# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
867
- "**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
868
- "UI includes a guided **Controlled Vocab editor** (lists only, with search) and a **Field Builder** (type dropdown + presets)."
869
  )
870
 
871
- # States
872
- state_records = gr.State([]) # list[dict]
873
- state_details = gr.State([]) # list[dict]
874
- vocab_state = gr.State({}) # dict
875
- field_rows_state = gr.State([]) # list[dict]
 
 
876
 
877
  with gr.Tab("Extract"):
878
- files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
 
 
 
 
 
 
 
 
 
 
 
 
879
 
880
- with gr.Row():
881
- api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
882
- model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
883
 
884
- with gr.Row():
885
- max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
886
- chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
887
- max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
 
 
 
888
 
889
- # -------------------------
890
- # Controlled Vocabulary (guided editor)
891
- # -------------------------
892
- gr.Markdown("## Controlled Vocabulary (guided editor)")
893
 
894
- vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
895
- vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
896
 
897
  with gr.Row():
898
- vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
899
- vocab_add_btn = gr.Button("Add")
 
 
 
 
900
  with gr.Row():
901
- vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
902
- vocab_remove_btn = gr.Button("Remove")
903
- vocab_apply_btn = gr.Button("Apply full list to category")
904
- vocab_reset_btn = gr.Button("Reset vocab to defaults")
905
-
906
- vocab_terms_df = gr.Dataframe(
907
- headers=["term"],
908
- label="Terms (full list; edit directly)",
909
- interactive=True,
910
- wrap=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  )
912
 
913
- vocab_terms_filtered = gr.Dataframe(
914
- headers=["term"],
915
- label="Filtered preview (read-only)",
916
- interactive=False,
917
- wrap=True
918
  )
919
 
920
- vocab_status = gr.Textbox(label="Vocab status", interactive=False)
 
 
 
 
921
 
922
- with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
923
- vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
 
 
 
924
 
925
- # Filter preview wiring (must be AFTER vocab_terms_df exists)
926
- vocab_search.change(
927
- fn=vocab_filter_preview,
928
- inputs=[vocab_terms_df, vocab_search],
929
- outputs=[vocab_terms_filtered]
 
 
 
 
 
 
 
930
  )
931
 
 
 
932
  vocab_category.change(
933
  fn=vocab_load_category,
934
  inputs=[vocab_state, vocab_category, vocab_search],
@@ -950,53 +1192,22 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
950
  vocab_apply_btn.click(
951
  fn=vocab_apply_df,
952
  inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
953
- outputs=[vocab_json, vocab_terms_filtered, vocab_status]
 
 
 
 
954
  )
955
 
956
  vocab_reset_btn.click(
957
- fn=vocab_reset_defaults,
958
  inputs=None,
959
- outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
960
- ).then(
961
- fn=vocab_load_category,
962
- inputs=[vocab_state, vocab_category, vocab_search],
963
- outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
964
- )
965
-
966
- # -------------------------
967
- # Field Builder
968
- # -------------------------
969
- gr.Markdown("## Extraction Spec (Field Builder)")
970
-
971
- with gr.Row():
972
- preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
973
- preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
974
- preset_btn = gr.Button("Load preset")
975
-
976
- with gr.Row():
977
- field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
978
- field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
979
- enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
980
- instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
981
-
982
- add_update_field_btn = gr.Button("Add/Update field")
983
-
984
- fields_df = gr.Dataframe(
985
- label="Fields (edit and click Apply)",
986
- headers=["field","type","enum_values","instructions"],
987
- interactive=True,
988
- wrap=True
989
  )
990
 
991
- fields_apply_btn = gr.Button("Apply builder table")
992
- fields_status = gr.Textbox(label="Field builder status", interactive=False)
993
-
994
- with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
995
- field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
996
-
997
- preset_btn.click(
998
- fn=fields_load_preset,
999
- inputs=[preset_name, preset_mode, field_rows_state],
1000
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1001
  )
1002
 
@@ -1012,88 +1223,26 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1012
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1013
  )
1014
 
1015
- # -------------------------
1016
- # Run extraction
1017
- # -------------------------
1018
- extract_btn = gr.Button("Run Extraction (Grounded)")
1019
- status = gr.Textbox(label="Status", interactive=False)
1020
-
1021
- overview_df = gr.Dataframe(
1022
- label="Batch Overview (compact)",
1023
- interactive=False,
1024
- wrap=True,
1025
- show_row_numbers=True,
1026
- buttons=["fullscreen", "copy"]
1027
- )
1028
-
1029
- with gr.Row():
1030
- out_csv = gr.File(label="Download: extraction_table.csv")
1031
- out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
1032
-
1033
- gr.Markdown("## Readable view (vertical) + evidence")
1034
- record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
1035
-
1036
- with gr.Row():
1037
- review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1038
- save_btn = gr.Button("Save edits")
1039
- export_btn = gr.Button("Export reviewed CSV")
1040
-
1041
- review_status = gr.Textbox(label="Review status", interactive=False)
1042
-
1043
- vertical_view = gr.Dataframe(
1044
- headers=["Field", "Value"],
1045
- interactive=False,
1046
- wrap=True,
1047
- show_row_numbers=False,
1048
- label="Vertical record view (Field → Value)"
1049
- )
1050
- evidence_md = gr.Markdown()
1051
- reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1052
-
1053
- extract_btn.click(
1054
- fn=run_extraction,
1055
- inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
1056
- outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1057
- )
1058
-
1059
- record_pick.change(
1060
- fn=on_pick,
1061
- inputs=[record_pick, state_records, state_details],
1062
- outputs=[vertical_view, evidence_md]
1063
- )
1064
-
1065
- review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
1066
-
1067
- save_btn.click(
1068
- fn=save_review_changes,
1069
- inputs=[record_pick, vertical_view, state_records],
1070
- outputs=[overview_df, state_records, review_status]
1071
- )
1072
 
1073
- export_btn.click(
1074
- fn=export_reviewed_csv,
1075
- inputs=[state_records],
1076
- outputs=[reviewed_csv, review_status]
1077
- )
1078
 
1079
- # -------------------------
1080
- # Initialize vocab + fields on load
1081
- # -------------------------
1082
- def _init_all():
1083
- v, keys, k0, full_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
1084
- filtered_df = _filter_terms_df(full_df, "")
1085
- frows, fdf, fspec, fmsg = fields_init_state()
1086
  return (
1087
- v,
1088
  gr.update(choices=keys, value=k0),
1089
  full_df,
1090
  filtered_df,
1091
  vjson,
1092
  vmsg,
1093
- frows,
 
1094
  fdf,
1095
  fspec,
1096
- fmsg
1097
  )
1098
 
1099
  demo.load(
@@ -1104,12 +1253,13 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1104
  vocab_category,
1105
  vocab_terms_df,
1106
  vocab_terms_filtered,
1107
- vocab_json,
1108
  vocab_status,
 
1109
  field_rows_state,
1110
  fields_df,
1111
  field_spec,
1112
- fields_status
1113
  ]
1114
  )
1115
 
@@ -1122,23 +1272,6 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1122
  synth_md = gr.Markdown()
1123
  synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
1124
 
1125
- with gr.Tab("Pending tasks"):
1126
- gr.Markdown(
1127
- "## Pending tasks\n\n"
1128
- "1) One row per chemical–endpoint pair\n"
1129
- "- Change schema to output `records[]` and flatten into multiple rows per paper\n\n"
1130
- "2) Evidence verification\n"
1131
- "- If evidence quote not found in context → blank value + flag UNVERIFIED\n\n"
1132
- "3) Taxonomy mapping\n"
1133
- "- Synonyms + preferred terms for FDA / OECD / MedDRA-like structure\n\n"
1134
- "4) Column transforms\n"
1135
- "- Parse NOAEL/LOAEL etc into structured {metric,value,unit,route,duration}\n\n"
1136
- "5) Compare mode\n"
1137
- "- Compare across papers by chemical/endpoint, output consensus + disagreements table\n\n"
1138
- "6) OCR (optional)\n"
1139
- "- Currently: text-based PDFs only; OCR adds heavy deps"
1140
- )
1141
-
1142
  if __name__ == "__main__":
1143
  port = int(os.environ.get("PORT", "7860"))
1144
  demo.queue().launch(server_name="0.0.0.0", server_port=port)
 
3
  import json
4
  import tempfile
5
  from pathlib import Path
6
+ from typing import Dict, List, Tuple, Any, Optional
7
 
8
  import gradio as gr
9
  import numpy as np
 
23
 
24
  "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
25
 
 
26
  "in_silico_method_enum": [
27
  "qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
28
  ],
 
35
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
36
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
37
 
 
 
38
  "genotoxicity_oecd_tg_in_vitro_enum": [
39
  "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
40
  "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
 
51
  "not_reported"
52
  ],
53
 
54
+ "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"],
55
+ "binary_result_enum": ["positive","negative","equivocal","not_reported"],
56
+ "carcinogenicity_result_enum": ["carcinogenic","not_carcinogenic","insufficient_data","not_reported"]
57
  }"""
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # =============================
61
+ # Endpoint modules (what users choose)
62
  # =============================
63
  PRESET_CORE = [
64
+ {"field": "chemicals", "type": "list[str]", "enum_values": "", "instructions": "List chemical(s) studied. If multiple, include each separately."},
65
+ {"field": "cas_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract CAS number(s) mentioned (may be multiple)."},
66
+ {"field": "study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other,not_reported", "instructions": "Choose best match."},
67
+ {"field": "exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
68
+ {"field": "species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
69
+ {"field": "dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Capture NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units and route if available."},
70
+ {"field": "key_findings", "type": "str", "enum_values": "", "instructions": "24 short sentences summarizing major findings. Grounded to text."},
71
+ {"field": "conclusion", "type": "str", "enum_values": "", "instructions": "Paper's conclusion about safety/risk (grounded)."},
72
  ]
73
 
74
  PRESET_NAMS_INSILICO = [
75
+ {"field": "approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use mixed if multiple."},
76
+ {"field": "in_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (multiple allowed)."},
77
+ {"field": "nams_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (multiple allowed)."},
78
+ {"field": "nams_or_insilico_key_results", "type": "str", "enum_values": "", "instructions": "Summarize in silico / NAMs results and key metrics (grounded)."},
79
  ]
80
 
81
  PRESET_GENOTOX_OECD = [
82
+ {
83
+ "field": "genotox_oecd_tg_in_vitro",
84
+ "type": "list[enum]",
85
+ "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported",
86
+ "instructions": "Select all in vitro OECD TGs explicitly reported (or clearly described). If none, use not_reported."
87
+ },
88
+ {
89
+ "field": "genotox_oecd_tg_in_vivo",
90
+ "type": "list[enum]",
91
+ "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
92
+ "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
93
+ },
94
+ {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
95
+ {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
96
  ]
97
 
98
+ PRESET_ACUTE_TOX = [
99
+ {"field": "acute_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "If acute toxicity is assessed, classify as positive/negative/equivocal; otherwise not_reported."},
100
+ {"field": "acute_toxicity_key_metrics", "type": "list[str]", "enum_values": "", "instructions": "Extract LD50/LC50/EC50/IC50 etc with units/route/species if available."},
101
+ {"field": "acute_toxicity_notes", "type": "str", "enum_values": "", "instructions": "Grounded summary of acute toxicity findings."},
102
+ ]
103
+
104
+ PRESET_REPEATED_DOSE = [
105
+ {"field": "repeated_dose_noael_loael", "type": "list[str]", "enum_values": "", "instructions": "Extract NOAEL/LOAEL (and study duration) with units/route if available."},
106
+ {"field": "repeated_dose_target_organs", "type": "list[str]", "enum_values": "", "instructions": "List target organs/critical effects explicitly reported."},
107
+ {"field": "repeated_dose_notes", "type": "str", "enum_values": "", "instructions": "Grounded summary of repeated-dose toxicity conclusions."},
108
+ ]
109
+
110
+ PRESET_IRR_SENS = [
111
+ {"field": "skin_irritation_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Skin irritation outcome (as reported)."},
112
+ {"field": "eye_irritation_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Eye irritation outcome (as reported)."},
113
+ {"field": "skin_sensitization_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Skin sensitization outcome (as reported)."},
114
+ {"field": "irritation_sensitization_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including method/model if stated."},
115
+ ]
116
+
117
+ PRESET_REPRO_DEV = [
118
+ {"field": "reproductive_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Reproductive toxicity outcome (as reported)."},
119
+ {"field": "developmental_toxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Developmental toxicity outcome (as reported)."},
120
+ {"field": "repro_dev_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including endpoints and study design if stated."},
121
+ ]
122
+
123
+ PRESET_CARCINOGENICITY = [
124
+ {"field": "carcinogenicity_result", "type": "enum", "enum_values": "carcinogenic,not_carcinogenic,insufficient_data,not_reported", "instructions": "As reported. If evidence insufficient, insufficient_data."},
125
+ {"field": "carcinogenicity_notes", "type": "str", "enum_values": "", "instructions": "Grounded notes including species, duration, tumor findings if stated."},
126
+ ]
127
+
128
+ ENDPOINT_MODULES: Dict[str, List[Dict[str, Any]]] = {
129
+ "Genotoxicity (OECD TG)": PRESET_GENOTOX_OECD,
130
+ "NAMs / In Silico": PRESET_NAMS_INSILICO,
131
+ "Acute toxicity": PRESET_ACUTE_TOX,
132
+ "Repeated dose toxicity": PRESET_REPEATED_DOSE,
133
+ "Irritation / Sensitization": PRESET_IRR_SENS,
134
+ "Repro / Developmental": PRESET_REPRO_DEV,
135
+ "Carcinogenicity": PRESET_CARCINOGENICITY,
136
+ }
137
+
138
# Extra retrieval queries per endpoint label. run_extraction appends the
# hints of every selected endpoint to the query list it hands to
# select_relevant_chunks, so these steer which parts of a paper reach the
# model.
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
    "Genotoxicity (OECD TG)": [
        "genotoxicity",
        "mutagenicity",
        "AMES",
        "micronucleus",
        "comet assay",
        "chromosomal aberration",
        # Lists every guideline number offered by the genotoxicity preset;
        # 475 and 488 were previously missing from this hint.
        "OECD TG 471 473 476 487 490 474 475 488 489",
    ],
    "NAMs / In Silico": [
        "in silico",
        "QSAR",
        "read-across",
        "AOP",
        "PBPK",
        "high-throughput",
        "omics",
        "organ-on-chip",
        "microphysiological",
    ],
    "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
    "Repeated dose toxicity": [
        "repeated dose",
        "subchronic",
        "chronic",
        "NOAEL",
        "LOAEL",
        "target organ",
        "90-day",
        "28-day",
    ],
    "Irritation / Sensitization": ["skin irritation", "eye irritation", "sensitization", "LLNA", "Draize"],
    "Repro / Developmental": [
        "reproductive toxicity",
        "fertility",
        "developmental toxicity",
        "teratogenic",
        "prenatal",
        "postnatal",
    ],
    "Carcinogenicity": ["carcinogenicity", "tumor", "neoplasm", "cancer", "two-year bioassay"],
}
147
 
148
 
 
257
  # Spec -> JSON schema
258
  # =============================
259
  def slugify_field(name: str) -> str:
260
+ name = (name or "").strip()
261
  name = re.sub(r"[^\w\s-]", "", name)
262
  name = re.sub(r"[\s-]+", "_", name).lower()
263
+ return name[:80] if name else "field"
264
 
265
 
266
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
 
331
  "type": "object",
332
  "additionalProperties": False,
333
  "properties": field_props,
334
+ "required": all_field_keys
335
  },
336
  "evidence": {
337
  "type": "array",
 
373
  vocab_text = json.dumps(controlled_vocab, indent=2)
374
 
375
  system_msg = (
376
+ "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
377
  "Grounding rules (must follow):\n"
378
  "1) Use ONLY the provided excerpts; do NOT invent details.\n"
379
+ "2) If a value is not explicitly stated, output empty string or empty list, OR the enum value 'not_reported'/'insufficient_data' when applicable.\n"
380
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
381
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
382
  "5) Prefer controlled vocab terms when applicable.\n"
 
 
383
  )
384
 
385
  user_msg = (
 
412
 
413
  def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
414
  system_msg = (
415
+ "You are a senior toxicology safety assessor summarizing multiple papers.\n"
416
  "Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
417
  "Base strictly on the provided extracted JSON (which is evidence-backed).\n"
418
  )
 
424
  # =============================
425
  # UI helpers: vertical view + evidence + overview
426
  # =============================
427
+ def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
428
+ if not records or not record_id:
429
  return pd.DataFrame(columns=["Field", "Value"])
430
+ row = next((r for r in records if r.get("record_id") == record_id), None)
431
  if not row:
432
  return pd.DataFrame(columns=["Field", "Value"])
 
433
 
434
+ hidden = {"record_id"}
435
+ keys = [k for k in row.keys() if k not in hidden]
436
+ return pd.DataFrame({"Field": keys, "Value": [row[k] for k in keys]})
437
 
438
+
439
+ def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
440
  if not details or not file_name:
441
  return ""
442
  d = next((x for x in details if x.get("_file") == file_name), None)
 
444
  return ""
445
  ev = d.get("evidence", []) or []
446
  lines = []
447
+ for e in ev:
448
+ field = (e.get("field", "") or "").strip()
449
+ if allowed_fields is not None and field and field not in allowed_fields:
450
+ continue
451
  quote = (e.get("quote", "") or "").strip()
452
  pages = (e.get("pages", "") or "").strip()
 
453
  if quote:
454
+ if len(quote) > 320:
455
+ quote = quote[:320] + "…"
456
  lines.append(f"- **{field}** (pages {pages}): “{quote}”")
457
+ if len(lines) >= max_items:
458
+ break
459
  header = "### Evidence (grounding)\n"
460
  return header + ("\n".join(lines) if lines else "- (no evidence returned)")
461
 
462
 
463
  def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
464
  if not records:
465
+ return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
466
  df = pd.DataFrame(records)
467
+ cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
468
  cols = [c for c in cols if c in df.columns]
469
  return df[cols].copy() if cols else df.head(50)
470
 
471
 
472
  # =============================
473
+ # Controlled vocab editor helpers (lists only) + search filter
474
  # =============================
475
  def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
476
  if df is None or df.empty:
 
492
  default_key = list_keys[0] if list_keys else None
493
  terms = vocab.get(default_key, []) if default_key else []
494
  full_df = pd.DataFrame({"term": terms})
495
+ filtered_df = _filter_terms_df(full_df, "")
496
+ return vocab, list_keys, default_key, full_df, filtered_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
497
+
498
+
499
def vocab_reset_defaults_ui():
    """Reload the built-in controlled vocabulary and return fresh UI values.

    Returns a 7-tuple: the vocab dict, a dropdown update carrying the
    category choices with the first category selected, the full terms frame,
    the filtered terms frame, the vocab JSON text, a status message, and the
    JSON text a second time (presumably for a second JSON textbox - confirm
    against the click wiring).
    """
    state = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
    vocab, categories, first_category, all_terms, visible_terms, raw_json, message = state
    category_update = gr.update(choices=categories, value=first_category)
    return vocab, category_update, all_terms, visible_terms, raw_json, message, raw_json
502
 
503
 
504
  def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
 
563
  return vjson, filtered, f"✅ Applied {len(terms)} terms to {category}."
564
 
565
 
 
 
 
 
566
  def vocab_filter_preview(terms_df, search):
567
  try:
568
  df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
 
572
 
573
 
574
  # =============================
575
+ # Field builder (admin) + endpoint selection mapping
576
  # =============================
577
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
578
 
579
 
580
+ def build_spec_from_field_rows(rows: List[Dict[str, Any]]) -> str:
581
  lines = [
582
  "# One field per line: Field Name | type | instructions",
583
  "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
584
  ""
585
  ]
586
+ for r in rows:
587
  field = str(r.get("field","")).strip()
588
  ftype = str(r.get("type","")).strip()
589
  enums = str(r.get("enum_values","")).strip()
 
606
  return "\n".join(lines).strip() + "\n"
607
 
608
 
609
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
    """Assemble field-builder rows for the Core preset plus the chosen modules.

    Rows are taken from ``PRESET_CORE`` first, then from each selected module
    in the order given; labels missing from ``ENDPOINT_MODULES`` are ignored.
    When a field name (stripped, case-insensitive) occurs more than once,
    only the first occurrence is kept, and that occurrence also decides which
    module owns the field.

    Args:
        selected_endpoints: Module labels (keys of ``ENDPOINT_MODULES``).
            ``None`` or an empty list selects the Core fields only.

    Returns:
        ``(rows, field_key_to_module)``. ``rows`` holds shallow copies of the
        kept preset dicts. ``field_key_to_module`` maps each kept row's
        ``slugify_field`` key to ``"Core"`` or to the supplying module label.
    """
    rows: List[Dict[str, Any]] = []
    field_key_to_module: Dict[str, str] = {}
    seen_names = set()

    def _collect(preset: List[Dict[str, Any]], owner: str) -> None:
        """Append the not-yet-seen rows of *preset*, recording *owner* for each."""
        for spec in preset:
            name_key = str(spec.get("field", "")).strip().lower()
            if not name_key or name_key in seen_names:
                continue
            seen_names.add(name_key)
            rows.append(dict(spec))
            # First writer wins, so the recorded owner always matches the row
            # that survived de-duplication (previously a later duplicate
            # could re-label a field whose own row had been dropped).
            field_key_to_module.setdefault(slugify_field(spec["field"]), owner)

    _collect(PRESET_CORE, "Core")
    for module in selected_endpoints or []:
        _collect(ENDPOINT_MODULES.get(module) or [], module)

    return rows, field_key_to_module
637
+
638
+
639
def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool):
    """Rebuild the field rows from the endpoint selection unless admin mode is on.

    Returns a 4-tuple ``(rows, table, spec, status)``. In admin mode the first
    three items are no-op ``gr.update()`` objects so the custom columns are
    left untouched; only the status message changes.
    """
    if not admin_mode:
        field_rows = build_rows_from_endpoints(selected_endpoints)[0]
        table = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
        spec_text = build_spec_from_field_rows(field_rows)
        return field_rows, table, spec_text, "✅ Columns updated from selected endpoints."

    untouched = [gr.update() for _ in range(3)]
    return (*untouched, "Admin mode: endpoint selection will not overwrite custom columns.")
646
 
647
+
648
def admin_apply_endpoints(selected_endpoints: List[str]):
    """Replace the builder contents with the rows of the selected endpoints.

    Returns ``(rows, table, spec, status)``: the preset rows, the same rows
    as a DataFrame with the four builder columns, the field-spec text
    generated from them, and a confirmation message.
    """
    field_rows = build_rows_from_endpoints(selected_endpoints)[0]
    builder_columns = ["field","type","enum_values","instructions"]
    table = pd.DataFrame(field_rows, columns=builder_columns)
    spec_text = build_spec_from_field_rows(field_rows)
    message = "✅ Loaded selected endpoints into the builder (Replace)."
    return field_rows, table, spec_text, message
 
 
 
 
 
 
 
 
 
 
 
653
 
654
 
655
  def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
 
660
 
661
  if not field_name or not ftype:
662
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
663
+ return field_rows, df, build_spec_from_field_rows(field_rows), "Field name and type are required."
664
 
665
  updated = False
666
  for r in field_rows:
 
675
  field_rows.append({"field": field_name, "type": ftype, "enum_values": enum_values, "instructions": instructions})
676
 
677
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
678
+ return field_rows, df, build_spec_from_field_rows(field_rows), ("Updated field." if updated else "Added field.")
 
679
 
680
 
681
  def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
 
683
  df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
684
  except Exception:
685
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
686
+ return field_rows, df, build_spec_from_field_rows(field_rows), "Could not parse builder table."
687
 
688
  cleaned = []
689
  seen = set()
 
701
  cleaned.append({"field": field, "type": ftype, "enum_values": enums, "instructions": instr})
702
 
703
  df2 = pd.DataFrame(cleaned, columns=["field","type","enum_values","instructions"])
704
+ spec = build_spec_from_field_rows(cleaned)
705
  return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
706
 
707
 
708
+ # =============================
709
+ # Row-building logic (paper vs chemical-endpoint)
710
+ # =============================
711
+ def _as_list(x) -> List[str]:
712
+ if x is None:
713
+ return []
714
+ if isinstance(x, list):
715
+ out = []
716
+ for v in x:
717
+ s = str(v).strip()
718
+ if s:
719
+ out.append(s)
720
+ return out
721
+ s = str(x).strip()
722
+ return [s] if s else []
723
+
724
+
725
+ def _format_value(v: Any) -> Any:
726
+ if isinstance(v, list):
727
+ return "; ".join([str(x) for x in v if str(x).strip()])
728
+ return v
729
+
730
+
731
+ def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
732
+ chemical = (chemical or "").strip() or "-"
733
+ endpoint = (endpoint or "").strip() or "Paper"
734
+ return f"{file_name} | {chemical} | {endpoint}"
735
+
736
+
737
  # =============================
738
  # Main extraction handler
739
  # =============================
 
741
  files,
742
  api_key,
743
  model,
744
+ selected_endpoints,
745
  field_spec,
746
  vocab_json,
747
  max_pages,
748
  chunk_chars,
749
+ max_context_chars,
750
+ admin_mode
751
  ):
752
  if not files:
753
+ return (
754
+ pd.DataFrame(), None, None, "Upload one or more PDFs.",
755
+ gr.update(choices=[], value=None),
756
+ [], [], pd.DataFrame(columns=["Field","Value"]), ""
757
+ )
758
 
759
  try:
760
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
761
  except Exception as e:
762
+ return (
763
+ pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
764
+ gr.update(choices=[], value=None),
765
+ [], [], pd.DataFrame(columns=["Field","Value"]), ""
766
+ )
767
 
768
+ field_props, field_instr = parse_field_spec(field_spec or "")
769
  if not field_props:
770
+ return (
771
+ pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
772
+ gr.update(choices=[], value=None),
773
+ [], [], pd.DataFrame(columns=["Field","Value"]), ""
774
+ )
775
 
776
  schema = build_extraction_schema(field_props, vocab)
777
 
778
+ if admin_mode:
779
+ field_key_to_module = {k: "Custom" for k in field_props.keys()}
780
+ endpoint_modules_for_rows = ["Custom"]
781
+ else:
782
+ _, field_key_to_module = build_rows_from_endpoints(selected_endpoints or [])
783
+ endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
784
+
785
  try:
786
  client = get_openai_client(api_key)
787
  except Exception as e:
788
+ return (
789
+ pd.DataFrame(), None, None, str(e),
790
+ gr.update(choices=[], value=None),
791
+ [], [], pd.DataFrame(columns=["Field","Value"]), ""
792
+ )
793
 
794
+ paper_details: List[Dict[str, Any]] = []
795
+ output_rows: List[Dict[str, Any]] = []
796
 
797
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
798
 
 
809
  "paper_title": "",
810
  "risk_stance": "insufficient_data",
811
  "risk_confidence": 0.0,
812
+ "risk_summary": "No extractable text found. This app supports text-based PDFs only (not scanned images).",
813
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
814
  "evidence": []
815
  }
 
816
  else:
817
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
818
+
819
+ queries = [
820
+ "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
821
+ "chemical name CAS number",
822
+ ]
823
+ for ep in (selected_endpoints or []):
824
+ queries.extend(ENDPOINT_QUERY_HINTS.get(ep, []))
825
  for k, ins in field_instr.items():
826
  queries.append(ins if ins else k)
827
 
828
  selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
829
  context = build_context(selected, max_chars=int(max_context_chars))
830
 
831
+ ex = openai_structured_extract(
832
  client=client,
833
  model=model,
834
  schema=schema,
 
836
  field_instructions=field_instr,
837
  context=context
838
  )
839
+ ex["_file"] = filename
840
+ ex["_pages_in_pdf"] = page_count
841
+
842
+ paper_details.append(ex)
843
 
844
+ base = {
 
845
  "file": filename,
846
+ "paper_title": ex.get("paper_title", ""),
847
+ "risk_stance": ex.get("risk_stance", ""),
848
+ "risk_confidence": ex.get("risk_confidence", ""),
849
+ "risk_summary": ex.get("risk_summary", ""),
850
  }
851
+
852
  ext = ex.get("extracted") or {}
853
+ chemicals = _as_list(ext.get("chemicals"))
854
+ if not chemicals:
855
+ chemicals = ["-"]
856
+
857
+ if len(chemicals) <= 1:
858
+ row = dict(base)
859
+ row["chemical"] = chemicals[0]
860
+ row["endpoint"] = "Paper"
861
+ row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
862
+ for k in field_props.keys():
863
+ row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
864
+ output_rows.append(row)
865
+ else:
866
+ core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
867
+ for chem in chemicals:
868
+ for module in endpoint_modules_for_rows:
869
+ row = dict(base)
870
+ row["chemical"] = chem
871
+ row["endpoint"] = module
872
+ row["record_id"] = _record_id(filename, chem, module)
873
+
874
+ for k in field_props.keys():
875
+ m = field_key_to_module.get(k, "Custom")
876
+ include = (k in core_keys) or (m == module) or admin_mode
877
+ if include:
878
+ if k == "chemicals":
879
+ row[k] = chem # make per-row chemical consistent
880
+ else:
881
+ row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
882
+ output_rows.append(row)
883
+
884
+ df = pd.DataFrame(output_rows)
885
  records = df.to_dict("records")
886
 
887
  csv_path = tmpdir / "extraction_table.csv"
888
  json_path = tmpdir / "extraction_details.json"
889
  df.to_csv(csv_path, index=False)
890
+ json_path.write_text(json.dumps(paper_details, indent=2), encoding="utf-8")
891
 
892
+ choices = [r.get("record_id") for r in records if r.get("record_id")]
893
  default = choices[0] if choices else None
894
+
895
+ vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
896
+ allowed_fields = None
897
+ if default:
898
+ selected_row = next((r for r in records if r.get("record_id") == default), {})
899
+ allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
900
+
901
+ file_for_evidence = None
902
+ if default:
903
+ file_for_evidence = default.split(" | ")[0].strip()
904
+ evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
905
+
906
  overview = _overview_df_from_records(records)
907
+ status = "✅ Done. Review in the report below and export when ready."
908
 
 
909
  return (
910
  overview,
911
  str(csv_path),
 
913
  status,
914
  gr.update(choices=choices, value=default),
915
  records,
916
+ paper_details,
917
  vertical,
918
  evidence
919
  )
 
922
  # =============================
923
  # Review mode handlers
924
  # =============================
925
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Produce the vertical table and evidence markdown for the chosen record.

    Args:
        record_id: Identifier selected in the record dropdown.
        records: Flat record dicts (may be ``None``).
        details: Per-paper extraction details handed to ``_render_evidence``.

    Returns:
        ``(vertical_table, evidence_markdown)``. With no *record_id*, this is
        an empty Field/Value frame and an empty string.
    """
    if not record_id:
        return pd.DataFrame(columns=["Field","Value"]), ""

    picked = {}
    for candidate in records or []:
        if candidate.get("record_id") == record_id:
            picked = candidate
            break

    source_file = picked.get("file") or ""
    # Evidence is limited to the fields this record actually carries.
    evidence_fields = {key for key in picked if key != "record_id"}

    vertical = _make_vertical(records, record_id)
    evidence = _render_evidence(details, source_file, allowed_fields=evidence_fields)
    return vertical, evidence
932
 
933
 
934
  def toggle_review_mode(is_on: bool):
935
  return gr.update(interactive=bool(is_on))
936
 
937
 
938
+ def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
939
+ if not record_id or not records:
940
  return pd.DataFrame(), records, "Nothing to save."
941
 
942
  try:
 
950
  new_records = []
951
  updated = False
952
  for r in records:
953
+ if r.get("record_id") == record_id:
954
  rr = dict(r)
955
  for k, v in updates.items():
956
  rr[k] = v
 
988
  return openai_synthesize_across_papers(client, model, rows)
989
 
990
 
991
+ # =============================
992
+ # UI visibility helpers
993
+ # =============================
994
def set_admin_visibility(is_admin: bool):
    """Return three identical visibility updates driven by *is_admin*.

    The tuple has one ``gr.update(visible=...)`` per output component
    (presumably the three admin groups - confirm against the event wiring).
    """
    visible = bool(is_admin)
    return tuple(gr.update(visible=visible) for _ in range(3))
1000
+
1001
+
1002
  # =============================
1003
  # Gradio UI
1004
  # =============================
1005
  with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1006
  gr.Markdown(
1007
+ "# Toxicology PDF → Grounded Extractor\n"
1008
+ "Upload PDFs choose endpoints Run review report export.\n\n"
1009
+ "**Note:** Text-based PDFs only (not scanned/image PDFs)."
1010
  )
1011
 
1012
+ state_records = gr.State([])
1013
+ state_details = gr.State([])
1014
+ vocab_state = gr.State({})
1015
+ field_rows_state = gr.State([])
1016
+
1017
+ field_spec = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) field spec")
1018
+ vocab_json = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) vocab json")
1019
 
1020
  with gr.Tab("Extract"):
1021
+ with gr.Group():
1022
+ files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1023
+
1024
+ with gr.Row():
1025
+ api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1026
+ model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1027
+
1028
+ endpoints = gr.Dropdown(
1029
+ label="Endpoints to extract (Core included automatically)",
1030
+ choices=list(ENDPOINT_MODULES.keys()),
1031
+ multiselect=True,
1032
+ value=["Genotoxicity (OECD TG)"]
1033
+ )
1034
 
1035
+ extract_btn = gr.Button("Run Extraction", variant="primary")
1036
+ status = gr.Textbox(label="Status", interactive=False)
 
1037
 
1038
+ gr.Markdown("## Report")
1039
+ overview_df = gr.Dataframe(
1040
+ label="Batch Overview",
1041
+ interactive=False,
1042
+ wrap=True,
1043
+ show_row_numbers=True
1044
+ )
1045
 
1046
+ with gr.Row():
1047
+ out_csv = gr.File(label="Download: extraction_table.csv")
1048
+ out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
 
1049
 
1050
+ record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
 
1051
 
1052
  with gr.Row():
1053
+ review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1054
+ save_btn = gr.Button("Save edits")
1055
+ export_btn = gr.Button("Export reviewed CSV")
1056
+
1057
+ review_status = gr.Textbox(label="Review status", interactive=False)
1058
+
1059
  with gr.Row():
1060
+ vertical_view = gr.Dataframe(
1061
+ headers=["Field", "Value"],
1062
+ interactive=False,
1063
+ wrap=True,
1064
+ show_row_numbers=False,
1065
+ label="Extracted fields (vertical)"
1066
+ )
1067
+ evidence_md = gr.Markdown()
1068
+
1069
+ reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1070
+
1071
+ with gr.Accordion("Advanced runtime settings", open=False):
1072
+ with gr.Row():
1073
+ max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1074
+ chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1075
+ max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1076
+
1077
+ with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
1078
+ admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1079
+
1080
+ admin_group = gr.Group(visible=False)
1081
+ admin_vocab_group = gr.Group(visible=False)
1082
+ admin_fields_group = gr.Group(visible=False)
1083
+
1084
+ with admin_group:
1085
+ gr.Markdown("### Admin: Configure what gets extracted (columns) and how terms are normalized.")
1086
+
1087
+ with admin_vocab_group:
1088
+ gr.Markdown("### Controlled vocabulary (lists only)")
1089
+ vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
1090
+ vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
1091
+
1092
+ with gr.Row():
1093
+ vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
1094
+ vocab_add_btn = gr.Button("Add")
1095
+ with gr.Row():
1096
+ vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
1097
+ vocab_remove_btn = gr.Button("Remove")
1098
+ vocab_apply_btn = gr.Button("Apply full list to category")
1099
+ vocab_reset_btn = gr.Button("Reset vocab to defaults")
1100
+
1101
+ vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
1102
+ vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
1103
+ vocab_status = gr.Textbox(label="Vocab status", interactive=False)
1104
+
1105
+ with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
1106
+ vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
1107
+
1108
+ with admin_fields_group:
1109
+ gr.Markdown("### Custom columns (Field Builder)")
1110
+ gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
1111
+
1112
+ with gr.Row():
1113
+ admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
1114
+ fields_apply_btn = gr.Button("Apply builder table")
1115
+
1116
+ with gr.Row():
1117
+ field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
1118
+ field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1119
+
1120
+ enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
1121
+ instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1122
+
1123
+ add_update_field_btn = gr.Button("Add/Update field")
1124
+
1125
+ fields_df = gr.Dataframe(
1126
+ label="Fields (edit and click Apply)",
1127
+ headers=["field","type","enum_values","instructions"],
1128
+ interactive=True,
1129
+ wrap=True
1130
+ )
1131
+
1132
+ fields_status = gr.Textbox(label="Field builder status", interactive=False)
1133
+
1134
+ admin_mode.change(
1135
+ fn=set_admin_visibility,
1136
+ inputs=[admin_mode],
1137
+ outputs=[admin_group, admin_vocab_group, admin_fields_group]
1138
  )
1139
 
1140
+ endpoints.change(
1141
+ fn=sync_fields_from_endpoints,
1142
+ inputs=[endpoints, admin_mode],
1143
+ outputs=[field_rows_state, fields_df, field_spec, status]
 
1144
  )
1145
 
1146
+ extract_btn.click(
1147
+ fn=run_extraction,
1148
+ inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1149
+ outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1150
+ )
1151
 
1152
+ record_pick.change(
1153
+ fn=on_pick,
1154
+ inputs=[record_pick, state_records, state_details],
1155
+ outputs=[vertical_view, evidence_md]
1156
+ )
1157
 
1158
+ review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
1159
+
1160
+ save_btn.click(
1161
+ fn=save_review_changes,
1162
+ inputs=[record_pick, vertical_view, state_records],
1163
+ outputs=[overview_df, state_records, review_status]
1164
+ )
1165
+
1166
+ export_btn.click(
1167
+ fn=export_reviewed_csv,
1168
+ inputs=[state_records],
1169
+ outputs=[reviewed_csv, review_status]
1170
  )
1171
 
1172
+ vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
1173
+
1174
  vocab_category.change(
1175
  fn=vocab_load_category,
1176
  inputs=[vocab_state, vocab_category, vocab_search],
 
1192
  vocab_apply_btn.click(
1193
  fn=vocab_apply_df,
1194
  inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
1195
+ outputs=[vocab_json_admin, vocab_terms_filtered, vocab_status]
1196
+ ).then(
1197
+ fn=lambda x: x,
1198
+ inputs=[vocab_json_admin],
1199
+ outputs=[vocab_json]
1200
  )
1201
 
1202
  vocab_reset_btn.click(
1203
+ fn=vocab_reset_defaults_ui,
1204
  inputs=None,
1205
+ outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1206
  )
1207
 
1208
+ admin_apply_endpoints_btn.click(
1209
+ fn=admin_apply_endpoints,
1210
+ inputs=[endpoints],
 
 
 
 
 
 
1211
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1212
  )
1213
 
 
1223
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1224
  )
1225
 
1226
def _init_all():
    """Compute the start-up values for the vocabulary editor and field builder.

    The controlled-vocabulary state is derived from
    ``DEFAULT_CONTROLLED_VOCAB_JSON`` and the field-builder rows from the
    default endpoint selection (``"Genotoxicity (OECD TG)"``).

    Returns:
        An 11-item tuple, in this order: vocabulary object, dropdown update
        (category choices plus the initially selected category), full terms
        table, filtered terms table, vocabulary JSON text, vocabulary status
        message, vocabulary JSON text (repeated), field rows, field-builder
        DataFrame, field spec text, and the ready status string.

    NOTE(review): positions 2-11 line up with the component list visible in
    the ``demo.load`` call that follows (vocab_category ... status); the
    first target and the ``fn=`` argument are outside the visible hunk, so
    confirm the wiring there before reordering anything.
    """
    (
        vocab_data,
        category_keys,
        first_category,
        terms_table,
        terms_preview,
        vocab_text,
        vocab_note,
    ) = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)

    # Seed the field builder from the default endpoint selection; the second
    # value returned by build_rows_from_endpoints is not used here.
    initial_endpoints = ["Genotoxicity (OECD TG)"]
    field_rows, _unused = build_rows_from_endpoints(initial_endpoints)
    builder_columns = ["field", "type", "enum_values", "instructions"]
    builder_table = pd.DataFrame(field_rows, columns=builder_columns)
    spec_text = build_spec_from_field_rows(field_rows)

    # Populate the category dropdown and preselect the first category.
    category_update = gr.update(choices=category_keys, value=first_category)

    # vocab_text is deliberately emitted twice: the caller expects the JSON
    # in two separate output slots (positions 5 and 7).
    return (
        vocab_data,
        category_update,
        terms_table,
        terms_preview,
        vocab_text,
        vocab_note,
        vocab_text,
        field_rows,
        builder_table,
        spec_text,
        "✅ Ready.",
    )
1247
 
1248
  demo.load(
 
1253
  vocab_category,
1254
  vocab_terms_df,
1255
  vocab_terms_filtered,
1256
+ vocab_json_admin,
1257
  vocab_status,
1258
+ vocab_json,
1259
  field_rows_state,
1260
  fields_df,
1261
  field_spec,
1262
+ status
1263
  ]
1264
  )
1265
 
 
1272
  synth_md = gr.Markdown()
1273
  synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
1274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1275
  if __name__ == "__main__":
1276
  port = int(os.environ.get("PORT", "7860"))
1277
  demo.queue().launch(server_name="0.0.0.0", server_port=port)