hchevva committed on
Commit
6766619
·
verified ·
1 Parent(s): b3fda43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -254
app.py CHANGED
@@ -57,6 +57,7 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
57
  "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
58
  }"""
59
 
 
60
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
61
  # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
62
 
@@ -106,7 +107,7 @@ PRESET_CORE = [
106
  {"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
107
  {"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
108
  {"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
109
- {"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
110
  {"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
111
  {"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
112
  ]
@@ -409,7 +410,7 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
409
 
410
 
411
  # =============================
412
- # UI helpers: vertical view + evidence
413
  # =============================
414
  def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
415
  if not records or not file_name:
@@ -437,9 +438,7 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
437
  quote = quote[:280] + "…"
438
  lines.append(f"- **{field}** (pages {pages}): “{quote}”")
439
  header = "### Evidence (grounding)\n"
440
- if not lines:
441
- lines = ["- (no evidence returned)"]
442
- return header + "\n".join(lines)
443
 
444
 
445
  def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
@@ -447,14 +446,13 @@ def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
447
  return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
448
  df = pd.DataFrame(records)
449
  cols = ["file","paper_title","risk_stance","risk_confidence"]
450
- # Include chemicals if present
451
- for c in ["chemicals", "chemical_s", "chemical", "chemical_s_"]:
452
- if c in df.columns and c not in cols:
453
- cols.append(c)
454
- break
455
  cols = [c for c in cols if c in df.columns]
456
  return df[cols].copy() if cols else df.head(50)
457
 
 
 
 
 
458
  def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
459
  if df is None or df.empty:
460
  return pd.DataFrame(columns=["term"])
@@ -462,20 +460,8 @@ def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
462
  if not q:
463
  return df[["term"]].copy()
464
  mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
465
- out = df.loc[mask, ["term"]].copy()
466
- return out
467
-
468
- # =============================
469
- # Controlled vocab guided editor (lists only)
470
- # =============================
471
- vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
472
 
473
- vocab_terms_filtered = gr.Dataframe(
474
- headers=["term"],
475
- label="Filtered preview (read-only)",
476
- interactive=False,
477
- wrap=True
478
- )
479
 
480
  def vocab_init_state(vocab_json: str):
481
  try:
@@ -486,8 +472,8 @@ def vocab_init_state(vocab_json: str):
486
  list_keys = sorted([k for k, v in vocab.items() if isinstance(v, list)])
487
  default_key = list_keys[0] if list_keys else None
488
  terms = vocab.get(default_key, []) if default_key else []
489
- terms_df = pd.DataFrame({"term": terms})
490
- return vocab, list_keys, default_key, terms_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
491
 
492
 
493
  def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
@@ -536,13 +522,9 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
536
  return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."
537
 
538
  try:
539
- if isinstance(terms_df, pd.DataFrame):
540
- df = terms_df
541
- else:
542
- df = pd.DataFrame(terms_df, columns=["term"])
543
  except Exception:
544
- vjson = json.dumps(vocab_state, indent=2)
545
- return vjson, pd.DataFrame(columns=["term"]), "Could not parse terms table."
546
 
547
  terms = []
548
  for t in df.get("term", []).tolist():
@@ -559,6 +541,7 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
559
  def vocab_reset_defaults():
560
  return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
561
 
 
562
  def vocab_filter_preview(terms_df, search):
563
  try:
564
  df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
@@ -566,26 +549,12 @@ def vocab_filter_preview(terms_df, search):
566
  df = pd.DataFrame(columns=["term"])
567
  return _filter_terms_df(df, search)
568
 
569
- vocab_search.change(
570
- fn=vocab_filter_preview,
571
- inputs=[vocab_terms_df, vocab_search],
572
- outputs=[vocab_terms_filtered]
573
- )
574
-
575
 
576
  # =============================
577
  # Field builder (type dropdown + presets)
578
  # =============================
579
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
580
 
581
- def fields_init_state():
582
- # start from DEFAULT_FIELD_SPEC by showing a friendly default builder (Core + Genotox + NAMs)
583
- fields = []
584
- for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD):
585
- fields.append(dict(row))
586
- df = pd.DataFrame(fields, columns=["field","type","enum_values","instructions"])
587
- spec = build_spec_from_field_df(df)
588
- return fields, df, spec, "✅ Field builder loaded."
589
 
590
  def build_spec_from_field_df(df: pd.DataFrame) -> str:
591
  lines = [
@@ -602,11 +571,6 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
602
  if not field or not ftype:
603
  continue
604
 
605
- # normalize types
606
- if ftype not in TYPE_CHOICES:
607
- # keep as-is, but likely invalid; user can fix
608
- pass
609
-
610
  if ftype == "enum":
611
  vals = [v.strip() for v in enums.split(",") if v.strip()]
612
  type_str = f"enum[{','.join(vals)}]" if vals else "str"
@@ -620,6 +584,41 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
620
 
621
  return "\n".join(lines).strip() + "\n"
622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
624
  field_name = (field_name or "").strip()
625
  ftype = (ftype or "").strip()
@@ -630,7 +629,6 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
630
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
631
  return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
632
 
633
- # update if exists
634
  updated = False
635
  for r in field_rows:
636
  if str(r.get("field","")).strip().lower() == field_name.lower():
@@ -647,27 +645,14 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
647
  spec = build_spec_from_field_df(df)
648
  return field_rows, df, spec, ("Updated field." if updated else "Added field.")
649
 
650
- def fields_remove(field_to_remove: str, field_rows: List[Dict[str, Any]]):
651
- key = (field_to_remove or "").strip().lower()
652
- if not key:
653
- df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
654
- return field_rows, df, build_spec_from_field_df(df), "Pick a field to remove."
655
- field_rows = [r for r in field_rows if str(r.get("field","")).strip().lower() != key]
656
- df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
657
- spec = build_spec_from_field_df(df)
658
- return field_rows, df, spec, "Removed."
659
 
660
  def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
661
  try:
662
- if isinstance(df_in, pd.DataFrame):
663
- df = df_in
664
- else:
665
- df = pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
666
  except Exception:
667
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
668
  return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
669
 
670
- # clean + rebuild list of dicts
671
  cleaned = []
672
  seen = set()
673
  for _, r in df.iterrows():
@@ -687,34 +672,9 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
687
  spec = build_spec_from_field_df(df2)
688
  return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
689
 
690
- def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
691
- preset = PRESET_MAP.get(preset_name)
692
- if not preset:
693
- df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
694
- return field_rows, df, build_spec_from_field_df(df), "Unknown preset."
695
-
696
- if mode == "Replace":
697
- new_rows = [dict(r) for r in preset]
698
- else:
699
- # Append (update existing fields if same name)
700
- new_rows = [dict(r) for r in field_rows]
701
- for p in preset:
702
- found = False
703
- for r in new_rows:
704
- if str(r.get("field","")).strip().lower() == str(p.get("field","")).strip().lower():
705
- r.update(p)
706
- found = True
707
- break
708
- if not found:
709
- new_rows.append(dict(p))
710
-
711
- df = pd.DataFrame(new_rows, columns=["field","type","enum_values","instructions"])
712
- spec = build_spec_from_field_df(df)
713
- return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
714
-
715
 
716
  # =============================
717
- # Extraction handler
718
  # =============================
719
  def run_extraction(
720
  files,
@@ -757,7 +717,7 @@ def run_extraction(
757
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
758
 
759
  if _text_based_pdf_warning(pages):
760
- results.append({
761
  "_file": filename,
762
  "_pages_in_pdf": page_count,
763
  "paper_title": "",
@@ -766,10 +726,10 @@ def run_extraction(
766
  "risk_summary": "No extractable text found. This app supports text-based PDFs only.",
767
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
768
  "evidence": []
769
- })
 
770
  else:
771
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
772
-
773
  queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
774
  for k, ins in field_instr.items():
775
  queries.append(ins if ins else k)
@@ -789,7 +749,6 @@ def run_extraction(
789
  extracted["_pages_in_pdf"] = page_count
790
  results.append(extracted)
791
 
792
- # flatten to internal records for vertical view + review/export
793
  ex = results[-1]
794
  row = {
795
  "file": filename,
@@ -819,7 +778,6 @@ def run_extraction(
819
  default = choices[0] if choices else None
820
  vertical = _make_vertical(records, default)
821
  evidence = _render_evidence(results, default)
822
-
823
  overview = _overview_df_from_records(records)
824
 
825
  status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
@@ -842,18 +800,17 @@ def run_extraction(
842
  def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
843
  return _make_vertical(records, file_name), _render_evidence(details, file_name)
844
 
 
845
  def toggle_review_mode(is_on: bool):
846
  return gr.update(interactive=bool(is_on))
847
 
 
848
  def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
849
  if not file_name or not records:
850
  return pd.DataFrame(), records, "Nothing to save."
851
 
852
  try:
853
- if isinstance(vertical_df, pd.DataFrame):
854
- dfv = vertical_df
855
- else:
856
- dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
857
  except Exception:
858
  return _overview_df_from_records(records), records, "Could not parse edited vertical table."
859
 
@@ -875,6 +832,7 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
875
  msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
876
  return _overview_df_from_records(new_records), new_records, msg
877
 
 
878
  def export_reviewed_csv(records: List[Dict[str, Any]]):
879
  if not records:
880
  return None, "No reviewed data to export."
@@ -885,7 +843,7 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
885
 
886
 
887
  # =============================
888
- # Synthesis
889
  # =============================
890
  def run_synthesis(api_key, model, extraction_json_file):
891
  if extraction_json_file is None:
@@ -907,15 +865,14 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
907
  gr.Markdown(
908
  "# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
909
  "**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
910
- "This UI is optimized for non-JSON users: **Controlled vocab editor** + **Field Builder**.\n"
911
- "Raw JSON/spec are available under **Advanced**."
912
  )
913
 
914
- # State
915
- state_records = gr.State([]) # list[dict]
916
- state_details = gr.State([]) # list[dict]
917
- vocab_state = gr.State({}) # dict
918
- field_rows_state = gr.State([]) # list[dict]
919
 
920
  with gr.Tab("Extract"):
921
  files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
@@ -933,94 +890,69 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
933
  # Controlled Vocabulary (guided editor)
934
  # -------------------------
935
  gr.Markdown("## Controlled Vocabulary (guided editor)")
936
-
937
- vocab_mode = gr.Radio(
938
- choices=["Guided", "Advanced (Raw JSON)"],
939
- value="Guided",
940
- label="Vocab editor mode"
941
- )
942
-
943
  vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
944
-
945
- # NEW: Search box
946
- vocab_search = gr.Textbox(
947
- label="Search terms",
948
- placeholder="Type to filter (e.g., 471, AMES, comet)",
949
- lines=1
950
- )
951
-
952
  with gr.Row():
953
  vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
954
  vocab_add_btn = gr.Button("Add")
955
-
956
  with gr.Row():
957
  vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
958
  vocab_remove_btn = gr.Button("Remove")
959
- vocab_apply_btn = gr.Button("Apply table changes to category")
960
  vocab_reset_btn = gr.Button("Reset vocab to defaults")
961
-
962
- # IMPORTANT: define vocab_terms_df BEFORE using it in any event wiring
963
  vocab_terms_df = gr.Dataframe(
964
  headers=["term"],
965
  label="Terms (full list; edit directly)",
966
  interactive=True,
967
  wrap=True
968
  )
969
-
970
- # NEW: filtered preview (read-only)
971
  vocab_terms_filtered = gr.Dataframe(
972
  headers=["term"],
973
  label="Filtered preview (read-only)",
974
  interactive=False,
975
  wrap=True
976
  )
977
-
978
  vocab_status = gr.Textbox(label="Vocab status", interactive=False)
979
-
980
  with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
981
  vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
982
-
983
- # -------------------------
984
- # Filtering helper + event
985
- # -------------------------
986
- def vocab_filter_preview(terms_df, search):
987
- try:
988
- df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
989
- except Exception:
990
- df = pd.DataFrame(columns=["term"])
991
- return _filter_terms_df(df, search)
992
-
993
- # Wire events AFTER components exist
994
- vocab_category.change(
995
- fn=vocab_load_category,
996
- inputs=[vocab_state, vocab_category, vocab_search],
997
- outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
998
- )
999
-
1000
  vocab_search.change(
1001
  fn=vocab_filter_preview,
1002
  inputs=[vocab_terms_df, vocab_search],
1003
  outputs=[vocab_terms_filtered]
1004
  )
1005
-
 
 
 
 
 
 
1006
  vocab_add_btn.click(
1007
  fn=vocab_add_term,
1008
  inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
1009
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
1010
  )
1011
-
1012
  vocab_remove_btn.click(
1013
  fn=vocab_remove_term,
1014
  inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
1015
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
1016
  )
1017
-
1018
  vocab_apply_btn.click(
1019
  fn=vocab_apply_df,
1020
  inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
1021
  outputs=[vocab_json, vocab_terms_filtered, vocab_status]
1022
  )
1023
-
1024
  vocab_reset_btn.click(
1025
  fn=vocab_reset_defaults,
1026
  inputs=None,
@@ -1031,8 +963,11 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1031
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
1032
  )
1033
 
1034
-
 
 
1035
  gr.Markdown("## Extraction Spec (Field Builder)")
 
1036
  with gr.Row():
1037
  preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
1038
  preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
@@ -1041,30 +976,48 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1041
  with gr.Row():
1042
  field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
1043
  field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1044
- enum_values_in = gr.Textbox(label="Enum values (comma-separated; used for enum/list[enum])", placeholder="a,b,c", lines=2)
1045
  instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1046
 
1047
- with gr.Row():
1048
- add_update_field_btn = gr.Button("Add/Update field")
1049
- remove_field_name = gr.Dropdown(label="Remove field", choices=[], value=None)
1050
- remove_field_btn = gr.Button("Remove")
1051
 
1052
  fields_df = gr.Dataframe(
1053
- label="Fields (edit if needed, then click Apply)",
1054
  headers=["field","type","enum_values","instructions"],
1055
  interactive=True,
1056
  wrap=True
1057
  )
 
1058
  fields_apply_btn = gr.Button("Apply builder table")
1059
  fields_status = gr.Textbox(label="Field builder status", interactive=False)
1060
 
1061
  with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
1062
  field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
1063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1064
  extract_btn = gr.Button("Run Extraction (Grounded)")
1065
  status = gr.Textbox(label="Status", interactive=False)
1066
 
1067
- # Replace wide table with a compact overview (not duplicate)
1068
  overview_df = gr.Dataframe(
1069
  label="Batch Overview (compact)",
1070
  interactive=False,
@@ -1097,111 +1050,18 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1097
  evidence_md = gr.Markdown()
1098
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1099
 
1100
- # -------------------------
1101
- # INIT vocab + fields on load (via a button-less trick: use .load)
1102
- # -------------------------
1103
- def _init_all():
1104
- v, keys, k0, df_terms, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
1105
- frows, fdf, fspec, fmsg = fields_init_state()
1106
- remove_choices = [r["field"] for r in frows]
1107
- return (
1108
- v, gr.update(choices=keys, value=k0), df_terms, vjson, vmsg,
1109
- frows, fdf, fspec, fmsg, gr.update(choices=remove_choices, value=(remove_choices[0] if remove_choices else None))
1110
- )
1111
-
1112
- demo.load(
1113
- _init_all,
1114
- inputs=None,
1115
- outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status,
1116
- field_rows_state, fields_df, field_spec, fields_status, remove_field_name]
1117
- )
1118
-
1119
- # Vocab events
1120
- vocab_category.change(
1121
- fn=vocab_load_category,
1122
- inputs=[vocab_state, vocab_category, vocab_search],
1123
- outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
1124
- )
1125
- vocab_add_btn.click(
1126
- fn=vocab_add_term,
1127
- inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
1128
- outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
1129
- )
1130
-
1131
- vocab_remove_btn.click(
1132
- fn=vocab_remove_term,
1133
- inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
1134
- outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
1135
- )
1136
-
1137
- vocab_apply_btn.click(
1138
- fn=vocab_apply_df,
1139
- inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
1140
- outputs=[vocab_json, vocab_terms_filtered, vocab_status]
1141
- )
1142
-
1143
- vocab_reset_btn.click(
1144
- fn=vocab_reset_defaults,
1145
- inputs=None,
1146
- outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
1147
- )
1148
-
1149
- # Field builder events
1150
- preset_btn.click(
1151
- fn=fields_load_preset,
1152
- inputs=[preset_name, preset_mode, field_rows_state],
1153
- outputs=[field_rows_state, fields_df, field_spec, fields_status]
1154
- ).then(
1155
- fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1156
- inputs=[field_rows_state],
1157
- outputs=[remove_field_name]
1158
- )
1159
-
1160
- add_update_field_btn.click(
1161
- fn=fields_add_or_update,
1162
- inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
1163
- outputs=[field_rows_state, fields_df, field_spec, fields_status]
1164
- ).then(
1165
- fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1166
- inputs=[field_rows_state],
1167
- outputs=[remove_field_name]
1168
- )
1169
-
1170
- remove_field_btn.click(
1171
- fn=fields_remove,
1172
- inputs=[remove_field_name, field_rows_state],
1173
- outputs=[field_rows_state, fields_df, field_spec, fields_status]
1174
- ).then(
1175
- fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1176
- inputs=[field_rows_state],
1177
- outputs=[remove_field_name]
1178
- )
1179
-
1180
- fields_apply_btn.click(
1181
- fn=fields_apply_df,
1182
- inputs=[field_rows_state, fields_df],
1183
- outputs=[field_rows_state, fields_df, field_spec, fields_status]
1184
- ).then(
1185
- fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
1186
- inputs=[field_rows_state],
1187
- outputs=[remove_field_name]
1188
- )
1189
-
1190
- # Extraction
1191
  extract_btn.click(
1192
  fn=run_extraction,
1193
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
1194
  outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1195
  )
1196
 
1197
- # Vertical view selection
1198
  record_pick.change(
1199
  fn=on_pick,
1200
  inputs=[record_pick, state_records, state_details],
1201
  outputs=[vertical_view, evidence_md]
1202
  )
1203
 
1204
- # Review mode
1205
  review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
1206
 
1207
  save_btn.click(
@@ -1216,6 +1076,43 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1216
  outputs=[reviewed_csv, review_status]
1217
  )
1218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1219
  with gr.Tab("Cross-paper Synthesis"):
1220
  gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
1221
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
 
57
  "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
58
  }"""
59
 
60
+ # (Used only as a fallback / advanced preview)
61
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
62
  # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
63
 
 
107
  {"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
108
  {"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
109
  {"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
110
+ {"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
111
  {"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
112
  {"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
113
  ]
 
410
 
411
 
412
  # =============================
413
+ # UI helpers: vertical view + evidence + overview
414
  # =============================
415
  def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
416
  if not records or not file_name:
 
438
  quote = quote[:280] + "…"
439
  lines.append(f"- **{field}** (pages {pages}): “{quote}”")
440
  header = "### Evidence (grounding)\n"
441
+ return header + ("\n".join(lines) if lines else "- (no evidence returned)")
 
 
442
 
443
 
444
  def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
 
446
  return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
447
  df = pd.DataFrame(records)
448
  cols = ["file","paper_title","risk_stance","risk_confidence"]
 
 
 
 
 
449
  cols = [c for c in cols if c in df.columns]
450
  return df[cols].copy() if cols else df.head(50)
451
 
452
+
453
+ # =============================
454
+ # Controlled vocab guided editor (lists only) + SEARCH FILTER
455
+ # =============================
456
  def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
457
  if df is None or df.empty:
458
  return pd.DataFrame(columns=["term"])
 
460
  if not q:
461
  return df[["term"]].copy()
462
  mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
463
+ return df.loc[mask, ["term"]].copy()
 
 
 
 
 
 
464
 
 
 
 
 
 
 
465
 
466
  def vocab_init_state(vocab_json: str):
467
  try:
 
472
  list_keys = sorted([k for k, v in vocab.items() if isinstance(v, list)])
473
  default_key = list_keys[0] if list_keys else None
474
  terms = vocab.get(default_key, []) if default_key else []
475
+ full_df = pd.DataFrame({"term": terms})
476
+ return vocab, list_keys, default_key, full_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
477
 
478
 
479
  def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
 
522
  return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."
523
 
524
  try:
525
+ df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
 
 
 
526
  except Exception:
527
+ return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Could not parse terms table."
 
528
 
529
  terms = []
530
  for t in df.get("term", []).tolist():
 
541
  def vocab_reset_defaults():
542
  return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
543
 
544
+
545
  def vocab_filter_preview(terms_df, search):
546
  try:
547
  df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
 
549
  df = pd.DataFrame(columns=["term"])
550
  return _filter_terms_df(df, search)
551
 
 
 
 
 
 
 
552
 
553
  # =============================
554
  # Field builder (type dropdown + presets)
555
  # =============================
556
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
557
 
 
 
 
 
 
 
 
 
558
 
559
  def build_spec_from_field_df(df: pd.DataFrame) -> str:
560
  lines = [
 
571
  if not field or not ftype:
572
  continue
573
 
 
 
 
 
 
574
  if ftype == "enum":
575
  vals = [v.strip() for v in enums.split(",") if v.strip()]
576
  type_str = f"enum[{','.join(vals)}]" if vals else "str"
 
584
 
585
  return "\n".join(lines).strip() + "\n"
586
 
587
+
588
+ def fields_init_state():
589
+ fields = []
590
+ for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD):
591
+ fields.append(dict(row))
592
+ df = pd.DataFrame(fields, columns=["field","type","enum_values","instructions"])
593
+ spec = build_spec_from_field_df(df)
594
+ return fields, df, spec, "✅ Field builder loaded."
595
+
596
+
597
+ def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
598
+ preset = PRESET_MAP.get(preset_name)
599
+ if not preset:
600
+ df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
601
+ return field_rows, df, build_spec_from_field_df(df), "Unknown preset."
602
+
603
+ if mode == "Replace":
604
+ new_rows = [dict(r) for r in preset]
605
+ else:
606
+ new_rows = [dict(r) for r in field_rows]
607
+ for p in preset:
608
+ found = False
609
+ for r in new_rows:
610
+ if str(r.get("field","")).strip().lower() == str(p.get("field","")).strip().lower():
611
+ r.update(p)
612
+ found = True
613
+ break
614
+ if not found:
615
+ new_rows.append(dict(p))
616
+
617
+ df = pd.DataFrame(new_rows, columns=["field","type","enum_values","instructions"])
618
+ spec = build_spec_from_field_df(df)
619
+ return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
620
+
621
+
622
  def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
623
  field_name = (field_name or "").strip()
624
  ftype = (ftype or "").strip()
 
629
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
630
  return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
631
 
 
632
  updated = False
633
  for r in field_rows:
634
  if str(r.get("field","")).strip().lower() == field_name.lower():
 
645
  spec = build_spec_from_field_df(df)
646
  return field_rows, df, spec, ("Updated field." if updated else "Added field.")
647
 
 
 
 
 
 
 
 
 
 
648
 
649
  def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
650
  try:
651
+ df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
 
 
 
652
  except Exception:
653
  df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
654
  return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
655
 
 
656
  cleaned = []
657
  seen = set()
658
  for _, r in df.iterrows():
 
672
  spec = build_spec_from_field_df(df2)
673
  return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
 
676
  # =============================
677
+ # Main extraction handler
678
  # =============================
679
  def run_extraction(
680
  files,
 
717
  pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
718
 
719
  if _text_based_pdf_warning(pages):
720
+ ex = {
721
  "_file": filename,
722
  "_pages_in_pdf": page_count,
723
  "paper_title": "",
 
726
  "risk_summary": "No extractable text found. This app supports text-based PDFs only.",
727
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
728
  "evidence": []
729
+ }
730
+ results.append(ex)
731
  else:
732
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
 
733
  queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
734
  for k, ins in field_instr.items():
735
  queries.append(ins if ins else k)
 
749
  extracted["_pages_in_pdf"] = page_count
750
  results.append(extracted)
751
 
 
752
  ex = results[-1]
753
  row = {
754
  "file": filename,
 
778
  default = choices[0] if choices else None
779
  vertical = _make_vertical(records, default)
780
  evidence = _render_evidence(results, default)
 
781
  overview = _overview_df_from_records(records)
782
 
783
  status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
 
800
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the per-record views when the user selects a file in the dropdown.

    Args:
        file_name: Selected record's file name (dropdown value).
        records:   Session list of extracted record dicts.
        details:   Session list of per-record evidence dicts.

    Returns:
        (vertical DataFrame for the chosen record, evidence markdown string).
    """
    vertical = _make_vertical(records, file_name)
    evidence = _render_evidence(details, file_name)
    return vertical, evidence
802
 
803
+
804
def toggle_review_mode(is_on: bool):
    """Toggle editability of the vertical review table.

    Args:
        is_on: Review-mode checkbox state.

    Returns:
        A Gradio component update setting `interactive` accordingly.
    """
    editable = bool(is_on)
    return gr.update(interactive=editable)
806
 
807
+
808
  def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
809
  if not file_name or not records:
810
  return pd.DataFrame(), records, "Nothing to save."
811
 
812
  try:
813
+ dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
 
 
 
814
  except Exception:
815
  return _overview_df_from_records(records), records, "Could not parse edited vertical table."
816
 
 
832
  msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
833
  return _overview_df_from_records(new_records), new_records, msg
834
 
835
+
836
  def export_reviewed_csv(records: List[Dict[str, Any]]):
837
  if not records:
838
  return None, "No reviewed data to export."
 
843
 
844
 
845
  # =============================
846
+ # Synthesis tab handler
847
  # =============================
848
  def run_synthesis(api_key, model, extraction_json_file):
849
  if extraction_json_file is None:
 
865
  gr.Markdown(
866
  "# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
867
  "**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
868
+ "UI includes a guided **Controlled Vocab editor** (lists only, with search) and a **Field Builder** (type dropdown + presets)."
 
869
  )
870
 
871
+ # States
872
+ state_records = gr.State([]) # list[dict]
873
+ state_details = gr.State([]) # list[dict]
874
+ vocab_state = gr.State({}) # dict
875
+ field_rows_state = gr.State([]) # list[dict]
876
 
877
  with gr.Tab("Extract"):
878
  files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
 
890
  # Controlled Vocabulary (guided editor)
891
  # -------------------------
892
  gr.Markdown("## Controlled Vocabulary (guided editor)")
893
+
 
 
 
 
 
 
894
  vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
895
+ vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
896
+
 
 
 
 
 
 
897
  with gr.Row():
898
  vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
899
  vocab_add_btn = gr.Button("Add")
 
900
  with gr.Row():
901
  vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
902
  vocab_remove_btn = gr.Button("Remove")
903
+ vocab_apply_btn = gr.Button("Apply full list to category")
904
  vocab_reset_btn = gr.Button("Reset vocab to defaults")
905
+
 
906
  vocab_terms_df = gr.Dataframe(
907
  headers=["term"],
908
  label="Terms (full list; edit directly)",
909
  interactive=True,
910
  wrap=True
911
  )
912
+
 
913
  vocab_terms_filtered = gr.Dataframe(
914
  headers=["term"],
915
  label="Filtered preview (read-only)",
916
  interactive=False,
917
  wrap=True
918
  )
919
+
920
  vocab_status = gr.Textbox(label="Vocab status", interactive=False)
921
+
922
  with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
923
  vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
924
+
925
+ # Filter preview wiring (must be AFTER vocab_terms_df exists)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  vocab_search.change(
927
  fn=vocab_filter_preview,
928
  inputs=[vocab_terms_df, vocab_search],
929
  outputs=[vocab_terms_filtered]
930
  )
931
+
932
+ vocab_category.change(
933
+ fn=vocab_load_category,
934
+ inputs=[vocab_state, vocab_category, vocab_search],
935
+ outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
936
+ )
937
+
938
  vocab_add_btn.click(
939
  fn=vocab_add_term,
940
  inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
941
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
942
  )
943
+
944
  vocab_remove_btn.click(
945
  fn=vocab_remove_term,
946
  inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
947
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
948
  )
949
+
950
  vocab_apply_btn.click(
951
  fn=vocab_apply_df,
952
  inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
953
  outputs=[vocab_json, vocab_terms_filtered, vocab_status]
954
  )
955
+
956
  vocab_reset_btn.click(
957
  fn=vocab_reset_defaults,
958
  inputs=None,
 
963
  outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
964
  )
965
 
966
+ # -------------------------
967
+ # Field Builder
968
+ # -------------------------
969
  gr.Markdown("## Extraction Spec (Field Builder)")
970
+
971
  with gr.Row():
972
  preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
973
  preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
 
976
  with gr.Row():
977
  field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
978
  field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
979
+ enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
980
  instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
981
 
982
+ add_update_field_btn = gr.Button("Add/Update field")
 
 
 
983
 
984
  fields_df = gr.Dataframe(
985
+ label="Fields (edit and click Apply)",
986
  headers=["field","type","enum_values","instructions"],
987
  interactive=True,
988
  wrap=True
989
  )
990
+
991
  fields_apply_btn = gr.Button("Apply builder table")
992
  fields_status = gr.Textbox(label="Field builder status", interactive=False)
993
 
994
  with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
995
  field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
996
 
997
+ preset_btn.click(
998
+ fn=fields_load_preset,
999
+ inputs=[preset_name, preset_mode, field_rows_state],
1000
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1001
+ )
1002
+
1003
+ add_update_field_btn.click(
1004
+ fn=fields_add_or_update,
1005
+ inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
1006
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1007
+ )
1008
+
1009
+ fields_apply_btn.click(
1010
+ fn=fields_apply_df,
1011
+ inputs=[field_rows_state, fields_df],
1012
+ outputs=[field_rows_state, fields_df, field_spec, fields_status]
1013
+ )
1014
+
1015
+ # -------------------------
1016
+ # Run extraction
1017
+ # -------------------------
1018
  extract_btn = gr.Button("Run Extraction (Grounded)")
1019
  status = gr.Textbox(label="Status", interactive=False)
1020
 
 
1021
  overview_df = gr.Dataframe(
1022
  label="Batch Overview (compact)",
1023
  interactive=False,
 
1050
  evidence_md = gr.Markdown()
1051
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1052
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1053
  extract_btn.click(
1054
  fn=run_extraction,
1055
  inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
1056
  outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1057
  )
1058
 
 
1059
  record_pick.change(
1060
  fn=on_pick,
1061
  inputs=[record_pick, state_records, state_details],
1062
  outputs=[vertical_view, evidence_md]
1063
  )
1064
 
 
1065
  review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
1066
 
1067
  save_btn.click(
 
1076
  outputs=[reviewed_csv, review_status]
1077
  )
1078
 
1079
+ # -------------------------
1080
+ # Initialize vocab + fields on load
1081
+ # -------------------------
1082
def _init_all():
    """Initialize controlled-vocab and field-builder state for the demo.load() hook.

    Returns a 10-tuple matching the demo.load outputs, in order:
    (vocab state dict, category-dropdown update, full terms df, filtered terms df,
     vocab JSON text, vocab status, field rows, fields df, spec text, field status).
    """
    vocab, categories, first_cat, terms_df, vocab_json_text, vocab_msg = vocab_init_state(
        DEFAULT_CONTROLLED_VOCAB_JSON
    )
    # Empty search string => preview shows the full, unfiltered term list.
    preview_df = _filter_terms_df(terms_df, "")
    rows, rows_df, spec_text, rows_msg = fields_init_state()
    return (
        vocab,
        gr.update(choices=categories, value=first_cat),
        terms_df,
        preview_df,
        vocab_json_text,
        vocab_msg,
        rows,
        rows_df,
        spec_text,
        rows_msg,
    )
1098
+
1099
+ demo.load(
1100
+ _init_all,
1101
+ inputs=None,
1102
+ outputs=[
1103
+ vocab_state,
1104
+ vocab_category,
1105
+ vocab_terms_df,
1106
+ vocab_terms_filtered,
1107
+ vocab_json,
1108
+ vocab_status,
1109
+ field_rows_state,
1110
+ fields_df,
1111
+ field_spec,
1112
+ fields_status
1113
+ ]
1114
+ )
1115
+
1116
  with gr.Tab("Cross-paper Synthesis"):
1117
  gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
1118
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")