hchevva committed on
Commit
f6221d9
·
verified ·
1 Parent(s): ddb431d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -111
app.py CHANGED
@@ -135,6 +135,20 @@ ENDPOINT_MODULES: Dict[str, List[Dict[str, Any]]] = {
135
  "Carcinogenicity": PRESET_CARCINOGENICITY,
136
  }
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
139
  "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
140
  "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
@@ -264,10 +278,6 @@ def slugify_field(name: str) -> str:
264
 
265
 
266
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
267
- """
268
- spec lines: Field Name | type | instructions
269
- types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
270
- """
271
  props: Dict[str, Any] = {}
272
  instr: Dict[str, str] = {}
273
 
@@ -421,54 +431,6 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
421
  return resp.output_text
422
 
423
 
424
- # =============================
425
- # UI helpers: vertical view + evidence + overview
426
- # =============================
427
- def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
428
- if not records or not record_id:
429
- return pd.DataFrame(columns=["Field", "Value"])
430
- row = next((r for r in records if r.get("record_id") == record_id), None)
431
- if not row:
432
- return pd.DataFrame(columns=["Field", "Value"])
433
-
434
- hidden = {"record_id"}
435
- keys = [k for k in row.keys() if k not in hidden]
436
- return pd.DataFrame({"Field": keys, "Value": [row[k] for k in keys]})
437
-
438
-
439
- def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
440
- if not details or not file_name:
441
- return ""
442
- d = next((x for x in details if x.get("_file") == file_name), None)
443
- if not d:
444
- return ""
445
- ev = d.get("evidence", []) or []
446
- lines = []
447
- for e in ev:
448
- field = (e.get("field", "") or "").strip()
449
- if allowed_fields is not None and field and field not in allowed_fields:
450
- continue
451
- quote = (e.get("quote", "") or "").strip()
452
- pages = (e.get("pages", "") or "").strip()
453
- if quote:
454
- if len(quote) > 320:
455
- quote = quote[:320] + "…"
456
- lines.append(f"- **{field}** (pages {pages}): “{quote}”")
457
- if len(lines) >= max_items:
458
- break
459
- header = "### Evidence (grounding)\n"
460
- return header + ("\n".join(lines) if lines else "- (no evidence returned)")
461
-
462
-
463
- def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
464
- if not records:
465
- return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
466
- df = pd.DataFrame(records)
467
- cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
468
- cols = [c for c in cols if c in df.columns]
469
- return df[cols].copy() if cols else df.head(50)
470
-
471
-
472
  # =============================
473
  # Controlled vocab editor helpers (lists only) + search filter
474
  # =============================
@@ -572,7 +534,7 @@ def vocab_filter_preview(terms_df, search):
572
 
573
 
574
  # =============================
575
- # Field builder (admin) + endpoint selection mapping
576
  # =============================
577
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
578
 
@@ -606,14 +568,17 @@ def build_spec_from_field_rows(rows: List[Dict[str, Any]]) -> str:
606
  return "\n".join(lines).strip() + "\n"
607
 
608
 
609
- def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
610
  selected_endpoints = selected_endpoints or []
611
  rows: List[Dict[str, Any]] = []
612
  field_key_to_module: Dict[str, str] = {}
 
613
 
614
  for r in PRESET_CORE:
615
  rows.append(dict(r))
616
- field_key_to_module[slugify_field(r["field"])] = "Core"
 
 
617
 
618
  for module in selected_endpoints:
619
  preset = ENDPOINT_MODULES.get(module)
@@ -621,7 +586,9 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
621
  continue
622
  for r in preset:
623
  rows.append(dict(r))
624
- field_key_to_module[slugify_field(r["field"])] = module
 
 
625
 
626
  seen = set()
627
  deduped: List[Dict[str, Any]] = []
@@ -632,21 +599,30 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
632
  seen.add(k)
633
  deduped.append(r)
634
 
635
- field_key_to_module = {slugify_field(r["field"]): field_key_to_module.get(slugify_field(r["field"]), "Custom") for r in deduped}
636
- return deduped, field_key_to_module
 
 
 
 
637
 
 
 
 
638
 
639
- def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool):
 
640
  if admin_mode:
641
- return gr.update(), gr.update(), gr.update(), "Admin mode: endpoint selection will not overwrite custom columns."
642
- rows, _ = build_rows_from_endpoints(selected_endpoints)
 
643
  df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
644
  spec = build_spec_from_field_rows(rows)
645
  return rows, df, spec, "✅ Columns updated from selected endpoints."
646
 
647
 
648
  def admin_apply_endpoints(selected_endpoints: List[str]):
649
- rows, _ = build_rows_from_endpoints(selected_endpoints)
650
  df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
651
  spec = build_spec_from_field_rows(rows)
652
  return rows, df, spec, "✅ Loaded selected endpoints into the builder (Replace)."
@@ -706,7 +682,7 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
706
 
707
 
708
  # =============================
709
- # Row-building logic (paper vs chemical-endpoint)
710
  # =============================
711
  def _as_list(x) -> List[str]:
712
  if x is None:
@@ -728,12 +704,174 @@ def _format_value(v: Any) -> Any:
728
  return v
729
 
730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
732
  chemical = (chemical or "").strip() or "-"
733
  endpoint = (endpoint or "").strip() or "Paper"
734
  return f"{file_name} | {chemical} | {endpoint}"
735
 
736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  # =============================
738
  # Main extraction handler
739
  # =============================
@@ -751,6 +889,7 @@ def run_extraction(
751
  ):
752
  if not files:
753
  return (
 
754
  pd.DataFrame(), None, None, "Upload one or more PDFs.",
755
  gr.update(choices=[], value=None),
756
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
@@ -760,6 +899,7 @@ def run_extraction(
760
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
761
  except Exception as e:
762
  return (
 
763
  pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
764
  gr.update(choices=[], value=None),
765
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
@@ -768,6 +908,7 @@ def run_extraction(
768
  field_props, field_instr = parse_field_spec(field_spec or "")
769
  if not field_props:
770
  return (
 
771
  pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
772
  gr.update(choices=[], value=None),
773
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
@@ -777,15 +918,17 @@ def run_extraction(
777
 
778
  if admin_mode:
779
  field_key_to_module = {k: "Custom" for k in field_props.keys()}
 
780
  endpoint_modules_for_rows = ["Custom"]
781
  else:
782
- _, field_key_to_module = build_rows_from_endpoints(selected_endpoints or [])
783
  endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
784
 
785
  try:
786
  client = get_openai_client(api_key)
787
  except Exception as e:
788
  return (
 
789
  pd.DataFrame(), None, None, str(e),
790
  gr.update(choices=[], value=None),
791
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
@@ -854,32 +997,55 @@ def run_extraction(
854
  if not chemicals:
855
  chemicals = ["-"]
856
 
 
857
  if len(chemicals) <= 1:
 
858
  row = dict(base)
859
- row["chemical"] = chemicals[0]
860
  row["endpoint"] = "Paper"
861
- row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
862
  for k in field_props.keys():
863
  row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
864
  output_rows.append(row)
 
 
865
  else:
866
  core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
867
- for chem in chemicals:
868
- for module in endpoint_modules_for_rows:
869
- row = dict(base)
870
- row["chemical"] = chem
871
- row["endpoint"] = module
872
- row["record_id"] = _record_id(filename, chem, module)
873
-
874
- for k in field_props.keys():
875
- m = field_key_to_module.get(k, "Custom")
876
- include = (k in core_keys) or (m == module) or admin_mode
877
- if include:
878
- if k == "chemicals":
879
- row[k] = chem # make per-row chemical consistent
880
- else:
881
- row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
882
- output_rows.append(row)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
883
 
884
  df = pd.DataFrame(output_rows)
885
  records = df.to_dict("records")
@@ -893,20 +1059,20 @@ def run_extraction(
893
  default = choices[0] if choices else None
894
 
895
  vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
 
896
  allowed_fields = None
 
897
  if default:
898
  selected_row = next((r for r in records if r.get("record_id") == default), {})
899
  allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
 
900
 
901
- file_for_evidence = None
902
- if default:
903
- file_for_evidence = default.split(" | ")[0].strip()
904
  evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
905
-
906
  overview = _overview_df_from_records(records)
907
  status = "✅ Done. Review in the report below and export when ready."
908
 
909
  return (
 
910
  overview,
911
  str(csv_path),
912
  str(json_path),
@@ -924,11 +1090,11 @@ def run_extraction(
924
  # =============================
925
  def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
926
  if not record_id:
927
- return pd.DataFrame(columns=["Field","Value"]), ""
928
  row = next((r for r in (records or []) if r.get("record_id") == record_id), {})
929
  file_name = (row.get("file") or "")
930
  allowed_fields = set(row.keys()) - {"record_id"}
931
- return _make_vertical(records, record_id), _render_evidence(details, file_name, allowed_fields=allowed_fields)
932
 
933
 
934
  def toggle_review_mode(is_on: bool):
@@ -937,12 +1103,12 @@ def toggle_review_mode(is_on: bool):
937
 
938
  def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
939
  if not record_id or not records:
940
- return pd.DataFrame(), records, "Nothing to save."
941
 
942
  try:
943
  dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
944
  except Exception:
945
- return _overview_df_from_records(records), records, "Could not parse edited vertical table."
946
 
947
  dfv = dfv.dropna(subset=["Field"])
948
  updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
@@ -960,7 +1126,7 @@ def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str
960
  new_records.append(r)
961
 
962
  msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
963
- return _overview_df_from_records(new_records), new_records, msg
964
 
965
 
966
  def export_reviewed_csv(records: List[Dict[str, Any]]):
@@ -977,19 +1143,17 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
977
  # =============================
978
  def run_synthesis(api_key, model, extraction_json_file):
979
  if extraction_json_file is None:
980
- return "Upload the extraction_details.json from the Extract tab first."
981
-
982
  try:
983
  client = get_openai_client(api_key)
984
  except Exception as e:
985
  return str(e)
986
-
987
  rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
988
  return openai_synthesize_across_papers(client, model, rows)
989
 
990
 
991
  # =============================
992
- # UI visibility helpers
993
  # =============================
994
  def set_admin_visibility(is_admin: bool):
995
  return (
@@ -1014,10 +1178,11 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1014
  vocab_state = gr.State({})
1015
  field_rows_state = gr.State([])
1016
 
1017
- field_spec = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) field spec")
1018
- vocab_json = gr.Textbox(visible=False, interactive=False, lines=8, label="(hidden) vocab json")
1019
 
1020
  with gr.Tab("Extract"):
 
1021
  with gr.Group():
1022
  files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1023
 
@@ -1025,17 +1190,26 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1025
  api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1026
  model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1027
 
1028
- endpoints = gr.Dropdown(
1029
- label="Endpoints to extract (Core included automatically)",
1030
- choices=list(ENDPOINT_MODULES.keys()),
1031
- multiselect=True,
1032
- value=["Genotoxicity (OECD TG)"]
1033
- )
 
 
 
 
 
 
1034
 
1035
  extract_btn = gr.Button("Run Extraction", variant="primary")
1036
  status = gr.Textbox(label="Status", interactive=False)
1037
 
 
1038
  gr.Markdown("## Report")
 
 
1039
  overview_df = gr.Dataframe(
1040
  label="Batch Overview",
1041
  interactive=False,
@@ -1068,12 +1242,14 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1068
 
1069
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1070
 
 
1071
  with gr.Accordion("Advanced runtime settings", open=False):
1072
  with gr.Row():
1073
  max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1074
  chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1075
  max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1076
 
 
1077
  with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
1078
  admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1079
 
@@ -1082,7 +1258,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1082
  admin_fields_group = gr.Group(visible=False)
1083
 
1084
  with admin_group:
1085
- gr.Markdown("### Admin: Configure what gets extracted (columns) and how terms are normalized.")
1086
 
1087
  with admin_vocab_group:
1088
  gr.Markdown("### Controlled vocabulary (lists only)")
@@ -1131,28 +1307,35 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1131
 
1132
  fields_status = gr.Textbox(label="Field builder status", interactive=False)
1133
 
 
1134
  admin_mode.change(
1135
  fn=set_admin_visibility,
1136
  inputs=[admin_mode],
1137
  outputs=[admin_group, admin_vocab_group, admin_fields_group]
1138
  )
1139
 
 
 
 
 
 
 
1140
  endpoints.change(
1141
  fn=sync_fields_from_endpoints,
1142
- inputs=[endpoints, admin_mode],
1143
  outputs=[field_rows_state, fields_df, field_spec, status]
1144
  )
1145
 
1146
  extract_btn.click(
1147
  fn=run_extraction,
1148
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1149
- outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1150
  )
1151
 
1152
  record_pick.change(
1153
  fn=on_pick,
1154
  inputs=[record_pick, state_records, state_details],
1155
- outputs=[vertical_view, evidence_md]
1156
  )
1157
 
1158
  review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
@@ -1160,7 +1343,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1160
  save_btn.click(
1161
  fn=save_review_changes,
1162
  inputs=[record_pick, vertical_view, state_records],
1163
- outputs=[overview_df, state_records, review_status]
1164
  )
1165
 
1166
  export_btn.click(
@@ -1169,6 +1352,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1169
  outputs=[reviewed_csv, review_status]
1170
  )
1171
 
 
1172
  vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
1173
 
1174
  vocab_category.change(
@@ -1205,6 +1389,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1205
  outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
1206
  )
1207
 
 
1208
  admin_apply_endpoints_btn.click(
1209
  fn=admin_apply_endpoints,
1210
  inputs=[endpoints],
@@ -1223,11 +1408,12 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1223
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1224
  )
1225
 
 
1226
  def _init_all():
1227
  vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
1228
 
1229
- default_endpoints = ["Genotoxicity (OECD TG)"]
1230
- rows, _ = build_rows_from_endpoints(default_endpoints)
1231
  fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
1232
  fspec = build_spec_from_field_rows(rows)
1233
 
@@ -1264,7 +1450,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1264
  )
1265
 
1266
  with gr.Tab("Cross-paper Synthesis"):
1267
- gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
1268
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1269
  model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1270
  extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
 
135
  "Carcinogenicity": PRESET_CARCINOGENICITY,
136
  }
137
 
138
# One-click preset bundles: each maps a preset label to the list of endpoint
# modules it selects (Core fields are always added separately by the builder).
ENDPOINT_PRESETS: Dict[str, List[str]] = {
    "Required – Safety Assessor": [
        "Genotoxicity (OECD TG)",
        "Repeated dose toxicity",
        "Irritation / Sensitization",
        "Repro / Developmental",
        "Acute toxicity",
    ],
    "Core only (fast)": [],
    "Screening – NAMs + Genotox": ["NAMs / In Silico", "Genotoxicity (OECD TG)"],
    "Full – All endpoints": list(ENDPOINT_MODULES.keys()),
}
151
+
152
  ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
153
  "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
154
  "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
 
278
 
279
 
280
  def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
 
 
 
 
281
  props: Dict[str, Any] = {}
282
  instr: Dict[str, str] = {}
283
 
 
431
  return resp.output_text
432
 
433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  # =============================
435
  # Controlled vocab editor helpers (lists only) + search filter
436
  # =============================
 
534
 
535
 
536
  # =============================
537
+ # Field mapping from endpoints
538
  # =============================
539
  TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
540
 
 
568
  return "\n".join(lines).strip() + "\n"
569
 
570
 
571
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str], Dict[str, List[str]]]:
    """Assemble the field-builder rows for Core + the selected endpoint modules.

    Returns a 3-tuple:
      - deduped: field rows (Core first, then each selected module), with
        duplicate field keys removed (first occurrence wins);
      - field_key_to_module: slugified field key -> owning module name
        (NOTE: if two modules define the same field, the LAST module wins here
        while the first row wins in `deduped` — a pre-existing asymmetry);
      - module_to_keys: module name -> list of its field keys, filtered down
        to the keys that survived dedup.
    """
    selected_endpoints = selected_endpoints or []
    rows: List[Dict[str, Any]] = []
    field_key_to_module: Dict[str, str] = {}
    module_to_keys: Dict[str, List[str]] = {}

    # Core fields are always included, regardless of endpoint selection.
    for r in PRESET_CORE:
        rows.append(dict(r))
        k = slugify_field(r["field"])
        field_key_to_module[k] = "Core"
        module_to_keys.setdefault("Core", []).append(k)

    for module in selected_endpoints:
        preset = ENDPOINT_MODULES.get(module)
        if not preset:
            # Unknown module name — skip silently rather than fail the build.
            continue
        for r in preset:
            rows.append(dict(r))
            k = slugify_field(r["field"])
            field_key_to_module[k] = module
            module_to_keys.setdefault(module, []).append(k)

    # Deduplicate by slugified field key; first occurrence wins.
    seen: set = set()
    deduped: List[Dict[str, Any]] = []
    for r in rows:
        k = slugify_field(r["field"])
        if k in seen:
            continue
        seen.add(k)
        deduped.append(r)

    # Rebuild module_to_keys so it only lists keys that survived dedup.
    dedup_keys = {slugify_field(r["field"]) for r in deduped}
    module_to_keys = {m: [k for k in ks if k in dedup_keys] for m, ks in module_to_keys.items()}

    return deduped, field_key_to_module, module_to_keys
607
+
608
 
609
def apply_endpoint_preset(preset_name: str):
    """Resolve a preset label to its endpoint list and push it into the dropdown.

    Unknown preset names resolve to an empty selection.
    """
    selection = ENDPOINT_PRESETS.get(preset_name, [])
    return gr.update(value=selection)
612
 
613
+
614
def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool, current_rows: List[Dict[str, Any]], current_spec: str):
    """Refresh field rows / dataframe / spec when the endpoint selection changes.

    In admin mode the current custom columns are echoed back unchanged so the
    endpoint dropdown cannot clobber hand-edited fields.
    """
    columns = ["field", "type", "enum_values", "instructions"]
    if admin_mode:
        preserved = pd.DataFrame(current_rows or [], columns=columns)
        msg = "Admin mode: endpoint selection will not overwrite custom columns."
        return current_rows, preserved, current_spec, msg

    rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    table = pd.DataFrame(rows, columns=columns)
    spec = build_spec_from_field_rows(rows)
    return rows, table, spec, "✅ Columns updated from selected endpoints."
622
 
623
 
624
def admin_apply_endpoints(selected_endpoints: List[str]):
    """Replace the admin field builder contents with the selected endpoints' fields."""
    rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    table = pd.DataFrame(rows, columns=["field", "type", "enum_values", "instructions"])
    spec = build_spec_from_field_rows(rows)
    return rows, table, spec, "✅ Loaded selected endpoints into the builder (Replace)."
 
682
 
683
 
684
  # =============================
685
+ # Row building + “non-empty module” logic
686
  # =============================
687
  def _as_list(x) -> List[str]:
688
  if x is None:
 
704
  return v
705
 
706
 
707
+ EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
708
+
709
+
710
+ def _is_empty_value(v: Any) -> bool:
711
+ if v is None:
712
+ return True
713
+ if isinstance(v, float) and np.isnan(v):
714
+ return True
715
+ if isinstance(v, list):
716
+ cleaned = [str(x).strip() for x in v if str(x).strip()]
717
+ if not cleaned:
718
+ return True
719
+ # empty if all items are not_reported / similar
720
+ return all((c.lower() in EMPTY_STRINGS) for c in cleaned)
721
+ s = str(v).strip()
722
+ if not s:
723
+ return True
724
+ return s.lower() in EMPTY_STRINGS
725
+
726
+
727
  def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
728
  chemical = (chemical or "").strip() or "-"
729
  endpoint = (endpoint or "").strip() or "Paper"
730
  return f"{file_name} | {chemical} | {endpoint}"
731
 
732
 
733
def _module_has_any_data(ext: Dict[str, Any], module_keys: List[str], field_props: Dict[str, Any]) -> bool:
    """True if at least one of the module's fields holds a non-empty value in *ext*.

    NOTE(review): ``field_props`` is currently unused; kept for signature
    compatibility with call sites.
    """
    return any(not _is_empty_value(ext.get(key)) for key in (module_keys or []))
739
+
740
+
741
+ # =============================
742
+ # Evidence + report helpers
743
+ # =============================
744
+ def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
745
+ if not records or not record_id:
746
+ return pd.DataFrame(columns=["Field", "Value"])
747
+ row = next((r for r in records if r.get("record_id") == record_id), None)
748
+ if not row:
749
+ return pd.DataFrame(columns=["Field", "Value"])
750
+
751
+ hidden = {"record_id"}
752
+ keys = [k for k in row.keys() if k not in hidden]
753
+ return pd.DataFrame({"Field": keys, "Value": [row.get(k, "") for k in keys]})
754
+
755
+
756
+ def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
757
+ if not details or not file_name:
758
+ return ""
759
+ d = next((x for x in details if x.get("_file") == file_name), None)
760
+ if not d:
761
+ return ""
762
+ ev = d.get("evidence", []) or []
763
+ lines = []
764
+ for e in ev:
765
+ field = (e.get("field", "") or "").strip()
766
+ if allowed_fields is not None and field and field not in allowed_fields:
767
+ continue
768
+ quote = (e.get("quote", "") or "").strip()
769
+ pages = (e.get("pages", "") or "").strip()
770
+ if quote:
771
+ if len(quote) > 320:
772
+ quote = quote[:320] + "…"
773
+ lines.append(f"- **{field}** (pages {pages}): “{quote}”")
774
+ if len(lines) >= max_items:
775
+ break
776
+ header = "### Evidence (grounding)\n"
777
+ return header + ("\n".join(lines) if lines else "- (no evidence returned)")
778
+
779
+
780
+ def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
781
+ if not records:
782
+ return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
783
+ df = pd.DataFrame(records)
784
+ cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
785
+ cols = [c for c in cols if c in df.columns]
786
+ return df[cols].copy() if cols else df.head(50)
787
+
788
+
789
+ def _risk_badge(risk: str) -> str:
790
+ r = (risk or "").strip().lower()
791
+ if r == "acceptable":
792
+ bg = "#e7f7ed"; fg = "#0f5132"
793
+ elif r == "acceptable_with_uncertainty":
794
+ bg = "#fff3cd"; fg = "#664d03"
795
+ elif r == "not_acceptable":
796
+ bg = "#f8d7da"; fg = "#842029"
797
+ else:
798
+ bg = "#e2e3e5"; fg = "#41464b"
799
+ label = risk if risk else "unknown"
800
+ return f'<span style="background:{bg};color:{fg};padding:4px 10px;border-radius:999px;font-weight:600;font-size:12px;">{label}</span>'
801
+
802
+
803
+ def _safe_str(x: Any) -> str:
804
+ if x is None:
805
+ return ""
806
+ if isinstance(x, float) and np.isnan(x):
807
+ return ""
808
+ return str(x)
809
+
810
+
811
def render_summary_card(record_id: str, records: List[Dict[str, Any]]) -> str:
    """Render the Executive Summary HTML card for the selected record.

    Shows placeholder cards when nothing has been extracted yet or the
    selected id does not match any record; otherwise renders title, file,
    chemical, endpoint, the risk badge, and clipped key sections.
    """
    if not record_id or not records:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Run extraction to view results.</div></div>"

    rec: Optional[Dict[str, Any]] = None
    for candidate in records:
        if candidate.get("record_id") == record_id:
            rec = candidate
            break
    if not rec:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Select a record.</div></div>"

    title = _safe_str(rec.get("paper_title", "")).strip() or "Untitled paper"
    file_name = _safe_str(rec.get("file", ""))
    chemical = _safe_str(rec.get("chemical", "-"))
    endpoint = _safe_str(rec.get("endpoint", "Paper"))
    risk = _safe_str(rec.get("risk_stance", ""))

    conf = rec.get("risk_confidence", "")
    try:
        # Render numeric confidence as a fixed two-decimal value.
        conf_txt = f"{float(conf):.2f}" if conf != "" else ""
    except Exception:
        conf_txt = _safe_str(conf)

    key_findings = _safe_str(rec.get("key_findings", "")).strip()
    dose_metrics = _safe_str(rec.get("dose_metrics", "")).strip()
    conclusion = _safe_str(rec.get("conclusion", "")).strip()

    def _clip(s: str, n: int = 380) -> str:
        # Keep long sections compact; truncate with an ellipsis.
        s = s.strip()
        return s if len(s) <= n else s[:n] + "…"

    return f"""
    <div style="border:1px solid #eaeaea;padding:14px;border-radius:12px;">
      <div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;">
        <div style="font-weight:700;font-size:16px;">Executive Summary</div>
        <div>{_risk_badge(risk)} <span style="margin-left:10px;color:#666;font-size:12px;">confidence: {conf_txt}</span></div>
      </div>

      <div style="margin-top:10px;">
        <div style="font-weight:650;">{title}</div>
        <div style="color:#666;font-size:12px;margin-top:4px;">
          <span><b>File:</b> {file_name}</span> &nbsp; • &nbsp;
          <span><b>Chemical:</b> {chemical}</span> &nbsp; • &nbsp;
          <span><b>Endpoint:</b> {endpoint}</span>
        </div>
      </div>

      <div style="margin-top:12px;display:grid;grid-template-columns:1fr;gap:10px;">
        <div>
          <div style="font-weight:650;margin-bottom:4px;">Key Findings</div>
          <div style="color:#222;">{_clip(key_findings) if key_findings else "<span style='color:#666'>(not reported)</span>"}</div>
        </div>
        <div>
          <div style="font-weight:650;margin-bottom:4px;">Dose Metrics</div>
          <div style="color:#222;">{_clip(dose_metrics) if dose_metrics else "<span style='color:#666'>(not reported)</span>"}</div>
        </div>
        <div>
          <div style="font-weight:650;margin-bottom:4px;">Conclusion</div>
          <div style="color:#222;">{_clip(conclusion) if conclusion else "<span style='color:#666'>(not reported)</span>"}</div>
        </div>
      </div>
    </div>
    """
873
+
874
+
875
  # =============================
876
  # Main extraction handler
877
  # =============================
 
889
  ):
890
  if not files:
891
  return (
892
+ "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
893
  pd.DataFrame(), None, None, "Upload one or more PDFs.",
894
  gr.update(choices=[], value=None),
895
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
899
  vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
900
  except Exception as e:
901
  return (
902
+ "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
903
  pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
904
  gr.update(choices=[], value=None),
905
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
908
  field_props, field_instr = parse_field_spec(field_spec or "")
909
  if not field_props:
910
  return (
911
+ "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
912
  pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
913
  gr.update(choices=[], value=None),
914
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
918
 
919
  if admin_mode:
920
  field_key_to_module = {k: "Custom" for k in field_props.keys()}
921
+ module_to_keys: Dict[str, List[str]] = {"Custom": list(field_props.keys())}
922
  endpoint_modules_for_rows = ["Custom"]
923
  else:
924
+ _, field_key_to_module, module_to_keys = build_rows_from_endpoints(selected_endpoints or [])
925
  endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
926
 
927
  try:
928
  client = get_openai_client(api_key)
929
  except Exception as e:
930
  return (
931
+ "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
932
  pd.DataFrame(), None, None, str(e),
933
  gr.update(choices=[], value=None),
934
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
 
997
  if not chemicals:
998
  chemicals = ["-"]
999
 
1000
+ # Single-chemical => one-row-per-paper
1001
  if len(chemicals) <= 1:
1002
+ chem = chemicals[0]
1003
  row = dict(base)
1004
+ row["chemical"] = chem
1005
  row["endpoint"] = "Paper"
1006
+ row["record_id"] = _record_id(filename, chem, row["endpoint"])
1007
  for k in field_props.keys():
1008
  row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
1009
  output_rows.append(row)
1010
+
1011
+ # Multi-chemical => chemical–endpoint rows (ONLY non-empty modules)
1012
  else:
1013
  core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
1014
+
1015
+ # determine which endpoint modules have any data (skip empty ones)
1016
+ candidate_modules = [m for m in endpoint_modules_for_rows if m != "Core"]
1017
+ non_empty_modules = []
1018
+ for m in candidate_modules:
1019
+ if _module_has_any_data(ext, module_to_keys.get(m, []), field_props):
1020
+ non_empty_modules.append(m)
1021
+
1022
+ # If everything empty, fall back to a single Paper row (otherwise you get no rows)
1023
+ if not non_empty_modules:
1024
+ row = dict(base)
1025
+ row["chemical"] = "multiple"
1026
+ row["endpoint"] = "Paper"
1027
+ row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
1028
+ for k in field_props.keys():
1029
+ row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
1030
+ output_rows.append(row)
1031
+ else:
1032
+ for chem in chemicals:
1033
+ for module in non_empty_modules:
1034
+ row = dict(base)
1035
+ row["chemical"] = chem
1036
+ row["endpoint"] = module
1037
+ row["record_id"] = _record_id(filename, chem, module)
1038
+
1039
+ for k in field_props.keys():
1040
+ m = field_key_to_module.get(k, "Custom")
1041
+ include = (k in core_keys) or (m == module) or admin_mode
1042
+ if include:
1043
+ if k == "chemicals":
1044
+ row[k] = chem
1045
+ else:
1046
+ row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
1047
+
1048
+ output_rows.append(row)
1049
 
1050
  df = pd.DataFrame(output_rows)
1051
  records = df.to_dict("records")
 
1059
  default = choices[0] if choices else None
1060
 
1061
  vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
1062
+ summary_html = render_summary_card(default, records) if default else render_summary_card("", [])
1063
  allowed_fields = None
1064
+ file_for_evidence = None
1065
  if default:
1066
  selected_row = next((r for r in records if r.get("record_id") == default), {})
1067
  allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
1068
+ file_for_evidence = (default.split(" | ")[0] or "").strip()
1069
 
 
 
 
1070
  evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
 
1071
  overview = _overview_df_from_records(records)
1072
  status = "✅ Done. Review in the report below and export when ready."
1073
 
1074
  return (
1075
+ summary_html,
1076
  overview,
1077
  str(csv_path),
1078
  str(json_path),
 
1090
  # =============================
1091
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the report panels (summary card, vertical table, evidence) for the selected record.

    Returns a 3-tuple matching the Gradio outputs: (summary HTML, vertical
    Field/Value DataFrame, evidence markdown). With no selection, every panel
    is returned in its empty state.
    """
    if not record_id:
        # No record chosen yet: blank card, empty table, no evidence text.
        return render_summary_card("", []), pd.DataFrame(columns=["Field", "Value"]), ""
    matches = [r for r in (records or []) if r.get("record_id") == record_id]
    row = matches[0] if matches else {}
    file_name = row.get("file") or ""
    # Evidence is restricted to the fields actually present on this record
    # (record_id is internal bookkeeping, never shown).
    visible_fields = {k for k in row if k != "record_id"}
    summary = render_summary_card(record_id, records)
    vertical = _make_vertical(records, record_id)
    evidence = _render_evidence(details, file_name, allowed_fields=visible_fields)
    return summary, vertical, evidence
1098
 
1099
 
1100
  def toggle_review_mode(is_on: bool):
 
1103
 
1104
  def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
1105
  if not record_id or not records:
1106
+ return pd.DataFrame(), records, "Nothing to save.", render_summary_card("", [])
1107
 
1108
  try:
1109
  dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
1110
  except Exception:
1111
+ return _overview_df_from_records(records), records, "Could not parse edited vertical table.", render_summary_card(record_id, records)
1112
 
1113
  dfv = dfv.dropna(subset=["Field"])
1114
  updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
 
1126
  new_records.append(r)
1127
 
1128
  msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
1129
+ return _overview_df_from_records(new_records), new_records, msg, render_summary_card(record_id, new_records)
1130
 
1131
 
1132
  def export_reviewed_csv(records: List[Dict[str, Any]]):
 
1143
  # =============================
1144
def run_synthesis(api_key, model, extraction_json_file):
    """Run the cross-paper synthesis over an uploaded extraction_details.json.

    Always returns a displayable string — either the synthesis text or a
    human-readable error — so the Gradio output textbox never sees an
    unhandled exception.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json from Extract tab first."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        # Missing/invalid API key: surface the message instead of crashing the UI.
        return str(e)

    try:
        rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as e:
        # Fix: an unreadable or malformed upload previously raised an
        # unhandled exception; report it like the other error paths.
        return f"Could not read extraction JSON: {e}"
    if not isinstance(rows, list):
        return "extraction_details.json should contain a list of extraction records."

    return openai_synthesize_across_papers(client, model, rows)
1153
 
1154
 
1155
  # =============================
1156
+ # Admin visibility helpers
1157
  # =============================
1158
  def set_admin_visibility(is_admin: bool):
1159
  return (
 
1178
  vocab_state = gr.State({})
1179
  field_rows_state = gr.State([])
1180
 
1181
+ field_spec = gr.Textbox(visible=False, interactive=False, lines=8)
1182
+ vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
1183
 
1184
  with gr.Tab("Extract"):
1185
+ # --- Run section (simple) ---
1186
  with gr.Group():
1187
  files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1188
 
 
1190
  api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1191
  model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1192
 
1193
with gr.Row():
    # Fix: this file spells the default preset name two ways — with a plain
    # space ("Required Safety Assessor") and with an en dash
    # ("Required – Safety Assessor"). Both index ENDPOINT_PRESETS, so one
    # spelling necessarily raises KeyError and/or leaves the dropdown's
    # default value outside its choices. Resolve the name against the real
    # dict so a spelling mismatch cannot break startup.
    _default_preset = "Required – Safety Assessor"
    if _default_preset not in ENDPOINT_PRESETS:
        # Fall back to the first declared preset rather than crashing.
        _default_preset = next(iter(ENDPOINT_PRESETS), None)
    endpoint_preset = gr.Dropdown(
        label="Endpoint preset",
        choices=list(ENDPOINT_PRESETS.keys()),
        value=_default_preset
    )
    endpoints = gr.Dropdown(
        label="Endpoints to extract (Core included automatically)",
        choices=list(ENDPOINT_MODULES.keys()),
        multiselect=True,
        value=ENDPOINT_PRESETS.get(_default_preset, [])
    )
1205
 
1206
  extract_btn = gr.Button("Run Extraction", variant="primary")
1207
  status = gr.Textbox(label="Status", interactive=False)
1208
 
1209
+ # --- Report (results-first) ---
1210
  gr.Markdown("## Report")
1211
+ summary_card = gr.HTML(render_summary_card("", []))
1212
+
1213
  overview_df = gr.Dataframe(
1214
  label="Batch Overview",
1215
  interactive=False,
 
1242
 
1243
  reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1244
 
1245
+ # --- Advanced runtime settings (collapsed) ---
1246
  with gr.Accordion("Advanced runtime settings", open=False):
1247
  with gr.Row():
1248
  max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1249
  chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1250
  max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1251
 
1252
+ # --- Admin tools (collapsed) ---
1253
  with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
1254
  admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1255
 
 
1258
  admin_fields_group = gr.Group(visible=False)
1259
 
1260
  with admin_group:
1261
+ gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
1262
 
1263
  with admin_vocab_group:
1264
  gr.Markdown("### Controlled vocabulary (lists only)")
 
1307
 
1308
  fields_status = gr.Textbox(label="Field builder status", interactive=False)
1309
 
1310
+ # --- Wiring ---
1311
  admin_mode.change(
1312
  fn=set_admin_visibility,
1313
  inputs=[admin_mode],
1314
  outputs=[admin_group, admin_vocab_group, admin_fields_group]
1315
  )
1316
 
1317
+ endpoint_preset.change(
1318
+ fn=apply_endpoint_preset,
1319
+ inputs=[endpoint_preset],
1320
+ outputs=[endpoints]
1321
+ )
1322
+
1323
  endpoints.change(
1324
  fn=sync_fields_from_endpoints,
1325
+ inputs=[endpoints, admin_mode, field_rows_state, field_spec],
1326
  outputs=[field_rows_state, fields_df, field_spec, status]
1327
  )
1328
 
1329
  extract_btn.click(
1330
  fn=run_extraction,
1331
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1332
+ outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1333
  )
1334
 
1335
  record_pick.change(
1336
  fn=on_pick,
1337
  inputs=[record_pick, state_records, state_details],
1338
+ outputs=[summary_card, vertical_view, evidence_md]
1339
  )
1340
 
1341
  review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
 
1343
  save_btn.click(
1344
  fn=save_review_changes,
1345
  inputs=[record_pick, vertical_view, state_records],
1346
+ outputs=[overview_df, state_records, review_status, summary_card]
1347
  )
1348
 
1349
  export_btn.click(
 
1352
  outputs=[reviewed_csv, review_status]
1353
  )
1354
 
1355
+ # Admin vocab wiring
1356
  vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
1357
 
1358
  vocab_category.change(
 
1389
  outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
1390
  )
1391
 
1392
+ # Admin field builder wiring
1393
  admin_apply_endpoints_btn.click(
1394
  fn=admin_apply_endpoints,
1395
  inputs=[endpoints],
 
1408
  outputs=[field_rows_state, fields_df, field_spec, fields_status]
1409
  )
1410
 
1411
+ # Init
1412
  def _init_all():
1413
  vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
1414
 
1415
+ default_endpoints = ENDPOINT_PRESETS["Required Safety Assessor"]
1416
+ rows, _, _ = build_rows_from_endpoints(default_endpoints)
1417
  fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
1418
  fspec = build_spec_from_field_rows(rows)
1419
 
 
1450
  )
1451
 
1452
  with gr.Tab("Cross-paper Synthesis"):
1453
+ gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
1454
  api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1455
  model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1456
  extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")