Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -135,6 +135,20 @@ ENDPOINT_MODULES: Dict[str, List[Dict[str, Any]]] = {
|
|
| 135 |
"Carcinogenicity": PRESET_CARCINOGENICITY,
|
| 136 |
}
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 139 |
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
|
| 140 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
|
@@ -264,10 +278,6 @@ def slugify_field(name: str) -> str:
|
|
| 264 |
|
| 265 |
|
| 266 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 267 |
-
"""
|
| 268 |
-
spec lines: Field Name | type | instructions
|
| 269 |
-
types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 270 |
-
"""
|
| 271 |
props: Dict[str, Any] = {}
|
| 272 |
instr: Dict[str, str] = {}
|
| 273 |
|
|
@@ -421,54 +431,6 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
|
|
| 421 |
return resp.output_text
|
| 422 |
|
| 423 |
|
| 424 |
-
# =============================
|
| 425 |
-
# UI helpers: vertical view + evidence + overview
|
| 426 |
-
# =============================
|
| 427 |
-
def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
|
| 428 |
-
if not records or not record_id:
|
| 429 |
-
return pd.DataFrame(columns=["Field", "Value"])
|
| 430 |
-
row = next((r for r in records if r.get("record_id") == record_id), None)
|
| 431 |
-
if not row:
|
| 432 |
-
return pd.DataFrame(columns=["Field", "Value"])
|
| 433 |
-
|
| 434 |
-
hidden = {"record_id"}
|
| 435 |
-
keys = [k for k in row.keys() if k not in hidden]
|
| 436 |
-
return pd.DataFrame({"Field": keys, "Value": [row[k] for k in keys]})
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
|
| 440 |
-
if not details or not file_name:
|
| 441 |
-
return ""
|
| 442 |
-
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 443 |
-
if not d:
|
| 444 |
-
return ""
|
| 445 |
-
ev = d.get("evidence", []) or []
|
| 446 |
-
lines = []
|
| 447 |
-
for e in ev:
|
| 448 |
-
field = (e.get("field", "") or "").strip()
|
| 449 |
-
if allowed_fields is not None and field and field not in allowed_fields:
|
| 450 |
-
continue
|
| 451 |
-
quote = (e.get("quote", "") or "").strip()
|
| 452 |
-
pages = (e.get("pages", "") or "").strip()
|
| 453 |
-
if quote:
|
| 454 |
-
if len(quote) > 320:
|
| 455 |
-
quote = quote[:320] + "…"
|
| 456 |
-
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 457 |
-
if len(lines) >= max_items:
|
| 458 |
-
break
|
| 459 |
-
header = "### Evidence (grounding)\n"
|
| 460 |
-
return header + ("\n".join(lines) if lines else "- (no evidence returned)")
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
| 464 |
-
if not records:
|
| 465 |
-
return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
|
| 466 |
-
df = pd.DataFrame(records)
|
| 467 |
-
cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
|
| 468 |
-
cols = [c for c in cols if c in df.columns]
|
| 469 |
-
return df[cols].copy() if cols else df.head(50)
|
| 470 |
-
|
| 471 |
-
|
| 472 |
# =============================
|
| 473 |
# Controlled vocab editor helpers (lists only) + search filter
|
| 474 |
# =============================
|
|
@@ -572,7 +534,7 @@ def vocab_filter_preview(terms_df, search):
|
|
| 572 |
|
| 573 |
|
| 574 |
# =============================
|
| 575 |
-
# Field
|
| 576 |
# =============================
|
| 577 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 578 |
|
|
@@ -606,14 +568,17 @@ def build_spec_from_field_rows(rows: List[Dict[str, Any]]) -> str:
|
|
| 606 |
return "\n".join(lines).strip() + "\n"
|
| 607 |
|
| 608 |
|
| 609 |
-
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
|
| 610 |
selected_endpoints = selected_endpoints or []
|
| 611 |
rows: List[Dict[str, Any]] = []
|
| 612 |
field_key_to_module: Dict[str, str] = {}
|
|
|
|
| 613 |
|
| 614 |
for r in PRESET_CORE:
|
| 615 |
rows.append(dict(r))
|
| 616 |
-
|
|
|
|
|
|
|
| 617 |
|
| 618 |
for module in selected_endpoints:
|
| 619 |
preset = ENDPOINT_MODULES.get(module)
|
|
@@ -621,7 +586,9 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
|
|
| 621 |
continue
|
| 622 |
for r in preset:
|
| 623 |
rows.append(dict(r))
|
| 624 |
-
|
|
|
|
|
|
|
| 625 |
|
| 626 |
seen = set()
|
| 627 |
deduped: List[Dict[str, Any]] = []
|
|
@@ -632,21 +599,30 @@ def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[
|
|
| 632 |
seen.add(k)
|
| 633 |
deduped.append(r)
|
| 634 |
|
| 635 |
-
|
| 636 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
-
|
|
|
|
| 640 |
if admin_mode:
|
| 641 |
-
|
| 642 |
-
|
|
|
|
| 643 |
df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 644 |
spec = build_spec_from_field_rows(rows)
|
| 645 |
return rows, df, spec, "✅ Columns updated from selected endpoints."
|
| 646 |
|
| 647 |
|
| 648 |
def admin_apply_endpoints(selected_endpoints: List[str]):
|
| 649 |
-
rows, _ = build_rows_from_endpoints(selected_endpoints)
|
| 650 |
df = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 651 |
spec = build_spec_from_field_rows(rows)
|
| 652 |
return rows, df, spec, "✅ Loaded selected endpoints into the builder (Replace)."
|
|
@@ -706,7 +682,7 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
|
|
| 706 |
|
| 707 |
|
| 708 |
# =============================
|
| 709 |
-
# Row
|
| 710 |
# =============================
|
| 711 |
def _as_list(x) -> List[str]:
|
| 712 |
if x is None:
|
|
@@ -728,12 +704,174 @@ def _format_value(v: Any) -> Any:
|
|
| 728 |
return v
|
| 729 |
|
| 730 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
|
| 732 |
chemical = (chemical or "").strip() or "-"
|
| 733 |
endpoint = (endpoint or "").strip() or "Paper"
|
| 734 |
return f"{file_name} | {chemical} | {endpoint}"
|
| 735 |
|
| 736 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
# =============================
|
| 738 |
# Main extraction handler
|
| 739 |
# =============================
|
|
@@ -751,6 +889,7 @@ def run_extraction(
|
|
| 751 |
):
|
| 752 |
if not files:
|
| 753 |
return (
|
|
|
|
| 754 |
pd.DataFrame(), None, None, "Upload one or more PDFs.",
|
| 755 |
gr.update(choices=[], value=None),
|
| 756 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -760,6 +899,7 @@ def run_extraction(
|
|
| 760 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 761 |
except Exception as e:
|
| 762 |
return (
|
|
|
|
| 763 |
pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
|
| 764 |
gr.update(choices=[], value=None),
|
| 765 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -768,6 +908,7 @@ def run_extraction(
|
|
| 768 |
field_props, field_instr = parse_field_spec(field_spec or "")
|
| 769 |
if not field_props:
|
| 770 |
return (
|
|
|
|
| 771 |
pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 772 |
gr.update(choices=[], value=None),
|
| 773 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -777,15 +918,17 @@ def run_extraction(
|
|
| 777 |
|
| 778 |
if admin_mode:
|
| 779 |
field_key_to_module = {k: "Custom" for k in field_props.keys()}
|
|
|
|
| 780 |
endpoint_modules_for_rows = ["Custom"]
|
| 781 |
else:
|
| 782 |
-
_, field_key_to_module = build_rows_from_endpoints(selected_endpoints or [])
|
| 783 |
endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
|
| 784 |
|
| 785 |
try:
|
| 786 |
client = get_openai_client(api_key)
|
| 787 |
except Exception as e:
|
| 788 |
return (
|
|
|
|
| 789 |
pd.DataFrame(), None, None, str(e),
|
| 790 |
gr.update(choices=[], value=None),
|
| 791 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
@@ -854,32 +997,55 @@ def run_extraction(
|
|
| 854 |
if not chemicals:
|
| 855 |
chemicals = ["-"]
|
| 856 |
|
|
|
|
| 857 |
if len(chemicals) <= 1:
|
|
|
|
| 858 |
row = dict(base)
|
| 859 |
-
row["chemical"] =
|
| 860 |
row["endpoint"] = "Paper"
|
| 861 |
-
row["record_id"] = _record_id(filename,
|
| 862 |
for k in field_props.keys():
|
| 863 |
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 864 |
output_rows.append(row)
|
|
|
|
|
|
|
| 865 |
else:
|
| 866 |
core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 883 |
|
| 884 |
df = pd.DataFrame(output_rows)
|
| 885 |
records = df.to_dict("records")
|
|
@@ -893,20 +1059,20 @@ def run_extraction(
|
|
| 893 |
default = choices[0] if choices else None
|
| 894 |
|
| 895 |
vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
|
|
|
|
| 896 |
allowed_fields = None
|
|
|
|
| 897 |
if default:
|
| 898 |
selected_row = next((r for r in records if r.get("record_id") == default), {})
|
| 899 |
allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
|
|
|
|
| 900 |
|
| 901 |
-
file_for_evidence = None
|
| 902 |
-
if default:
|
| 903 |
-
file_for_evidence = default.split(" | ")[0].strip()
|
| 904 |
evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
|
| 905 |
-
|
| 906 |
overview = _overview_df_from_records(records)
|
| 907 |
status = "✅ Done. Review in the report below and export when ready."
|
| 908 |
|
| 909 |
return (
|
|
|
|
| 910 |
overview,
|
| 911 |
str(csv_path),
|
| 912 |
str(json_path),
|
|
@@ -924,11 +1090,11 @@ def run_extraction(
|
|
| 924 |
# =============================
|
| 925 |
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
|
| 926 |
if not record_id:
|
| 927 |
-
return pd.DataFrame(columns=["Field","Value"]), ""
|
| 928 |
row = next((r for r in (records or []) if r.get("record_id") == record_id), {})
|
| 929 |
file_name = (row.get("file") or "")
|
| 930 |
allowed_fields = set(row.keys()) - {"record_id"}
|
| 931 |
-
return _make_vertical(records, record_id), _render_evidence(details, file_name, allowed_fields=allowed_fields)
|
| 932 |
|
| 933 |
|
| 934 |
def toggle_review_mode(is_on: bool):
|
|
@@ -937,12 +1103,12 @@ def toggle_review_mode(is_on: bool):
|
|
| 937 |
|
| 938 |
def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 939 |
if not record_id or not records:
|
| 940 |
-
return pd.DataFrame(), records, "Nothing to save."
|
| 941 |
|
| 942 |
try:
|
| 943 |
dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 944 |
except Exception:
|
| 945 |
-
return _overview_df_from_records(records), records, "Could not parse edited vertical table."
|
| 946 |
|
| 947 |
dfv = dfv.dropna(subset=["Field"])
|
| 948 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
|
@@ -960,7 +1126,7 @@ def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str
|
|
| 960 |
new_records.append(r)
|
| 961 |
|
| 962 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 963 |
-
return _overview_df_from_records(new_records), new_records, msg
|
| 964 |
|
| 965 |
|
| 966 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
@@ -977,19 +1143,17 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
| 977 |
# =============================
|
| 978 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 979 |
if extraction_json_file is None:
|
| 980 |
-
return "Upload the extraction_details.json from
|
| 981 |
-
|
| 982 |
try:
|
| 983 |
client = get_openai_client(api_key)
|
| 984 |
except Exception as e:
|
| 985 |
return str(e)
|
| 986 |
-
|
| 987 |
rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
|
| 988 |
return openai_synthesize_across_papers(client, model, rows)
|
| 989 |
|
| 990 |
|
| 991 |
# =============================
|
| 992 |
-
#
|
| 993 |
# =============================
|
| 994 |
def set_admin_visibility(is_admin: bool):
|
| 995 |
return (
|
|
@@ -1014,10 +1178,11 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1014 |
vocab_state = gr.State({})
|
| 1015 |
field_rows_state = gr.State([])
|
| 1016 |
|
| 1017 |
-
field_spec = gr.Textbox(visible=False, interactive=False, lines=8
|
| 1018 |
-
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8
|
| 1019 |
|
| 1020 |
with gr.Tab("Extract"):
|
|
|
|
| 1021 |
with gr.Group():
|
| 1022 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 1023 |
|
|
@@ -1025,17 +1190,26 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1025 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1026 |
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1027 |
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
|
| 1035 |
extract_btn = gr.Button("Run Extraction", variant="primary")
|
| 1036 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1037 |
|
|
|
|
| 1038 |
gr.Markdown("## Report")
|
|
|
|
|
|
|
| 1039 |
overview_df = gr.Dataframe(
|
| 1040 |
label="Batch Overview",
|
| 1041 |
interactive=False,
|
|
@@ -1068,12 +1242,14 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1068 |
|
| 1069 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1070 |
|
|
|
|
| 1071 |
with gr.Accordion("Advanced runtime settings", open=False):
|
| 1072 |
with gr.Row():
|
| 1073 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 1074 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 1075 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 1076 |
|
|
|
|
| 1077 |
with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
|
| 1078 |
admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
|
| 1079 |
|
|
@@ -1082,7 +1258,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1082 |
admin_fields_group = gr.Group(visible=False)
|
| 1083 |
|
| 1084 |
with admin_group:
|
| 1085 |
-
gr.Markdown("### Admin: Configure
|
| 1086 |
|
| 1087 |
with admin_vocab_group:
|
| 1088 |
gr.Markdown("### Controlled vocabulary (lists only)")
|
|
@@ -1131,28 +1307,35 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1131 |
|
| 1132 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1133 |
|
|
|
|
| 1134 |
admin_mode.change(
|
| 1135 |
fn=set_admin_visibility,
|
| 1136 |
inputs=[admin_mode],
|
| 1137 |
outputs=[admin_group, admin_vocab_group, admin_fields_group]
|
| 1138 |
)
|
| 1139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1140 |
endpoints.change(
|
| 1141 |
fn=sync_fields_from_endpoints,
|
| 1142 |
-
inputs=[endpoints, admin_mode],
|
| 1143 |
outputs=[field_rows_state, fields_df, field_spec, status]
|
| 1144 |
)
|
| 1145 |
|
| 1146 |
extract_btn.click(
|
| 1147 |
fn=run_extraction,
|
| 1148 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1149 |
-
outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1150 |
)
|
| 1151 |
|
| 1152 |
record_pick.change(
|
| 1153 |
fn=on_pick,
|
| 1154 |
inputs=[record_pick, state_records, state_details],
|
| 1155 |
-
outputs=[vertical_view, evidence_md]
|
| 1156 |
)
|
| 1157 |
|
| 1158 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
|
@@ -1160,7 +1343,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1160 |
save_btn.click(
|
| 1161 |
fn=save_review_changes,
|
| 1162 |
inputs=[record_pick, vertical_view, state_records],
|
| 1163 |
-
outputs=[overview_df, state_records, review_status]
|
| 1164 |
)
|
| 1165 |
|
| 1166 |
export_btn.click(
|
|
@@ -1169,6 +1352,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1169 |
outputs=[reviewed_csv, review_status]
|
| 1170 |
)
|
| 1171 |
|
|
|
|
| 1172 |
vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
|
| 1173 |
|
| 1174 |
vocab_category.change(
|
|
@@ -1205,6 +1389,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1205 |
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
|
| 1206 |
)
|
| 1207 |
|
|
|
|
| 1208 |
admin_apply_endpoints_btn.click(
|
| 1209 |
fn=admin_apply_endpoints,
|
| 1210 |
inputs=[endpoints],
|
|
@@ -1223,11 +1408,12 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1223 |
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1224 |
)
|
| 1225 |
|
|
|
|
| 1226 |
def _init_all():
|
| 1227 |
vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1228 |
|
| 1229 |
-
default_endpoints = ["
|
| 1230 |
-
rows, _ = build_rows_from_endpoints(default_endpoints)
|
| 1231 |
fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 1232 |
fspec = build_spec_from_field_rows(rows)
|
| 1233 |
|
|
@@ -1264,7 +1450,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1264 |
)
|
| 1265 |
|
| 1266 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1267 |
-
gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
|
| 1268 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1269 |
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1270 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
|
|
|
| 135 |
"Carcinogenicity": PRESET_CARCINOGENICITY,
|
| 136 |
}
|
| 137 |
|
| 138 |
+
# Endpoint presets (requested)
# Named bundles of endpoint modules shown to the user as one-click presets.
# Values must match keys of ENDPOINT_MODULES; an empty list means "Core only".
ENDPOINT_PRESETS: Dict[str, List[str]] = {
    "Required – Safety Assessor": [
        "Genotoxicity (OECD TG)",
        "Repeated dose toxicity",
        "Irritation / Sensitization",
        "Repro / Developmental",
        "Acute toxicity",
    ],
    "Core only (fast)": [],
    "Screening – NAMs + Genotox": ["NAMs / In Silico", "Genotoxicity (OECD TG)"],
    # NOTE: evaluated at import time, so it captures the modules defined above.
    "Full – All endpoints": list(ENDPOINT_MODULES.keys()),
}
|
| 151 |
+
|
| 152 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 153 |
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
|
| 154 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
|
|
|
| 278 |
|
| 279 |
|
| 280 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
props: Dict[str, Any] = {}
|
| 282 |
instr: Dict[str, str] = {}
|
| 283 |
|
|
|
|
| 431 |
return resp.output_text
|
| 432 |
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
# =============================
|
| 435 |
# Controlled vocab editor helpers (lists only) + search filter
|
| 436 |
# =============================
|
|
|
|
| 534 |
|
| 535 |
|
| 536 |
# =============================
|
| 537 |
+
# Field mapping from endpoints
|
| 538 |
# =============================
|
| 539 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 540 |
|
|
|
|
| 568 |
return "\n".join(lines).strip() + "\n"
|
| 569 |
|
| 570 |
|
| 571 |
+
def build_rows_from_endpoints(selected_endpoints: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, str], Dict[str, List[str]]]:
|
| 572 |
selected_endpoints = selected_endpoints or []
|
| 573 |
rows: List[Dict[str, Any]] = []
|
| 574 |
field_key_to_module: Dict[str, str] = {}
|
| 575 |
+
module_to_keys: Dict[str, List[str]] = {}
|
| 576 |
|
| 577 |
for r in PRESET_CORE:
|
| 578 |
rows.append(dict(r))
|
| 579 |
+
k = slugify_field(r["field"])
|
| 580 |
+
field_key_to_module[k] = "Core"
|
| 581 |
+
module_to_keys.setdefault("Core", []).append(k)
|
| 582 |
|
| 583 |
for module in selected_endpoints:
|
| 584 |
preset = ENDPOINT_MODULES.get(module)
|
|
|
|
| 586 |
continue
|
| 587 |
for r in preset:
|
| 588 |
rows.append(dict(r))
|
| 589 |
+
k = slugify_field(r["field"])
|
| 590 |
+
field_key_to_module[k] = module
|
| 591 |
+
module_to_keys.setdefault(module, []).append(k)
|
| 592 |
|
| 593 |
seen = set()
|
| 594 |
deduped: List[Dict[str, Any]] = []
|
|
|
|
| 599 |
seen.add(k)
|
| 600 |
deduped.append(r)
|
| 601 |
|
| 602 |
+
# Rebuild module_to_keys to match deduped
|
| 603 |
+
dedup_keys = set([slugify_field(r["field"]) for r in deduped])
|
| 604 |
+
module_to_keys = {m: [k for k in ks if k in dedup_keys] for m, ks in module_to_keys.items()}
|
| 605 |
+
|
| 606 |
+
return deduped, field_key_to_module, module_to_keys
|
| 607 |
+
|
| 608 |
|
| 609 |
+
def apply_endpoint_preset(preset_name: str):
    """Resolve a named preset into a Gradio update for the endpoints selector.

    Unknown preset names fall back to an empty selection.
    """
    selection = ENDPOINT_PRESETS.get(preset_name, [])
    return gr.update(value=selection)
|
| 612 |
|
| 613 |
+
|
| 614 |
+
def sync_fields_from_endpoints(selected_endpoints: List[str], admin_mode: bool, current_rows: List[Dict[str, Any]], current_spec: str):
    """Refresh the field-builder state when the endpoint selection changes.

    In admin mode the custom columns are left untouched and simply echoed
    back; otherwise rows, dataframe, and spec are rebuilt from the selected
    endpoint modules. Returns (rows, dataframe, spec, status message).
    """
    cols = ["field", "type", "enum_values", "instructions"]
    if admin_mode:
        # Admin-defined columns win over endpoint-derived ones.
        return (
            current_rows,
            pd.DataFrame(current_rows or [], columns=cols),
            current_spec,
            "Admin mode: endpoint selection will not overwrite custom columns.",
        )
    rebuilt_rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    rebuilt_spec = build_spec_from_field_rows(rebuilt_rows)
    return (
        rebuilt_rows,
        pd.DataFrame(rebuilt_rows, columns=cols),
        rebuilt_spec,
        "✅ Columns updated from selected endpoints.",
    )
|
| 622 |
|
| 623 |
|
| 624 |
def admin_apply_endpoints(selected_endpoints: List[str]):
    """Load the selected endpoint modules into the admin field builder (replace mode).

    Returns (rows, dataframe, spec, status message).
    """
    preset_rows, _, _ = build_rows_from_endpoints(selected_endpoints or [])
    preset_spec = build_spec_from_field_rows(preset_rows)
    frame = pd.DataFrame(preset_rows, columns=["field", "type", "enum_values", "instructions"])
    return preset_rows, frame, preset_spec, "✅ Loaded selected endpoints into the builder (Replace)."
|
|
|
|
| 682 |
|
| 683 |
|
| 684 |
# =============================
|
| 685 |
+
# Row building + “non-empty module” logic
|
| 686 |
# =============================
|
| 687 |
def _as_list(x) -> List[str]:
|
| 688 |
if x is None:
|
|
|
|
| 704 |
return v
|
| 705 |
|
| 706 |
|
| 707 |
+
EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
def _is_empty_value(v: Any) -> bool:
|
| 711 |
+
if v is None:
|
| 712 |
+
return True
|
| 713 |
+
if isinstance(v, float) and np.isnan(v):
|
| 714 |
+
return True
|
| 715 |
+
if isinstance(v, list):
|
| 716 |
+
cleaned = [str(x).strip() for x in v if str(x).strip()]
|
| 717 |
+
if not cleaned:
|
| 718 |
+
return True
|
| 719 |
+
# empty if all items are not_reported / similar
|
| 720 |
+
return all((c.lower() in EMPTY_STRINGS) for c in cleaned)
|
| 721 |
+
s = str(v).strip()
|
| 722 |
+
if not s:
|
| 723 |
+
return True
|
| 724 |
+
return s.lower() in EMPTY_STRINGS
|
| 725 |
+
|
| 726 |
+
|
| 727 |
def _record_id(file_name: str, chemical: str, endpoint: str) -> str:
|
| 728 |
chemical = (chemical or "").strip() or "-"
|
| 729 |
endpoint = (endpoint or "").strip() or "Paper"
|
| 730 |
return f"{file_name} | {chemical} | {endpoint}"
|
| 731 |
|
| 732 |
|
| 733 |
+
def _module_has_any_data(ext: Dict[str, Any], module_keys: List[str], field_props: Dict[str, Any]) -> bool:
    """Return True if the extraction holds a non-empty value for any key of the module.

    ``field_props`` is accepted for signature compatibility with callers but
    is not consulted here.
    """
    return any(not _is_empty_value(ext.get(key)) for key in (module_keys or []))
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
# =============================
|
| 742 |
+
# Evidence + report helpers
|
| 743 |
+
# =============================
|
| 744 |
+
def _make_vertical(records: List[Dict[str, Any]], record_id: str) -> pd.DataFrame:
|
| 745 |
+
if not records or not record_id:
|
| 746 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 747 |
+
row = next((r for r in records if r.get("record_id") == record_id), None)
|
| 748 |
+
if not row:
|
| 749 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 750 |
+
|
| 751 |
+
hidden = {"record_id"}
|
| 752 |
+
keys = [k for k in row.keys() if k not in hidden]
|
| 753 |
+
return pd.DataFrame({"Field": keys, "Value": [row.get(k, "") for k in keys]})
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def _render_evidence(details: List[Dict[str, Any]], file_name: str, allowed_fields: Optional[set] = None, max_items: int = 120) -> str:
|
| 757 |
+
if not details or not file_name:
|
| 758 |
+
return ""
|
| 759 |
+
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 760 |
+
if not d:
|
| 761 |
+
return ""
|
| 762 |
+
ev = d.get("evidence", []) or []
|
| 763 |
+
lines = []
|
| 764 |
+
for e in ev:
|
| 765 |
+
field = (e.get("field", "") or "").strip()
|
| 766 |
+
if allowed_fields is not None and field and field not in allowed_fields:
|
| 767 |
+
continue
|
| 768 |
+
quote = (e.get("quote", "") or "").strip()
|
| 769 |
+
pages = (e.get("pages", "") or "").strip()
|
| 770 |
+
if quote:
|
| 771 |
+
if len(quote) > 320:
|
| 772 |
+
quote = quote[:320] + "…"
|
| 773 |
+
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 774 |
+
if len(lines) >= max_items:
|
| 775 |
+
break
|
| 776 |
+
header = "### Evidence (grounding)\n"
|
| 777 |
+
return header + ("\n".join(lines) if lines else "- (no evidence returned)")
|
| 778 |
+
|
| 779 |
+
|
| 780 |
+
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
| 781 |
+
if not records:
|
| 782 |
+
return pd.DataFrame(columns=["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"])
|
| 783 |
+
df = pd.DataFrame(records)
|
| 784 |
+
cols = ["record_id","file","paper_title","chemical","endpoint","risk_stance","risk_confidence"]
|
| 785 |
+
cols = [c for c in cols if c in df.columns]
|
| 786 |
+
return df[cols].copy() if cols else df.head(50)
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
def _risk_badge(risk: str) -> str:
|
| 790 |
+
r = (risk or "").strip().lower()
|
| 791 |
+
if r == "acceptable":
|
| 792 |
+
bg = "#e7f7ed"; fg = "#0f5132"
|
| 793 |
+
elif r == "acceptable_with_uncertainty":
|
| 794 |
+
bg = "#fff3cd"; fg = "#664d03"
|
| 795 |
+
elif r == "not_acceptable":
|
| 796 |
+
bg = "#f8d7da"; fg = "#842029"
|
| 797 |
+
else:
|
| 798 |
+
bg = "#e2e3e5"; fg = "#41464b"
|
| 799 |
+
label = risk if risk else "unknown"
|
| 800 |
+
return f'<span style="background:{bg};color:{fg};padding:4px 10px;border-radius:999px;font-weight:600;font-size:12px;">{label}</span>'
|
| 801 |
+
|
| 802 |
+
|
| 803 |
+
def _safe_str(x: Any) -> str:
|
| 804 |
+
if x is None:
|
| 805 |
+
return ""
|
| 806 |
+
if isinstance(x, float) and np.isnan(x):
|
| 807 |
+
return ""
|
| 808 |
+
return str(x)
|
| 809 |
+
|
| 810 |
+
|
| 811 |
+
def render_summary_card(record_id: str, records: List[Dict[str, Any]]) -> str:
    """Render the Executive Summary HTML card for the selected record.

    Returns a self-contained HTML snippet (inline styles only). Falls back
    to placeholder cards when nothing has been extracted yet or when the
    record id cannot be found in the session records.
    """
    if not record_id or not records:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Run extraction to view results.</div></div>"

    row = next((r for r in records if r.get("record_id") == record_id), None)
    if not row:
        return "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Select a record.</div></div>"

    title = _safe_str(row.get("paper_title", "")).strip() or "Untitled paper"
    file_name = _safe_str(row.get("file", ""))
    chemical = _safe_str(row.get("chemical", "-"))
    endpoint = _safe_str(row.get("endpoint", "Paper"))
    risk = _safe_str(row.get("risk_stance", ""))
    conf = row.get("risk_confidence", "")
    try:
        # Show confidence with 2 decimals when it parses as a float;
        # otherwise fall back to the raw (stringified) value below.
        conf_txt = f"{float(conf):.2f}" if conf != "" else ""
    except Exception:
        conf_txt = _safe_str(conf)

    key_findings = _safe_str(row.get("key_findings", "")).strip()
    dose_metrics = _safe_str(row.get("dose_metrics", "")).strip()
    conclusion = _safe_str(row.get("conclusion", "")).strip()

    # Keep compact
    def _clip(s: str, n: int = 380) -> str:
        # Truncate long narrative text so the card stays readable.
        s = s.strip()
        if len(s) <= n:
            return s
        return s[:n] + "…"

    return f"""
<div style="border:1px solid #eaeaea;padding:14px;border-radius:12px;">
  <div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;">
    <div style="font-weight:700;font-size:16px;">Executive Summary</div>
    <div>{_risk_badge(risk)} <span style="margin-left:10px;color:#666;font-size:12px;">confidence: {conf_txt}</span></div>
  </div>

  <div style="margin-top:10px;">
    <div style="font-weight:650;">{title}</div>
    <div style="color:#666;font-size:12px;margin-top:4px;">
      <span><b>File:</b> {file_name}</span> •
      <span><b>Chemical:</b> {chemical}</span> •
      <span><b>Endpoint:</b> {endpoint}</span>
    </div>
  </div>

  <div style="margin-top:12px;display:grid;grid-template-columns:1fr;gap:10px;">
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Key Findings</div>
      <div style="color:#222;">{_clip(key_findings) if key_findings else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Dose Metrics</div>
      <div style="color:#222;">{_clip(dose_metrics) if dose_metrics else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
    <div>
      <div style="font-weight:650;margin-bottom:4px;">Conclusion</div>
      <div style="color:#222;">{_clip(conclusion) if conclusion else "<span style='color:#666'>(not reported)</span>"}</div>
    </div>
  </div>
</div>
"""
|
| 873 |
+
|
| 874 |
+
|
| 875 |
# =============================
|
| 876 |
# Main extraction handler
|
| 877 |
# =============================
|
|
|
|
| 889 |
):
|
| 890 |
if not files:
|
| 891 |
return (
|
| 892 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
|
| 893 |
pd.DataFrame(), None, None, "Upload one or more PDFs.",
|
| 894 |
gr.update(choices=[], value=None),
|
| 895 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 899 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 900 |
except Exception as e:
|
| 901 |
return (
|
| 902 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
|
| 903 |
pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
|
| 904 |
gr.update(choices=[], value=None),
|
| 905 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 908 |
field_props, field_instr = parse_field_spec(field_spec or "")
|
| 909 |
if not field_props:
|
| 910 |
return (
|
| 911 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
|
| 912 |
pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 913 |
gr.update(choices=[], value=None),
|
| 914 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 918 |
|
| 919 |
if admin_mode:
|
| 920 |
field_key_to_module = {k: "Custom" for k in field_props.keys()}
|
| 921 |
+
module_to_keys: Dict[str, List[str]] = {"Custom": list(field_props.keys())}
|
| 922 |
endpoint_modules_for_rows = ["Custom"]
|
| 923 |
else:
|
| 924 |
+
_, field_key_to_module, module_to_keys = build_rows_from_endpoints(selected_endpoints or [])
|
| 925 |
endpoint_modules_for_rows = list(selected_endpoints or []) or ["Core"]
|
| 926 |
|
| 927 |
try:
|
| 928 |
client = get_openai_client(api_key)
|
| 929 |
except Exception as e:
|
| 930 |
return (
|
| 931 |
+
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
|
| 932 |
pd.DataFrame(), None, None, str(e),
|
| 933 |
gr.update(choices=[], value=None),
|
| 934 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
|
|
|
| 997 |
if not chemicals:
|
| 998 |
chemicals = ["-"]
|
| 999 |
|
| 1000 |
+
# Single-chemical => one-row-per-paper
|
| 1001 |
if len(chemicals) <= 1:
|
| 1002 |
+
chem = chemicals[0]
|
| 1003 |
row = dict(base)
|
| 1004 |
+
row["chemical"] = chem
|
| 1005 |
row["endpoint"] = "Paper"
|
| 1006 |
+
row["record_id"] = _record_id(filename, chem, row["endpoint"])
|
| 1007 |
for k in field_props.keys():
|
| 1008 |
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1009 |
output_rows.append(row)
|
| 1010 |
+
|
| 1011 |
+
# Multi-chemical => chemical–endpoint rows (ONLY non-empty modules)
|
| 1012 |
else:
|
| 1013 |
core_keys = [k for k, m in field_key_to_module.items() if m == "Core"] if not admin_mode else []
|
| 1014 |
+
|
| 1015 |
+
# determine which endpoint modules have any data (skip empty ones)
|
| 1016 |
+
candidate_modules = [m for m in endpoint_modules_for_rows if m != "Core"]
|
| 1017 |
+
non_empty_modules = []
|
| 1018 |
+
for m in candidate_modules:
|
| 1019 |
+
if _module_has_any_data(ext, module_to_keys.get(m, []), field_props):
|
| 1020 |
+
non_empty_modules.append(m)
|
| 1021 |
+
|
| 1022 |
+
# If everything empty, fall back to a single Paper row (otherwise you get no rows)
|
| 1023 |
+
if not non_empty_modules:
|
| 1024 |
+
row = dict(base)
|
| 1025 |
+
row["chemical"] = "multiple"
|
| 1026 |
+
row["endpoint"] = "Paper"
|
| 1027 |
+
row["record_id"] = _record_id(filename, row["chemical"], row["endpoint"])
|
| 1028 |
+
for k in field_props.keys():
|
| 1029 |
+
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1030 |
+
output_rows.append(row)
|
| 1031 |
+
else:
|
| 1032 |
+
for chem in chemicals:
|
| 1033 |
+
for module in non_empty_modules:
|
| 1034 |
+
row = dict(base)
|
| 1035 |
+
row["chemical"] = chem
|
| 1036 |
+
row["endpoint"] = module
|
| 1037 |
+
row["record_id"] = _record_id(filename, chem, module)
|
| 1038 |
+
|
| 1039 |
+
for k in field_props.keys():
|
| 1040 |
+
m = field_key_to_module.get(k, "Custom")
|
| 1041 |
+
include = (k in core_keys) or (m == module) or admin_mode
|
| 1042 |
+
if include:
|
| 1043 |
+
if k == "chemicals":
|
| 1044 |
+
row[k] = chem
|
| 1045 |
+
else:
|
| 1046 |
+
row[k] = _format_value(ext.get(k, [] if field_props[k].get("type") == "array" else ""))
|
| 1047 |
+
|
| 1048 |
+
output_rows.append(row)
|
| 1049 |
|
| 1050 |
df = pd.DataFrame(output_rows)
|
| 1051 |
records = df.to_dict("records")
|
|
|
|
| 1059 |
default = choices[0] if choices else None
|
| 1060 |
|
| 1061 |
vertical = _make_vertical(records, default) if default else pd.DataFrame(columns=["Field","Value"])
|
| 1062 |
+
summary_html = render_summary_card(default, records) if default else render_summary_card("", [])
|
| 1063 |
allowed_fields = None
|
| 1064 |
+
file_for_evidence = None
|
| 1065 |
if default:
|
| 1066 |
selected_row = next((r for r in records if r.get("record_id") == default), {})
|
| 1067 |
allowed_fields = set([k for k in selected_row.keys() if k not in {"record_id"}])
|
| 1068 |
+
file_for_evidence = (default.split(" | ")[0] or "").strip()
|
| 1069 |
|
|
|
|
|
|
|
|
|
|
| 1070 |
evidence = _render_evidence(paper_details, file_for_evidence, allowed_fields=allowed_fields) if file_for_evidence else ""
|
|
|
|
| 1071 |
overview = _overview_df_from_records(records)
|
| 1072 |
status = "✅ Done. Review in the report below and export when ready."
|
| 1073 |
|
| 1074 |
return (
|
| 1075 |
+
summary_html,
|
| 1076 |
overview,
|
| 1077 |
str(csv_path),
|
| 1078 |
str(json_path),
|
|
|
|
| 1090 |
# =============================
|
| 1091 |
def on_pick(record_id: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the summary card, vertical field table, and evidence panel
    whenever a record is selected in the dropdown."""
    if not record_id:
        # Nothing selected: reset all three output widgets.
        return render_summary_card("", []), pd.DataFrame(columns=["Field", "Value"]), ""

    # Find the selected record; fall back to an empty dict when absent.
    selected: Dict[str, Any] = {}
    for rec in records or []:
        if rec.get("record_id") == record_id:
            selected = rec
            break

    evidence_file = selected.get("file") or ""
    visible_fields = {k for k in selected if k != "record_id"}

    summary = render_summary_card(record_id, records)
    vertical = _make_vertical(records, record_id)
    evidence = _render_evidence(details, evidence_file, allowed_fields=visible_fields)
    return summary, vertical, evidence
|
| 1098 |
|
| 1099 |
|
| 1100 |
def toggle_review_mode(is_on: bool):
|
|
|
|
| 1103 |
|
| 1104 |
def save_review_changes(record_id: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 1105 |
if not record_id or not records:
|
| 1106 |
+
return pd.DataFrame(), records, "Nothing to save.", render_summary_card("", [])
|
| 1107 |
|
| 1108 |
try:
|
| 1109 |
dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 1110 |
except Exception:
|
| 1111 |
+
return _overview_df_from_records(records), records, "Could not parse edited vertical table.", render_summary_card(record_id, records)
|
| 1112 |
|
| 1113 |
dfv = dfv.dropna(subset=["Field"])
|
| 1114 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
|
|
|
| 1126 |
new_records.append(r)
|
| 1127 |
|
| 1128 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 1129 |
+
return _overview_df_from_records(new_records), new_records, msg, render_summary_card(record_id, new_records)
|
| 1130 |
|
| 1131 |
|
| 1132 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
|
|
| 1143 |
# =============================
|
| 1144 |
def run_synthesis(api_key, model, extraction_json_file):
    """Run cross-paper synthesis over an exported extraction_details.json.

    Args:
        api_key: OpenAI API key (may be empty when set via environment secret).
        model: Model name to use for synthesis.
        extraction_json_file: Uploaded file object (has a .name path) or None.

    Returns:
        The synthesis text, or a human-readable error message. This handler
        deliberately reports failures as strings rather than raising, so the
        Gradio output textbox always receives something displayable.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json from Extract tab first."
    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)
    # Guard file reading/parsing: a malformed or non-UTF-8 upload previously
    # raised out of the handler instead of reporting like the paths above.
    try:
        rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
    except Exception as e:
        return f"Could not read extraction JSON: {e}"
    return openai_synthesize_across_papers(client, model, rows)
|
| 1153 |
|
| 1154 |
|
| 1155 |
# =============================
|
| 1156 |
+
# Admin visibility helpers
|
| 1157 |
# =============================
|
| 1158 |
def set_admin_visibility(is_admin: bool):
|
| 1159 |
return (
|
|
|
|
| 1178 |
vocab_state = gr.State({})
|
| 1179 |
field_rows_state = gr.State([])
|
| 1180 |
|
| 1181 |
+
field_spec = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1182 |
+
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1183 |
|
| 1184 |
with gr.Tab("Extract"):
|
| 1185 |
+
# --- Run section (simple) ---
|
| 1186 |
with gr.Group():
|
| 1187 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 1188 |
|
|
|
|
| 1190 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1191 |
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1192 |
|
| 1193 |
+
with gr.Row():
|
| 1194 |
+
endpoint_preset = gr.Dropdown(
|
| 1195 |
+
label="Endpoint preset",
|
| 1196 |
+
choices=list(ENDPOINT_PRESETS.keys()),
|
| 1197 |
+
value="Required – Safety Assessor"
|
| 1198 |
+
)
|
| 1199 |
+
endpoints = gr.Dropdown(
|
| 1200 |
+
label="Endpoints to extract (Core included automatically)",
|
| 1201 |
+
choices=list(ENDPOINT_MODULES.keys()),
|
| 1202 |
+
multiselect=True,
|
| 1203 |
+
value=ENDPOINT_PRESETS["Required – Safety Assessor"]
|
| 1204 |
+
)
|
| 1205 |
|
| 1206 |
extract_btn = gr.Button("Run Extraction", variant="primary")
|
| 1207 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1208 |
|
| 1209 |
+
# --- Report (results-first) ---
|
| 1210 |
gr.Markdown("## Report")
|
| 1211 |
+
summary_card = gr.HTML(render_summary_card("", []))
|
| 1212 |
+
|
| 1213 |
overview_df = gr.Dataframe(
|
| 1214 |
label="Batch Overview",
|
| 1215 |
interactive=False,
|
|
|
|
| 1242 |
|
| 1243 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1244 |
|
| 1245 |
+
# --- Advanced runtime settings (collapsed) ---
|
| 1246 |
with gr.Accordion("Advanced runtime settings", open=False):
|
| 1247 |
with gr.Row():
|
| 1248 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 1249 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 1250 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 1251 |
|
| 1252 |
+
# --- Admin tools (collapsed) ---
|
| 1253 |
with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
|
| 1254 |
admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
|
| 1255 |
|
|
|
|
| 1258 |
admin_fields_group = gr.Group(visible=False)
|
| 1259 |
|
| 1260 |
with admin_group:
|
| 1261 |
+
gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
|
| 1262 |
|
| 1263 |
with admin_vocab_group:
|
| 1264 |
gr.Markdown("### Controlled vocabulary (lists only)")
|
|
|
|
| 1307 |
|
| 1308 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1309 |
|
| 1310 |
+
# --- Wiring ---
|
| 1311 |
admin_mode.change(
|
| 1312 |
fn=set_admin_visibility,
|
| 1313 |
inputs=[admin_mode],
|
| 1314 |
outputs=[admin_group, admin_vocab_group, admin_fields_group]
|
| 1315 |
)
|
| 1316 |
|
| 1317 |
+
endpoint_preset.change(
|
| 1318 |
+
fn=apply_endpoint_preset,
|
| 1319 |
+
inputs=[endpoint_preset],
|
| 1320 |
+
outputs=[endpoints]
|
| 1321 |
+
)
|
| 1322 |
+
|
| 1323 |
endpoints.change(
|
| 1324 |
fn=sync_fields_from_endpoints,
|
| 1325 |
+
inputs=[endpoints, admin_mode, field_rows_state, field_spec],
|
| 1326 |
outputs=[field_rows_state, fields_df, field_spec, status]
|
| 1327 |
)
|
| 1328 |
|
| 1329 |
extract_btn.click(
|
| 1330 |
fn=run_extraction,
|
| 1331 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1332 |
+
outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1333 |
)
|
| 1334 |
|
| 1335 |
record_pick.change(
|
| 1336 |
fn=on_pick,
|
| 1337 |
inputs=[record_pick, state_records, state_details],
|
| 1338 |
+
outputs=[summary_card, vertical_view, evidence_md]
|
| 1339 |
)
|
| 1340 |
|
| 1341 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
|
|
|
| 1343 |
save_btn.click(
|
| 1344 |
fn=save_review_changes,
|
| 1345 |
inputs=[record_pick, vertical_view, state_records],
|
| 1346 |
+
outputs=[overview_df, state_records, review_status, summary_card]
|
| 1347 |
)
|
| 1348 |
|
| 1349 |
export_btn.click(
|
|
|
|
| 1352 |
outputs=[reviewed_csv, review_status]
|
| 1353 |
)
|
| 1354 |
|
| 1355 |
+
# Admin vocab wiring
|
| 1356 |
vocab_search.change(fn=vocab_filter_preview, inputs=[vocab_terms_df, vocab_search], outputs=[vocab_terms_filtered])
|
| 1357 |
|
| 1358 |
vocab_category.change(
|
|
|
|
| 1389 |
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_terms_filtered, vocab_json_admin, vocab_status, vocab_json]
|
| 1390 |
)
|
| 1391 |
|
| 1392 |
+
# Admin field builder wiring
|
| 1393 |
admin_apply_endpoints_btn.click(
|
| 1394 |
fn=admin_apply_endpoints,
|
| 1395 |
inputs=[endpoints],
|
|
|
|
| 1408 |
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1409 |
)
|
| 1410 |
|
| 1411 |
+
# Init
|
| 1412 |
def _init_all():
|
| 1413 |
vocab, keys, k0, full_df, filtered_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1414 |
|
| 1415 |
+
default_endpoints = ENDPOINT_PRESETS["Required – Safety Assessor"]
|
| 1416 |
+
rows, _, _ = build_rows_from_endpoints(default_endpoints)
|
| 1417 |
fdf = pd.DataFrame(rows, columns=["field","type","enum_values","instructions"])
|
| 1418 |
fspec = build_spec_from_field_rows(rows)
|
| 1419 |
|
|
|
|
| 1450 |
)
|
| 1451 |
|
| 1452 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1453 |
+
gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
|
| 1454 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1455 |
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1456 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|