Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -57,6 +57,7 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
|
| 57 |
"genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
|
| 58 |
}"""
|
| 59 |
|
|
|
|
| 60 |
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 61 |
# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 62 |
|
|
@@ -106,7 +107,7 @@ PRESET_CORE = [
|
|
| 106 |
{"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
|
| 107 |
{"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
|
| 108 |
{"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
|
| 109 |
-
{"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include
|
| 110 |
{"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
|
| 111 |
{"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
|
| 112 |
]
|
|
@@ -409,7 +410,7 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
|
|
| 409 |
|
| 410 |
|
| 411 |
# =============================
|
| 412 |
-
# UI helpers: vertical view + evidence
|
| 413 |
# =============================
|
| 414 |
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 415 |
if not records or not file_name:
|
|
@@ -437,9 +438,7 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
|
|
| 437 |
quote = quote[:280] + "…"
|
| 438 |
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 439 |
header = "### Evidence (grounding)\n"
|
| 440 |
-
if
|
| 441 |
-
lines = ["- (no evidence returned)"]
|
| 442 |
-
return header + "\n".join(lines)
|
| 443 |
|
| 444 |
|
| 445 |
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
|
@@ -447,14 +446,13 @@ def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
|
| 447 |
return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
|
| 448 |
df = pd.DataFrame(records)
|
| 449 |
cols = ["file","paper_title","risk_stance","risk_confidence"]
|
| 450 |
-
# Include chemicals if present
|
| 451 |
-
for c in ["chemicals", "chemical_s", "chemical", "chemical_s_"]:
|
| 452 |
-
if c in df.columns and c not in cols:
|
| 453 |
-
cols.append(c)
|
| 454 |
-
break
|
| 455 |
cols = [c for c in cols if c in df.columns]
|
| 456 |
return df[cols].copy() if cols else df.head(50)
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 459 |
if df is None or df.empty:
|
| 460 |
return pd.DataFrame(columns=["term"])
|
|
@@ -462,20 +460,8 @@ def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
|
| 462 |
if not q:
|
| 463 |
return df[["term"]].copy()
|
| 464 |
mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
|
| 465 |
-
|
| 466 |
-
return out
|
| 467 |
-
|
| 468 |
-
# =============================
|
| 469 |
-
# Controlled vocab guided editor (lists only)
|
| 470 |
-
# =============================
|
| 471 |
-
vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
|
| 472 |
|
| 473 |
-
vocab_terms_filtered = gr.Dataframe(
|
| 474 |
-
headers=["term"],
|
| 475 |
-
label="Filtered preview (read-only)",
|
| 476 |
-
interactive=False,
|
| 477 |
-
wrap=True
|
| 478 |
-
)
|
| 479 |
|
| 480 |
def vocab_init_state(vocab_json: str):
|
| 481 |
try:
|
|
@@ -486,8 +472,8 @@ def vocab_init_state(vocab_json: str):
|
|
| 486 |
list_keys = sorted([k for k, v in vocab.items() if isinstance(v, list)])
|
| 487 |
default_key = list_keys[0] if list_keys else None
|
| 488 |
terms = vocab.get(default_key, []) if default_key else []
|
| 489 |
-
|
| 490 |
-
return vocab, list_keys, default_key,
|
| 491 |
|
| 492 |
|
| 493 |
def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
|
|
@@ -536,13 +522,9 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
|
|
| 536 |
return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."
|
| 537 |
|
| 538 |
try:
|
| 539 |
-
if isinstance(terms_df, pd.DataFrame)
|
| 540 |
-
df = terms_df
|
| 541 |
-
else:
|
| 542 |
-
df = pd.DataFrame(terms_df, columns=["term"])
|
| 543 |
except Exception:
|
| 544 |
-
|
| 545 |
-
return vjson, pd.DataFrame(columns=["term"]), "Could not parse terms table."
|
| 546 |
|
| 547 |
terms = []
|
| 548 |
for t in df.get("term", []).tolist():
|
|
@@ -559,6 +541,7 @@ def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, se
|
|
| 559 |
def vocab_reset_defaults():
|
| 560 |
return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 561 |
|
|
|
|
| 562 |
def vocab_filter_preview(terms_df, search):
|
| 563 |
try:
|
| 564 |
df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
|
|
@@ -566,26 +549,12 @@ def vocab_filter_preview(terms_df, search):
|
|
| 566 |
df = pd.DataFrame(columns=["term"])
|
| 567 |
return _filter_terms_df(df, search)
|
| 568 |
|
| 569 |
-
vocab_search.change(
|
| 570 |
-
fn=vocab_filter_preview,
|
| 571 |
-
inputs=[vocab_terms_df, vocab_search],
|
| 572 |
-
outputs=[vocab_terms_filtered]
|
| 573 |
-
)
|
| 574 |
-
|
| 575 |
|
| 576 |
# =============================
|
| 577 |
# Field builder (type dropdown + presets)
|
| 578 |
# =============================
|
| 579 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 580 |
|
| 581 |
-
def fields_init_state():
|
| 582 |
-
# start from DEFAULT_FIELD_SPEC by showing a friendly default builder (Core + Genotox + NAMs)
|
| 583 |
-
fields = []
|
| 584 |
-
for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD):
|
| 585 |
-
fields.append(dict(row))
|
| 586 |
-
df = pd.DataFrame(fields, columns=["field","type","enum_values","instructions"])
|
| 587 |
-
spec = build_spec_from_field_df(df)
|
| 588 |
-
return fields, df, spec, "✅ Field builder loaded."
|
| 589 |
|
| 590 |
def build_spec_from_field_df(df: pd.DataFrame) -> str:
|
| 591 |
lines = [
|
|
@@ -602,11 +571,6 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
|
|
| 602 |
if not field or not ftype:
|
| 603 |
continue
|
| 604 |
|
| 605 |
-
# normalize types
|
| 606 |
-
if ftype not in TYPE_CHOICES:
|
| 607 |
-
# keep as-is, but likely invalid; user can fix
|
| 608 |
-
pass
|
| 609 |
-
|
| 610 |
if ftype == "enum":
|
| 611 |
vals = [v.strip() for v in enums.split(",") if v.strip()]
|
| 612 |
type_str = f"enum[{','.join(vals)}]" if vals else "str"
|
|
@@ -620,6 +584,41 @@ def build_spec_from_field_df(df: pd.DataFrame) -> str:
|
|
| 620 |
|
| 621 |
return "\n".join(lines).strip() + "\n"
|
| 622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
|
| 624 |
field_name = (field_name or "").strip()
|
| 625 |
ftype = (ftype or "").strip()
|
|
@@ -630,7 +629,6 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
|
|
| 630 |
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 631 |
return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
|
| 632 |
|
| 633 |
-
# update if exists
|
| 634 |
updated = False
|
| 635 |
for r in field_rows:
|
| 636 |
if str(r.get("field","")).strip().lower() == field_name.lower():
|
|
@@ -647,27 +645,14 @@ def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instruct
|
|
| 647 |
spec = build_spec_from_field_df(df)
|
| 648 |
return field_rows, df, spec, ("Updated field." if updated else "Added field.")
|
| 649 |
|
| 650 |
-
def fields_remove(field_to_remove: str, field_rows: List[Dict[str, Any]]):
|
| 651 |
-
key = (field_to_remove or "").strip().lower()
|
| 652 |
-
if not key:
|
| 653 |
-
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 654 |
-
return field_rows, df, build_spec_from_field_df(df), "Pick a field to remove."
|
| 655 |
-
field_rows = [r for r in field_rows if str(r.get("field","")).strip().lower() != key]
|
| 656 |
-
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 657 |
-
spec = build_spec_from_field_df(df)
|
| 658 |
-
return field_rows, df, spec, "Removed."
|
| 659 |
|
| 660 |
def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
|
| 661 |
try:
|
| 662 |
-
if isinstance(df_in, pd.DataFrame)
|
| 663 |
-
df = df_in
|
| 664 |
-
else:
|
| 665 |
-
df = pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
|
| 666 |
except Exception:
|
| 667 |
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 668 |
return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
|
| 669 |
|
| 670 |
-
# clean + rebuild list of dicts
|
| 671 |
cleaned = []
|
| 672 |
seen = set()
|
| 673 |
for _, r in df.iterrows():
|
|
@@ -687,34 +672,9 @@ def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
|
|
| 687 |
spec = build_spec_from_field_df(df2)
|
| 688 |
return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
|
| 689 |
|
| 690 |
-
def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
|
| 691 |
-
preset = PRESET_MAP.get(preset_name)
|
| 692 |
-
if not preset:
|
| 693 |
-
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 694 |
-
return field_rows, df, build_spec_from_field_df(df), "Unknown preset."
|
| 695 |
-
|
| 696 |
-
if mode == "Replace":
|
| 697 |
-
new_rows = [dict(r) for r in preset]
|
| 698 |
-
else:
|
| 699 |
-
# Append (update existing fields if same name)
|
| 700 |
-
new_rows = [dict(r) for r in field_rows]
|
| 701 |
-
for p in preset:
|
| 702 |
-
found = False
|
| 703 |
-
for r in new_rows:
|
| 704 |
-
if str(r.get("field","")).strip().lower() == str(p.get("field","")).strip().lower():
|
| 705 |
-
r.update(p)
|
| 706 |
-
found = True
|
| 707 |
-
break
|
| 708 |
-
if not found:
|
| 709 |
-
new_rows.append(dict(p))
|
| 710 |
-
|
| 711 |
-
df = pd.DataFrame(new_rows, columns=["field","type","enum_values","instructions"])
|
| 712 |
-
spec = build_spec_from_field_df(df)
|
| 713 |
-
return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
|
| 714 |
-
|
| 715 |
|
| 716 |
# =============================
|
| 717 |
-
#
|
| 718 |
# =============================
|
| 719 |
def run_extraction(
|
| 720 |
files,
|
|
@@ -757,7 +717,7 @@ def run_extraction(
|
|
| 757 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 758 |
|
| 759 |
if _text_based_pdf_warning(pages):
|
| 760 |
-
|
| 761 |
"_file": filename,
|
| 762 |
"_pages_in_pdf": page_count,
|
| 763 |
"paper_title": "",
|
|
@@ -766,10 +726,10 @@ def run_extraction(
|
|
| 766 |
"risk_summary": "No extractable text found. This app supports text-based PDFs only.",
|
| 767 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 768 |
"evidence": []
|
| 769 |
-
}
|
|
|
|
| 770 |
else:
|
| 771 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 772 |
-
|
| 773 |
queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
|
| 774 |
for k, ins in field_instr.items():
|
| 775 |
queries.append(ins if ins else k)
|
|
@@ -789,7 +749,6 @@ def run_extraction(
|
|
| 789 |
extracted["_pages_in_pdf"] = page_count
|
| 790 |
results.append(extracted)
|
| 791 |
|
| 792 |
-
# flatten to internal records for vertical view + review/export
|
| 793 |
ex = results[-1]
|
| 794 |
row = {
|
| 795 |
"file": filename,
|
|
@@ -819,7 +778,6 @@ def run_extraction(
|
|
| 819 |
default = choices[0] if choices else None
|
| 820 |
vertical = _make_vertical(records, default)
|
| 821 |
evidence = _render_evidence(results, default)
|
| 822 |
-
|
| 823 |
overview = _overview_df_from_records(records)
|
| 824 |
|
| 825 |
status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
|
|
@@ -842,18 +800,17 @@ def run_extraction(
|
|
| 842 |
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
|
| 843 |
return _make_vertical(records, file_name), _render_evidence(details, file_name)
|
| 844 |
|
|
|
|
| 845 |
def toggle_review_mode(is_on: bool):
|
| 846 |
return gr.update(interactive=bool(is_on))
|
| 847 |
|
|
|
|
| 848 |
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 849 |
if not file_name or not records:
|
| 850 |
return pd.DataFrame(), records, "Nothing to save."
|
| 851 |
|
| 852 |
try:
|
| 853 |
-
if isinstance(vertical_df, pd.DataFrame)
|
| 854 |
-
dfv = vertical_df
|
| 855 |
-
else:
|
| 856 |
-
dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 857 |
except Exception:
|
| 858 |
return _overview_df_from_records(records), records, "Could not parse edited vertical table."
|
| 859 |
|
|
@@ -875,6 +832,7 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
|
|
| 875 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 876 |
return _overview_df_from_records(new_records), new_records, msg
|
| 877 |
|
|
|
|
| 878 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
| 879 |
if not records:
|
| 880 |
return None, "No reviewed data to export."
|
|
@@ -885,7 +843,7 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
| 885 |
|
| 886 |
|
| 887 |
# =============================
|
| 888 |
-
# Synthesis
|
| 889 |
# =============================
|
| 890 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 891 |
if extraction_json_file is None:
|
|
@@ -907,15 +865,14 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 907 |
gr.Markdown(
|
| 908 |
"# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
|
| 909 |
"**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
|
| 910 |
-
"
|
| 911 |
-
"Raw JSON/spec are available under **Advanced**."
|
| 912 |
)
|
| 913 |
|
| 914 |
-
#
|
| 915 |
-
state_records = gr.State([])
|
| 916 |
-
state_details = gr.State([])
|
| 917 |
-
vocab_state = gr.State({})
|
| 918 |
-
field_rows_state = gr.State([])
|
| 919 |
|
| 920 |
with gr.Tab("Extract"):
|
| 921 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
|
@@ -933,94 +890,69 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 933 |
# Controlled Vocabulary (guided editor)
|
| 934 |
# -------------------------
|
| 935 |
gr.Markdown("## Controlled Vocabulary (guided editor)")
|
| 936 |
-
|
| 937 |
-
vocab_mode = gr.Radio(
|
| 938 |
-
choices=["Guided", "Advanced (Raw JSON)"],
|
| 939 |
-
value="Guided",
|
| 940 |
-
label="Vocab editor mode"
|
| 941 |
-
)
|
| 942 |
-
|
| 943 |
vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
vocab_search = gr.Textbox(
|
| 947 |
-
label="Search terms",
|
| 948 |
-
placeholder="Type to filter (e.g., 471, AMES, comet)",
|
| 949 |
-
lines=1
|
| 950 |
-
)
|
| 951 |
-
|
| 952 |
with gr.Row():
|
| 953 |
vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
|
| 954 |
vocab_add_btn = gr.Button("Add")
|
| 955 |
-
|
| 956 |
with gr.Row():
|
| 957 |
vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
|
| 958 |
vocab_remove_btn = gr.Button("Remove")
|
| 959 |
-
vocab_apply_btn = gr.Button("Apply
|
| 960 |
vocab_reset_btn = gr.Button("Reset vocab to defaults")
|
| 961 |
-
|
| 962 |
-
# IMPORTANT: define vocab_terms_df BEFORE using it in any event wiring
|
| 963 |
vocab_terms_df = gr.Dataframe(
|
| 964 |
headers=["term"],
|
| 965 |
label="Terms (full list; edit directly)",
|
| 966 |
interactive=True,
|
| 967 |
wrap=True
|
| 968 |
)
|
| 969 |
-
|
| 970 |
-
# NEW: filtered preview (read-only)
|
| 971 |
vocab_terms_filtered = gr.Dataframe(
|
| 972 |
headers=["term"],
|
| 973 |
label="Filtered preview (read-only)",
|
| 974 |
interactive=False,
|
| 975 |
wrap=True
|
| 976 |
)
|
| 977 |
-
|
| 978 |
vocab_status = gr.Textbox(label="Vocab status", interactive=False)
|
| 979 |
-
|
| 980 |
with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
|
| 981 |
vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
|
| 982 |
-
|
| 983 |
-
#
|
| 984 |
-
# Filtering helper + event
|
| 985 |
-
# -------------------------
|
| 986 |
-
def vocab_filter_preview(terms_df, search):
|
| 987 |
-
try:
|
| 988 |
-
df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
|
| 989 |
-
except Exception:
|
| 990 |
-
df = pd.DataFrame(columns=["term"])
|
| 991 |
-
return _filter_terms_df(df, search)
|
| 992 |
-
|
| 993 |
-
# Wire events AFTER components exist
|
| 994 |
-
vocab_category.change(
|
| 995 |
-
fn=vocab_load_category,
|
| 996 |
-
inputs=[vocab_state, vocab_category, vocab_search],
|
| 997 |
-
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 998 |
-
)
|
| 999 |
-
|
| 1000 |
vocab_search.change(
|
| 1001 |
fn=vocab_filter_preview,
|
| 1002 |
inputs=[vocab_terms_df, vocab_search],
|
| 1003 |
outputs=[vocab_terms_filtered]
|
| 1004 |
)
|
| 1005 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
vocab_add_btn.click(
|
| 1007 |
fn=vocab_add_term,
|
| 1008 |
inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
|
| 1009 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
|
| 1010 |
)
|
| 1011 |
-
|
| 1012 |
vocab_remove_btn.click(
|
| 1013 |
fn=vocab_remove_term,
|
| 1014 |
inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
|
| 1015 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
|
| 1016 |
)
|
| 1017 |
-
|
| 1018 |
vocab_apply_btn.click(
|
| 1019 |
fn=vocab_apply_df,
|
| 1020 |
inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
|
| 1021 |
outputs=[vocab_json, vocab_terms_filtered, vocab_status]
|
| 1022 |
)
|
| 1023 |
-
|
| 1024 |
vocab_reset_btn.click(
|
| 1025 |
fn=vocab_reset_defaults,
|
| 1026 |
inputs=None,
|
|
@@ -1031,8 +963,11 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1031 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 1032 |
)
|
| 1033 |
|
| 1034 |
-
|
|
|
|
|
|
|
| 1035 |
gr.Markdown("## Extraction Spec (Field Builder)")
|
|
|
|
| 1036 |
with gr.Row():
|
| 1037 |
preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
|
| 1038 |
preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
|
|
@@ -1041,30 +976,48 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1041 |
with gr.Row():
|
| 1042 |
field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
|
| 1043 |
field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
|
| 1044 |
-
enum_values_in = gr.Textbox(label="Enum values (comma-separated;
|
| 1045 |
instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
|
| 1046 |
|
| 1047 |
-
|
| 1048 |
-
add_update_field_btn = gr.Button("Add/Update field")
|
| 1049 |
-
remove_field_name = gr.Dropdown(label="Remove field", choices=[], value=None)
|
| 1050 |
-
remove_field_btn = gr.Button("Remove")
|
| 1051 |
|
| 1052 |
fields_df = gr.Dataframe(
|
| 1053 |
-
label="Fields (edit
|
| 1054 |
headers=["field","type","enum_values","instructions"],
|
| 1055 |
interactive=True,
|
| 1056 |
wrap=True
|
| 1057 |
)
|
|
|
|
| 1058 |
fields_apply_btn = gr.Button("Apply builder table")
|
| 1059 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1060 |
|
| 1061 |
with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
|
| 1062 |
field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
|
| 1063 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
extract_btn = gr.Button("Run Extraction (Grounded)")
|
| 1065 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1066 |
|
| 1067 |
-
# Replace wide table with a compact overview (not duplicate)
|
| 1068 |
overview_df = gr.Dataframe(
|
| 1069 |
label="Batch Overview (compact)",
|
| 1070 |
interactive=False,
|
|
@@ -1097,111 +1050,18 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1097 |
evidence_md = gr.Markdown()
|
| 1098 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1099 |
|
| 1100 |
-
# -------------------------
|
| 1101 |
-
# INIT vocab + fields on load (via a button-less trick: use .load)
|
| 1102 |
-
# -------------------------
|
| 1103 |
-
def _init_all():
|
| 1104 |
-
v, keys, k0, df_terms, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1105 |
-
frows, fdf, fspec, fmsg = fields_init_state()
|
| 1106 |
-
remove_choices = [r["field"] for r in frows]
|
| 1107 |
-
return (
|
| 1108 |
-
v, gr.update(choices=keys, value=k0), df_terms, vjson, vmsg,
|
| 1109 |
-
frows, fdf, fspec, fmsg, gr.update(choices=remove_choices, value=(remove_choices[0] if remove_choices else None))
|
| 1110 |
-
)
|
| 1111 |
-
|
| 1112 |
-
demo.load(
|
| 1113 |
-
_init_all,
|
| 1114 |
-
inputs=None,
|
| 1115 |
-
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status,
|
| 1116 |
-
field_rows_state, fields_df, field_spec, fields_status, remove_field_name]
|
| 1117 |
-
)
|
| 1118 |
-
|
| 1119 |
-
# Vocab events
|
| 1120 |
-
vocab_category.change(
|
| 1121 |
-
fn=vocab_load_category,
|
| 1122 |
-
inputs=[vocab_state, vocab_category, vocab_search],
|
| 1123 |
-
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 1124 |
-
)
|
| 1125 |
-
vocab_add_btn.click(
|
| 1126 |
-
fn=vocab_add_term,
|
| 1127 |
-
inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
|
| 1128 |
-
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
|
| 1129 |
-
)
|
| 1130 |
-
|
| 1131 |
-
vocab_remove_btn.click(
|
| 1132 |
-
fn=vocab_remove_term,
|
| 1133 |
-
inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
|
| 1134 |
-
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
|
| 1135 |
-
)
|
| 1136 |
-
|
| 1137 |
-
vocab_apply_btn.click(
|
| 1138 |
-
fn=vocab_apply_df,
|
| 1139 |
-
inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
|
| 1140 |
-
outputs=[vocab_json, vocab_terms_filtered, vocab_status]
|
| 1141 |
-
)
|
| 1142 |
-
|
| 1143 |
-
vocab_reset_btn.click(
|
| 1144 |
-
fn=vocab_reset_defaults,
|
| 1145 |
-
inputs=None,
|
| 1146 |
-
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
|
| 1147 |
-
)
|
| 1148 |
-
|
| 1149 |
-
# Field builder events
|
| 1150 |
-
preset_btn.click(
|
| 1151 |
-
fn=fields_load_preset,
|
| 1152 |
-
inputs=[preset_name, preset_mode, field_rows_state],
|
| 1153 |
-
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1154 |
-
).then(
|
| 1155 |
-
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1156 |
-
inputs=[field_rows_state],
|
| 1157 |
-
outputs=[remove_field_name]
|
| 1158 |
-
)
|
| 1159 |
-
|
| 1160 |
-
add_update_field_btn.click(
|
| 1161 |
-
fn=fields_add_or_update,
|
| 1162 |
-
inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
|
| 1163 |
-
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1164 |
-
).then(
|
| 1165 |
-
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1166 |
-
inputs=[field_rows_state],
|
| 1167 |
-
outputs=[remove_field_name]
|
| 1168 |
-
)
|
| 1169 |
-
|
| 1170 |
-
remove_field_btn.click(
|
| 1171 |
-
fn=fields_remove,
|
| 1172 |
-
inputs=[remove_field_name, field_rows_state],
|
| 1173 |
-
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1174 |
-
).then(
|
| 1175 |
-
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1176 |
-
inputs=[field_rows_state],
|
| 1177 |
-
outputs=[remove_field_name]
|
| 1178 |
-
)
|
| 1179 |
-
|
| 1180 |
-
fields_apply_btn.click(
|
| 1181 |
-
fn=fields_apply_df,
|
| 1182 |
-
inputs=[field_rows_state, fields_df],
|
| 1183 |
-
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1184 |
-
).then(
|
| 1185 |
-
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1186 |
-
inputs=[field_rows_state],
|
| 1187 |
-
outputs=[remove_field_name]
|
| 1188 |
-
)
|
| 1189 |
-
|
| 1190 |
-
# Extraction
|
| 1191 |
extract_btn.click(
|
| 1192 |
fn=run_extraction,
|
| 1193 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 1194 |
outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1195 |
)
|
| 1196 |
|
| 1197 |
-
# Vertical view selection
|
| 1198 |
record_pick.change(
|
| 1199 |
fn=on_pick,
|
| 1200 |
inputs=[record_pick, state_records, state_details],
|
| 1201 |
outputs=[vertical_view, evidence_md]
|
| 1202 |
)
|
| 1203 |
|
| 1204 |
-
# Review mode
|
| 1205 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
| 1206 |
|
| 1207 |
save_btn.click(
|
|
@@ -1216,6 +1076,43 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1216 |
outputs=[reviewed_csv, review_status]
|
| 1217 |
)
|
| 1218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1219 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1220 |
gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
|
| 1221 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
|
|
|
| 57 |
"genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
|
| 58 |
}"""
|
| 59 |
|
| 60 |
+
# (Used only as a fallback / advanced preview)
|
| 61 |
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 62 |
# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 63 |
|
|
|
|
| 107 |
{"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
|
| 108 |
{"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
|
| 109 |
{"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
|
| 110 |
+
{"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
|
| 111 |
{"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
|
| 112 |
{"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
|
| 113 |
]
|
|
|
|
| 410 |
|
| 411 |
|
| 412 |
# =============================
|
| 413 |
+
# UI helpers: vertical view + evidence + overview
|
| 414 |
# =============================
|
| 415 |
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 416 |
if not records or not file_name:
|
|
|
|
| 438 |
quote = quote[:280] + "…"
|
| 439 |
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 440 |
header = "### Evidence (grounding)\n"
|
| 441 |
+
return header + ("\n".join(lines) if lines else "- (no evidence returned)")
|
|
|
|
|
|
|
| 442 |
|
| 443 |
|
| 444 |
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
|
|
|
| 446 |
return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
|
| 447 |
df = pd.DataFrame(records)
|
| 448 |
cols = ["file","paper_title","risk_stance","risk_confidence"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
cols = [c for c in cols if c in df.columns]
|
| 450 |
return df[cols].copy() if cols else df.head(50)
|
| 451 |
|
| 452 |
+
|
| 453 |
+
# =============================
|
| 454 |
+
# Controlled vocab guided editor (lists only) + SEARCH FILTER
|
| 455 |
+
# =============================
|
| 456 |
def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 457 |
if df is None or df.empty:
|
| 458 |
return pd.DataFrame(columns=["term"])
|
|
|
|
| 460 |
if not q:
|
| 461 |
return df[["term"]].copy()
|
| 462 |
mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
|
| 463 |
+
return df.loc[mask, ["term"]].copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
def vocab_init_state(vocab_json: str):
|
| 467 |
try:
|
|
|
|
| 472 |
list_keys = sorted([k for k, v in vocab.items() if isinstance(v, list)])
|
| 473 |
default_key = list_keys[0] if list_keys else None
|
| 474 |
terms = vocab.get(default_key, []) if default_key else []
|
| 475 |
+
full_df = pd.DataFrame({"term": terms})
|
| 476 |
+
return vocab, list_keys, default_key, full_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
|
| 477 |
|
| 478 |
|
| 479 |
def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
|
|
|
|
| 522 |
return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."
|
| 523 |
|
| 524 |
try:
|
| 525 |
+
df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
|
|
|
|
|
|
|
|
|
|
| 526 |
except Exception:
|
| 527 |
+
return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Could not parse terms table."
|
|
|
|
| 528 |
|
| 529 |
terms = []
|
| 530 |
for t in df.get("term", []).tolist():
|
|
|
|
| 541 |
def vocab_reset_defaults():
|
| 542 |
return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 543 |
|
| 544 |
+
|
| 545 |
def vocab_filter_preview(terms_df, search):
|
| 546 |
try:
|
| 547 |
df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
|
|
|
|
| 549 |
df = pd.DataFrame(columns=["term"])
|
| 550 |
return _filter_terms_df(df, search)
|
| 551 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
|
| 553 |
# =============================
|
| 554 |
# Field builder (type dropdown + presets)
|
| 555 |
# =============================
|
| 556 |
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 557 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
def build_spec_from_field_df(df: pd.DataFrame) -> str:
|
| 560 |
lines = [
|
|
|
|
| 571 |
if not field or not ftype:
|
| 572 |
continue
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
if ftype == "enum":
|
| 575 |
vals = [v.strip() for v in enums.split(",") if v.strip()]
|
| 576 |
type_str = f"enum[{','.join(vals)}]" if vals else "str"
|
|
|
|
| 584 |
|
| 585 |
return "\n".join(lines).strip() + "\n"
|
| 586 |
|
| 587 |
+
|
| 588 |
+
def fields_init_state():
    """Build the initial field-builder state from all bundled presets.

    Returns:
        tuple: ``(rows, df, spec, status)`` where ``rows`` is a list of
        field dicts, ``df`` is the editable builder DataFrame, ``spec`` is
        the generated extraction-spec text, and ``status`` is a UI message.
    """
    # Copy each preset row so later in-place edits never mutate the
    # module-level preset constants.
    fields = [dict(row) for row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD)]
    df = pd.DataFrame(fields, columns=["field", "type", "enum_values", "instructions"])
    spec = build_spec_from_field_df(df)
    return fields, df, spec, "✅ Field builder loaded."
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
    """Load a named preset into the field builder.

    Args:
        preset_name: Key into ``PRESET_MAP``.
        mode: ``"Replace"`` discards the current rows; any other value
            merges the preset into them (matching rows are updated in
            place, unmatched preset rows are appended).
        field_rows: Current builder rows (list of field dicts).

    Returns:
        tuple: ``(rows, df, spec, status)`` mirroring ``fields_init_state``.
    """
    preset = PRESET_MAP.get(preset_name)
    if not preset:
        df = pd.DataFrame(field_rows, columns=["field", "type", "enum_values", "instructions"])
        return field_rows, df, build_spec_from_field_df(df), "Unknown preset."

    def _key(row: Dict[str, Any]) -> str:
        # Normalized match key: field name, trimmed and case-folded.
        return str(row.get("field", "")).strip().lower()

    if mode == "Replace":
        new_rows = [dict(r) for r in preset]
    else:
        new_rows = [dict(r) for r in field_rows]
        # Index existing rows by normalized field name so the merge is a
        # single pass instead of an O(n*m) nested scan. setdefault keeps
        # the FIRST occurrence, matching the original linear-scan behavior.
        index: Dict[str, Dict[str, Any]] = {}
        for r in new_rows:
            index.setdefault(_key(r), r)
        for p in preset:
            existing = index.get(_key(p))
            if existing is not None:
                existing.update(p)
            else:
                copied = dict(p)
                new_rows.append(copied)
                # Later preset rows with the same name must update this one.
                index.setdefault(_key(copied), copied)

    df = pd.DataFrame(new_rows, columns=["field", "type", "enum_values", "instructions"])
    spec = build_spec_from_field_df(df)
    return new_rows, df, spec, f"✅ Loaded preset: {preset_name} ({mode})."
|
| 620 |
+
|
| 621 |
+
|
| 622 |
def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
|
| 623 |
field_name = (field_name or "").strip()
|
| 624 |
ftype = (ftype or "").strip()
|
|
|
|
| 629 |
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 630 |
return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."
|
| 631 |
|
|
|
|
| 632 |
updated = False
|
| 633 |
for r in field_rows:
|
| 634 |
if str(r.get("field","")).strip().lower() == field_name.lower():
|
|
|
|
| 645 |
spec = build_spec_from_field_df(df)
|
| 646 |
return field_rows, df, spec, ("Updated field." if updated else "Added field.")
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
|
| 649 |
def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
|
| 650 |
try:
|
| 651 |
+
df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=["field","type","enum_values","instructions"])
|
|
|
|
|
|
|
|
|
|
| 652 |
except Exception:
|
| 653 |
df = pd.DataFrame(field_rows, columns=["field","type","enum_values","instructions"])
|
| 654 |
return field_rows, df, build_spec_from_field_df(df), "Could not parse builder table."
|
| 655 |
|
|
|
|
| 656 |
cleaned = []
|
| 657 |
seen = set()
|
| 658 |
for _, r in df.iterrows():
|
|
|
|
| 672 |
spec = build_spec_from_field_df(df2)
|
| 673 |
return cleaned, df2, spec, f"✅ Applied builder table ({len(cleaned)} fields)."
|
| 674 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
|
| 676 |
# =============================
|
| 677 |
+
# Main extraction handler
|
| 678 |
# =============================
|
| 679 |
def run_extraction(
|
| 680 |
files,
|
|
|
|
| 717 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 718 |
|
| 719 |
if _text_based_pdf_warning(pages):
|
| 720 |
+
ex = {
|
| 721 |
"_file": filename,
|
| 722 |
"_pages_in_pdf": page_count,
|
| 723 |
"paper_title": "",
|
|
|
|
| 726 |
"risk_summary": "No extractable text found. This app supports text-based PDFs only.",
|
| 727 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 728 |
"evidence": []
|
| 729 |
+
}
|
| 730 |
+
results.append(ex)
|
| 731 |
else:
|
| 732 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
|
|
|
| 733 |
queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
|
| 734 |
for k, ins in field_instr.items():
|
| 735 |
queries.append(ins if ins else k)
|
|
|
|
| 749 |
extracted["_pages_in_pdf"] = page_count
|
| 750 |
results.append(extracted)
|
| 751 |
|
|
|
|
| 752 |
ex = results[-1]
|
| 753 |
row = {
|
| 754 |
"file": filename,
|
|
|
|
| 778 |
default = choices[0] if choices else None
|
| 779 |
vertical = _make_vertical(records, default)
|
| 780 |
evidence = _render_evidence(results, default)
|
|
|
|
| 781 |
overview = _overview_df_from_records(records)
|
| 782 |
|
| 783 |
status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
|
|
|
|
| 800 |
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the per-record views when the user picks a file.

    Returns the vertical Field/Value DataFrame for the selected record and
    the rendered evidence markdown for the same file.
    """
    return _make_vertical(records, file_name), _render_evidence(details, file_name)
|
| 802 |
|
| 803 |
+
|
| 804 |
def toggle_review_mode(is_on: bool):
    """Make the vertical-view table editable iff review mode is on."""
    return gr.update(interactive=bool(is_on))
|
| 806 |
|
| 807 |
+
|
| 808 |
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 809 |
if not file_name or not records:
|
| 810 |
return pd.DataFrame(), records, "Nothing to save."
|
| 811 |
|
| 812 |
try:
|
| 813 |
+
dfv = vertical_df if isinstance(vertical_df, pd.DataFrame) else pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
|
|
|
|
|
|
|
|
|
| 814 |
except Exception:
|
| 815 |
return _overview_df_from_records(records), records, "Could not parse edited vertical table."
|
| 816 |
|
|
|
|
| 832 |
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 833 |
return _overview_df_from_records(new_records), new_records, msg
|
| 834 |
|
| 835 |
+
|
| 836 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
| 837 |
if not records:
|
| 838 |
return None, "No reviewed data to export."
|
|
|
|
| 843 |
|
| 844 |
|
| 845 |
# =============================
|
| 846 |
+
# Synthesis tab handler
|
| 847 |
# =============================
|
| 848 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 849 |
if extraction_json_file is None:
|
|
|
|
| 865 |
gr.Markdown(
|
| 866 |
"# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
|
| 867 |
"**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
|
| 868 |
+
"UI includes a guided **Controlled Vocab editor** (lists only, with search) and a **Field Builder** (type dropdown + presets)."
|
|
|
|
| 869 |
)
|
| 870 |
|
| 871 |
+
# States
|
| 872 |
+
state_records = gr.State([]) # list[dict]
|
| 873 |
+
state_details = gr.State([]) # list[dict]
|
| 874 |
+
vocab_state = gr.State({}) # dict
|
| 875 |
+
field_rows_state = gr.State([]) # list[dict]
|
| 876 |
|
| 877 |
with gr.Tab("Extract"):
|
| 878 |
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
|
|
|
| 890 |
# Controlled Vocabulary (guided editor)
|
| 891 |
# -------------------------
|
| 892 |
gr.Markdown("## Controlled Vocabulary (guided editor)")
|
| 893 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
|
| 895 |
+
vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
|
| 896 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
with gr.Row():
|
| 898 |
vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
|
| 899 |
vocab_add_btn = gr.Button("Add")
|
|
|
|
| 900 |
with gr.Row():
|
| 901 |
vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
|
| 902 |
vocab_remove_btn = gr.Button("Remove")
|
| 903 |
+
vocab_apply_btn = gr.Button("Apply full list to category")
|
| 904 |
vocab_reset_btn = gr.Button("Reset vocab to defaults")
|
| 905 |
+
|
|
|
|
| 906 |
vocab_terms_df = gr.Dataframe(
|
| 907 |
headers=["term"],
|
| 908 |
label="Terms (full list; edit directly)",
|
| 909 |
interactive=True,
|
| 910 |
wrap=True
|
| 911 |
)
|
| 912 |
+
|
|
|
|
| 913 |
vocab_terms_filtered = gr.Dataframe(
|
| 914 |
headers=["term"],
|
| 915 |
label="Filtered preview (read-only)",
|
| 916 |
interactive=False,
|
| 917 |
wrap=True
|
| 918 |
)
|
| 919 |
+
|
| 920 |
vocab_status = gr.Textbox(label="Vocab status", interactive=False)
|
| 921 |
+
|
| 922 |
with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
|
| 923 |
vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
|
| 924 |
+
|
| 925 |
+
# Filter preview wiring (must be AFTER vocab_terms_df exists)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 926 |
vocab_search.change(
|
| 927 |
fn=vocab_filter_preview,
|
| 928 |
inputs=[vocab_terms_df, vocab_search],
|
| 929 |
outputs=[vocab_terms_filtered]
|
| 930 |
)
|
| 931 |
+
|
| 932 |
+
vocab_category.change(
|
| 933 |
+
fn=vocab_load_category,
|
| 934 |
+
inputs=[vocab_state, vocab_category, vocab_search],
|
| 935 |
+
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 936 |
+
)
|
| 937 |
+
|
| 938 |
vocab_add_btn.click(
|
| 939 |
fn=vocab_add_term,
|
| 940 |
inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
|
| 941 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
|
| 942 |
)
|
| 943 |
+
|
| 944 |
vocab_remove_btn.click(
|
| 945 |
fn=vocab_remove_term,
|
| 946 |
inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
|
| 947 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
|
| 948 |
)
|
| 949 |
+
|
| 950 |
vocab_apply_btn.click(
|
| 951 |
fn=vocab_apply_df,
|
| 952 |
inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
|
| 953 |
outputs=[vocab_json, vocab_terms_filtered, vocab_status]
|
| 954 |
)
|
| 955 |
+
|
| 956 |
vocab_reset_btn.click(
|
| 957 |
fn=vocab_reset_defaults,
|
| 958 |
inputs=None,
|
|
|
|
| 963 |
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 964 |
)
|
| 965 |
|
| 966 |
+
# -------------------------
|
| 967 |
+
# Field Builder
|
| 968 |
+
# -------------------------
|
| 969 |
gr.Markdown("## Extraction Spec (Field Builder)")
|
| 970 |
+
|
| 971 |
with gr.Row():
|
| 972 |
preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
|
| 973 |
preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
|
|
|
|
| 976 |
with gr.Row():
|
| 977 |
field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
|
| 978 |
field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
|
| 979 |
+
enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
|
| 980 |
instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
|
| 981 |
|
| 982 |
+
add_update_field_btn = gr.Button("Add/Update field")
|
|
|
|
|
|
|
|
|
|
| 983 |
|
| 984 |
fields_df = gr.Dataframe(
|
| 985 |
+
label="Fields (edit and click Apply)",
|
| 986 |
headers=["field","type","enum_values","instructions"],
|
| 987 |
interactive=True,
|
| 988 |
wrap=True
|
| 989 |
)
|
| 990 |
+
|
| 991 |
fields_apply_btn = gr.Button("Apply builder table")
|
| 992 |
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 993 |
|
| 994 |
with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
|
| 995 |
field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
|
| 996 |
|
| 997 |
+
preset_btn.click(
|
| 998 |
+
fn=fields_load_preset,
|
| 999 |
+
inputs=[preset_name, preset_mode, field_rows_state],
|
| 1000 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1001 |
+
)
|
| 1002 |
+
|
| 1003 |
+
add_update_field_btn.click(
|
| 1004 |
+
fn=fields_add_or_update,
|
| 1005 |
+
inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
|
| 1006 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1007 |
+
)
|
| 1008 |
+
|
| 1009 |
+
fields_apply_btn.click(
|
| 1010 |
+
fn=fields_apply_df,
|
| 1011 |
+
inputs=[field_rows_state, fields_df],
|
| 1012 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1013 |
+
)
|
| 1014 |
+
|
| 1015 |
+
# -------------------------
|
| 1016 |
+
# Run extraction
|
| 1017 |
+
# -------------------------
|
| 1018 |
extract_btn = gr.Button("Run Extraction (Grounded)")
|
| 1019 |
status = gr.Textbox(label="Status", interactive=False)
|
| 1020 |
|
|
|
|
| 1021 |
overview_df = gr.Dataframe(
|
| 1022 |
label="Batch Overview (compact)",
|
| 1023 |
interactive=False,
|
|
|
|
| 1050 |
evidence_md = gr.Markdown()
|
| 1051 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1052 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1053 |
extract_btn.click(
|
| 1054 |
fn=run_extraction,
|
| 1055 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 1056 |
outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1057 |
)
|
| 1058 |
|
|
|
|
| 1059 |
record_pick.change(
|
| 1060 |
fn=on_pick,
|
| 1061 |
inputs=[record_pick, state_records, state_details],
|
| 1062 |
outputs=[vertical_view, evidence_md]
|
| 1063 |
)
|
| 1064 |
|
|
|
|
| 1065 |
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
| 1066 |
|
| 1067 |
save_btn.click(
|
|
|
|
| 1076 |
outputs=[reviewed_csv, review_status]
|
| 1077 |
)
|
| 1078 |
|
| 1079 |
+
# -------------------------
|
| 1080 |
+
# Initialize vocab + fields on load
|
| 1081 |
+
# -------------------------
|
| 1082 |
+
def _init_all():
    """Populate the vocab editor and field builder with defaults on app load.

    The positional order of the returned tuple must match the ``outputs``
    list wired into ``demo.load`` below.
    """
    v, keys, k0, full_df, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
    filtered_df = _filter_terms_df(full_df, "")  # empty search -> show all terms
    frows, fdf, fspec, fmsg = fields_init_state()
    return (
        v,                                   # vocab_state
        gr.update(choices=keys, value=k0),   # vocab_category dropdown
        full_df,                             # vocab_terms_df
        filtered_df,                         # vocab_terms_filtered
        vjson,                               # vocab_json
        vmsg,                                # vocab_status
        frows,                               # field_rows_state
        fdf,                                 # fields_df
        fspec,                               # field_spec
        fmsg                                 # fields_status
    )
|
| 1098 |
+
|
| 1099 |
+
demo.load(
|
| 1100 |
+
_init_all,
|
| 1101 |
+
inputs=None,
|
| 1102 |
+
outputs=[
|
| 1103 |
+
vocab_state,
|
| 1104 |
+
vocab_category,
|
| 1105 |
+
vocab_terms_df,
|
| 1106 |
+
vocab_terms_filtered,
|
| 1107 |
+
vocab_json,
|
| 1108 |
+
vocab_status,
|
| 1109 |
+
field_rows_state,
|
| 1110 |
+
fields_df,
|
| 1111 |
+
field_spec,
|
| 1112 |
+
fields_status
|
| 1113 |
+
]
|
| 1114 |
+
)
|
| 1115 |
+
|
| 1116 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1117 |
gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
|
| 1118 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|