Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,28 +25,12 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
|
| 25 |
|
| 26 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 27 |
"in_silico_method_enum": [
|
| 28 |
-
"qsar",
|
| 29 |
-
"read_across",
|
| 30 |
-
"molecular_docking",
|
| 31 |
-
"molecular_dynamics",
|
| 32 |
-
"pbpk_pbtK",
|
| 33 |
-
"aop_based",
|
| 34 |
-
"ml_model",
|
| 35 |
-
"other",
|
| 36 |
-
"not_reported"
|
| 37 |
],
|
| 38 |
"nams_method_enum": [
|
| 39 |
-
"high_throughput_screening_hts",
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"omics_metabolomics",
|
| 43 |
-
"organ_on_chip",
|
| 44 |
-
"microphysiological_system_mps",
|
| 45 |
-
"3d_tissue_model",
|
| 46 |
-
"in_chemico_assay",
|
| 47 |
-
"in_silico_as_nams",
|
| 48 |
-
"other",
|
| 49 |
-
"not_reported"
|
| 50 |
],
|
| 51 |
|
| 52 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
|
@@ -70,22 +54,9 @@ DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
|
| 70 |
"not_reported"
|
| 71 |
],
|
| 72 |
|
| 73 |
-
"genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
|
| 74 |
-
|
| 75 |
-
"genotoxicity_result_keywords": {
|
| 76 |
-
"positive": [
|
| 77 |
-
"genotoxic","mutagenic","clastogenic","statistically_significant_increase",
|
| 78 |
-
"significant_increase_in_mutations","induced_dna_damage","dose_dependent_increase"
|
| 79 |
-
],
|
| 80 |
-
"negative": [
|
| 81 |
-
"non_genotoxic","not_genotoxic","not_mutagenic","no_evidence_of_genotoxicity",
|
| 82 |
-
"no_statistically_significant_increase","negative_result"
|
| 83 |
-
],
|
| 84 |
-
"equivocal": ["equivocal","inconclusive"]
|
| 85 |
-
}
|
| 86 |
}"""
|
| 87 |
|
| 88 |
-
|
| 89 |
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 90 |
# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 91 |
|
|
@@ -117,7 +88,7 @@ Genotox_OECD_TG_in_vivo | list[enum[
|
|
| 117 |
not_reported
|
| 118 |
]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
|
| 119 |
|
| 120 |
-
Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results
|
| 121 |
Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
|
| 122 |
|
| 123 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
|
@@ -126,6 +97,39 @@ Conclusion | str | What does the paper conclude about safety/risk?
|
|
| 126 |
"""
|
| 127 |
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# =============================
|
| 131 |
# PDF extraction (text-based PDFs only)
|
|
@@ -183,8 +187,13 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
|
|
| 183 |
return chunks
|
| 184 |
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# =============================
|
| 187 |
-
# Lightweight retrieval (TF-IDF)
|
| 188 |
# =============================
|
| 189 |
def select_relevant_chunks(
|
| 190 |
chunks: List[Dict[str, Any]],
|
|
@@ -230,7 +239,7 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
|
|
| 230 |
|
| 231 |
|
| 232 |
# =============================
|
| 233 |
-
#
|
| 234 |
# =============================
|
| 235 |
def slugify_field(name: str) -> str:
|
| 236 |
name = name.strip()
|
|
@@ -242,7 +251,7 @@ def slugify_field(name: str) -> str:
|
|
| 242 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 243 |
"""
|
| 244 |
spec lines: Field Name | type | instructions
|
| 245 |
-
|
| 246 |
"""
|
| 247 |
props: Dict[str, Any] = {}
|
| 248 |
instr: Dict[str, str] = {}
|
|
@@ -292,18 +301,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
|
| 292 |
|
| 293 |
|
| 294 |
def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
|
| 295 |
-
"""
|
| 296 |
-
IMPORTANT: Structured Outputs (strict=True) requires that for every object:
|
| 297 |
-
required must exist and include every key in properties.
|
| 298 |
-
"""
|
| 299 |
-
risk_enum = vocab.get(
|
| 300 |
-
"risk_stance_enum",
|
| 301 |
-
["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
|
| 302 |
-
)
|
| 303 |
-
|
| 304 |
all_field_keys = list(field_props.keys())
|
| 305 |
|
| 306 |
-
|
| 307 |
"type": "object",
|
| 308 |
"additionalProperties": False,
|
| 309 |
"properties": {
|
|
@@ -331,9 +332,8 @@ def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any])
|
|
| 331 |
}
|
| 332 |
}
|
| 333 |
},
|
| 334 |
-
"required": ["paper_title",
|
| 335 |
}
|
| 336 |
-
return schema
|
| 337 |
|
| 338 |
|
| 339 |
# =============================
|
|
@@ -354,10 +354,7 @@ def openai_structured_extract(
|
|
| 354 |
field_instructions: Dict[str, str],
|
| 355 |
context: str
|
| 356 |
) -> Dict[str, Any]:
|
| 357 |
-
field_instr_lines = []
|
| 358 |
-
for k, v in field_instructions.items():
|
| 359 |
-
field_instr_lines.append(f"- {k}: {v if v else '(no extra instructions)'}")
|
| 360 |
-
|
| 361 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 362 |
|
| 363 |
system_msg = (
|
|
@@ -368,10 +365,8 @@ def openai_structured_extract(
|
|
| 368 |
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 369 |
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 370 |
"5) Prefer controlled vocab terms when applicable.\n"
|
| 371 |
-
"6) For
|
| 372 |
-
"7) For
|
| 373 |
-
"8) For NAMs/in_silico fields, only populate if methods are explicitly described; otherwise not_reported.\n"
|
| 374 |
-
|
| 375 |
)
|
| 376 |
|
| 377 |
user_msg = (
|
|
@@ -409,19 +404,12 @@ def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[
|
|
| 409 |
"Base strictly on the provided extracted JSON (which is evidence-backed).\n"
|
| 410 |
)
|
| 411 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 412 |
-
|
| 413 |
-
resp = client.responses.create(
|
| 414 |
-
model=model,
|
| 415 |
-
input=[
|
| 416 |
-
{"role": "system", "content": system_msg},
|
| 417 |
-
{"role": "user", "content": user_msg}
|
| 418 |
-
],
|
| 419 |
-
)
|
| 420 |
return resp.output_text
|
| 421 |
|
| 422 |
|
| 423 |
# =============================
|
| 424 |
-
#
|
| 425 |
# =============================
|
| 426 |
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 427 |
if not records or not file_name:
|
|
@@ -451,17 +439,282 @@ def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: i
|
|
| 451 |
header = "### Evidence (grounding)\n"
|
| 452 |
if not lines:
|
| 453 |
lines = ["- (no evidence returned)"]
|
| 454 |
-
return header + "\n".join(lines)
|
| 455 |
|
| 456 |
|
| 457 |
-
def
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
|
| 463 |
# =============================
|
| 464 |
-
#
|
| 465 |
# =============================
|
| 466 |
def run_extraction(
|
| 467 |
files,
|
|
@@ -474,26 +727,23 @@ def run_extraction(
|
|
| 474 |
max_context_chars
|
| 475 |
):
|
| 476 |
if not files:
|
| 477 |
-
return
|
| 478 |
|
| 479 |
-
# vocab
|
| 480 |
try:
|
| 481 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 482 |
except Exception as e:
|
| 483 |
-
return
|
| 484 |
|
| 485 |
-
# field spec
|
| 486 |
field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
|
| 487 |
if not field_props:
|
| 488 |
-
return
|
| 489 |
|
| 490 |
schema = build_extraction_schema(field_props, vocab)
|
| 491 |
|
| 492 |
-
# OpenAI
|
| 493 |
try:
|
| 494 |
client = get_openai_client(api_key)
|
| 495 |
except Exception as e:
|
| 496 |
-
return
|
| 497 |
|
| 498 |
results: List[Dict[str, Any]] = []
|
| 499 |
flat_rows: List[Dict[str, Any]] = []
|
|
@@ -506,14 +756,7 @@ def run_extraction(
|
|
| 506 |
|
| 507 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 508 |
|
| 509 |
-
# enforce text-based PDFs note
|
| 510 |
if _text_based_pdf_warning(pages):
|
| 511 |
-
# create an "empty" record with warning
|
| 512 |
-
row = {"file": filename, "paper_title": "", "risk_stance": "insufficient_data", "risk_confidence": 0.0, "risk_summary": "No extractable text found. This app supports text-based PDFs only."}
|
| 513 |
-
for k, sch in field_props.items():
|
| 514 |
-
row[k] = "" if sch.get("type") != "array" else ""
|
| 515 |
-
flat_rows.append(row)
|
| 516 |
-
|
| 517 |
results.append({
|
| 518 |
"_file": filename,
|
| 519 |
"_pages_in_pdf": page_count,
|
|
@@ -524,67 +767,64 @@ def run_extraction(
|
|
| 524 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 525 |
"evidence": []
|
| 526 |
})
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
extracted["_file"] = filename
|
| 549 |
-
extracted["_pages_in_pdf"] = page_count
|
| 550 |
-
results.append(extracted)
|
| 551 |
|
| 552 |
-
# flatten to
|
|
|
|
| 553 |
row = {
|
| 554 |
"file": filename,
|
| 555 |
-
"paper_title":
|
| 556 |
-
"risk_stance":
|
| 557 |
-
"risk_confidence":
|
| 558 |
-
"risk_summary":
|
| 559 |
}
|
| 560 |
-
|
| 561 |
-
ext = extracted.get("extracted") or {}
|
| 562 |
for k in field_props.keys():
|
| 563 |
v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
|
| 564 |
if isinstance(v, list):
|
| 565 |
row[k] = "; ".join([str(x) for x in v])
|
| 566 |
else:
|
| 567 |
row[k] = v
|
| 568 |
-
|
| 569 |
flat_rows.append(row)
|
| 570 |
|
| 571 |
df = pd.DataFrame(flat_rows)
|
|
|
|
| 572 |
|
| 573 |
csv_path = tmpdir / "extraction_table.csv"
|
| 574 |
json_path = tmpdir / "extraction_details.json"
|
| 575 |
df.to_csv(csv_path, index=False)
|
| 576 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 577 |
|
| 578 |
-
records = df.to_dict("records")
|
| 579 |
choices = [r["file"] for r in records if "file" in r]
|
| 580 |
default = choices[0] if choices else None
|
| 581 |
vertical = _make_vertical(records, default)
|
| 582 |
evidence = _render_evidence(results, default)
|
| 583 |
|
| 584 |
-
|
| 585 |
|
|
|
|
| 586 |
return (
|
| 587 |
-
|
| 588 |
str(csv_path),
|
| 589 |
str(json_path),
|
| 590 |
status,
|
|
@@ -602,34 +842,24 @@ def run_extraction(
|
|
| 602 |
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
|
| 603 |
return _make_vertical(records, file_name), _render_evidence(details, file_name)
|
| 604 |
|
| 605 |
-
|
| 606 |
def toggle_review_mode(is_on: bool):
|
| 607 |
-
# make vertical table editable when review mode is on
|
| 608 |
return gr.update(interactive=bool(is_on))
|
| 609 |
|
| 610 |
-
|
| 611 |
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
| 612 |
-
"""
|
| 613 |
-
vertical_df comes from gr.Dataframe: typically list[list] or pandas df-like.
|
| 614 |
-
Expect two columns: Field, Value
|
| 615 |
-
"""
|
| 616 |
if not file_name or not records:
|
| 617 |
-
return
|
| 618 |
|
| 619 |
-
# Convert vertical_df into dict
|
| 620 |
try:
|
| 621 |
if isinstance(vertical_df, pd.DataFrame):
|
| 622 |
dfv = vertical_df
|
| 623 |
else:
|
| 624 |
-
# gradio may pass list-of-lists
|
| 625 |
dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 626 |
except Exception:
|
| 627 |
-
return
|
| 628 |
|
| 629 |
dfv = dfv.dropna(subset=["Field"])
|
| 630 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
| 631 |
|
| 632 |
-
# Update matching record
|
| 633 |
new_records = []
|
| 634 |
updated = False
|
| 635 |
for r in records:
|
|
@@ -642,10 +872,8 @@ def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str
|
|
| 642 |
else:
|
| 643 |
new_records.append(r)
|
| 644 |
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
return df_wide, new_records, msg
|
| 648 |
-
|
| 649 |
|
| 650 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
| 651 |
if not records:
|
|
@@ -657,11 +885,11 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
| 657 |
|
| 658 |
|
| 659 |
# =============================
|
| 660 |
-
# Synthesis
|
| 661 |
# =============================
|
| 662 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 663 |
if extraction_json_file is None:
|
| 664 |
-
return "Upload the extraction_details.json
|
| 665 |
|
| 666 |
try:
|
| 667 |
client = get_openai_client(api_key)
|
|
@@ -675,46 +903,92 @@ def run_synthesis(api_key, model, extraction_json_file):
|
|
| 675 |
# =============================
|
| 676 |
# Gradio UI
|
| 677 |
# =============================
|
| 678 |
-
with gr.Blocks(title="Toxicology PDF → Grounded
|
| 679 |
gr.Markdown(
|
| 680 |
-
"# Toxicology PDF → Grounded
|
| 681 |
-
"**Important:**
|
| 682 |
-
"
|
|
|
|
| 683 |
)
|
| 684 |
|
| 685 |
-
# State
|
| 686 |
-
state_records = gr.State([])
|
| 687 |
-
state_details = gr.State([])
|
|
|
|
|
|
|
| 688 |
|
| 689 |
-
with gr.Tab("Extract
|
| 690 |
-
files = gr.File(label="Upload toxicology
|
| 691 |
|
| 692 |
with gr.Row():
|
| 693 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 694 |
-
model = gr.Dropdown(
|
| 695 |
-
label="Model",
|
| 696 |
-
choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
|
| 697 |
-
value="gpt-4o-2024-08-06"
|
| 698 |
-
)
|
| 699 |
|
| 700 |
with gr.Row():
|
| 701 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 702 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 703 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 704 |
|
| 705 |
-
|
| 706 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
|
| 708 |
extract_btn = gr.Button("Run Extraction (Grounded)")
|
| 709 |
status = gr.Textbox(label="Status", interactive=False)
|
| 710 |
|
| 711 |
-
table
|
| 712 |
-
|
|
|
|
| 713 |
interactive=False,
|
| 714 |
wrap=True,
|
| 715 |
show_row_numbers=True,
|
| 716 |
buttons=["fullscreen", "copy"]
|
| 717 |
)
|
|
|
|
| 718 |
with gr.Row():
|
| 719 |
out_csv = gr.File(label="Download: extraction_table.csv")
|
| 720 |
out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
|
|
@@ -724,7 +998,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
|
|
| 724 |
|
| 725 |
with gr.Row():
|
| 726 |
review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
|
| 727 |
-
save_btn = gr.Button("Save
|
| 728 |
export_btn = gr.Button("Export reviewed CSV")
|
| 729 |
|
| 730 |
review_status = gr.Textbox(label="Review status", interactive=False)
|
|
@@ -737,38 +1011,121 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
|
|
| 737 |
label="Vertical record view (Field → Value)"
|
| 738 |
)
|
| 739 |
evidence_md = gr.Markdown()
|
| 740 |
-
|
| 741 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 742 |
|
| 743 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
extract_btn.click(
|
| 745 |
fn=run_extraction,
|
| 746 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 747 |
-
outputs=[
|
| 748 |
)
|
| 749 |
|
| 750 |
-
#
|
| 751 |
record_pick.change(
|
| 752 |
fn=on_pick,
|
| 753 |
inputs=[record_pick, state_records, state_details],
|
| 754 |
outputs=[vertical_view, evidence_md]
|
| 755 |
)
|
| 756 |
|
| 757 |
-
#
|
| 758 |
-
review_mode.change(
|
| 759 |
-
fn=toggle_review_mode,
|
| 760 |
-
inputs=[review_mode],
|
| 761 |
-
outputs=[vertical_view]
|
| 762 |
-
)
|
| 763 |
|
| 764 |
-
# Save edits back to wide table + state
|
| 765 |
save_btn.click(
|
| 766 |
fn=save_review_changes,
|
| 767 |
inputs=[record_pick, vertical_view, state_records],
|
| 768 |
-
outputs=[
|
| 769 |
)
|
| 770 |
|
| 771 |
-
# Export reviewed CSV
|
| 772 |
export_btn.click(
|
| 773 |
fn=export_reviewed_csv,
|
| 774 |
inputs=[state_records],
|
|
@@ -776,47 +1133,29 @@ with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
|
|
| 776 |
)
|
| 777 |
|
| 778 |
with gr.Tab("Cross-paper Synthesis"):
|
| 779 |
-
gr.Markdown("Upload
|
| 780 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 781 |
-
model2 = gr.Dropdown(
|
| 782 |
-
label="Model",
|
| 783 |
-
choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
|
| 784 |
-
value="gpt-4o-2024-08-06"
|
| 785 |
-
)
|
| 786 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
| 787 |
synth_btn = gr.Button("Synthesize Across Papers")
|
| 788 |
synth_md = gr.Markdown()
|
| 789 |
-
|
| 790 |
-
synth_btn.click(
|
| 791 |
-
fn=run_synthesis,
|
| 792 |
-
inputs=[api_key2, model2, extraction_json_file],
|
| 793 |
-
outputs=[synth_md]
|
| 794 |
-
)
|
| 795 |
|
| 796 |
with gr.Tab("Pending tasks"):
|
| 797 |
gr.Markdown(
|
| 798 |
-
"##
|
| 799 |
-
"
|
| 800 |
-
"- Change schema to
|
| 801 |
-
"
|
| 802 |
-
"
|
| 803 |
-
"
|
| 804 |
-
"-
|
| 805 |
-
"
|
| 806 |
-
"
|
| 807 |
-
"
|
| 808 |
-
"-
|
| 809 |
-
"
|
| 810 |
-
"
|
| 811 |
-
"- Parse dose metrics into `{metric, value, unit, route, duration}`\n"
|
| 812 |
-
"- Normalize units (e.g., mg/kg/day)\n"
|
| 813 |
-
"- Auto-split multi-chemical text into canonical list\n\n"
|
| 814 |
-
"### 5) Multi-document compare mode\n"
|
| 815 |
-
"- Compare by chemical or endpoint\n"
|
| 816 |
-
"- Create a consensus + disagreements table\n\n"
|
| 817 |
-
"### 6) PDF limitations\n"
|
| 818 |
-
"- Current: **text-based PDFs only**\n"
|
| 819 |
-
"- Optional future: OCR for scanned PDFs (adds heavy dependencies)\n"
|
| 820 |
)
|
| 821 |
|
| 822 |
if __name__ == "__main__":
|
|
|
|
| 25 |
|
| 26 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 27 |
"in_silico_method_enum": [
|
| 28 |
+
"qsar","read_across","molecular_docking","molecular_dynamics","pbpk_pbtK","aop_based","ml_model","other","not_reported"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
],
|
| 30 |
"nams_method_enum": [
|
| 31 |
+
"high_throughput_screening_hts","omics_transcriptomics","omics_proteomics","omics_metabolomics",
|
| 32 |
+
"organ_on_chip","microphysiological_system_mps","3d_tissue_model","in_chemico_assay",
|
| 33 |
+
"in_silico_as_nams","other","not_reported"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
],
|
| 35 |
|
| 36 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
|
|
|
| 54 |
"not_reported"
|
| 55 |
],
|
| 56 |
|
| 57 |
+
"genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
}"""
|
| 59 |
|
|
|
|
| 60 |
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 61 |
# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 62 |
|
|
|
|
| 88 |
not_reported
|
| 89 |
]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
|
| 90 |
|
| 91 |
+
Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results. If unclear, not_reported.
|
| 92 |
Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
|
| 93 |
|
| 94 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
|
|
|
| 97 |
"""
|
| 98 |
|
| 99 |
|
| 100 |
+
# =============================
|
| 101 |
+
# Field presets (UI)
|
| 102 |
+
# =============================
|
| 103 |
+
PRESET_CORE = [
|
| 104 |
+
{"field": "Chemical(s)", "type": "list[str]", "enum_values": "", "instructions": "Primary chemical(s) studied; include common name + abbreviation if present."},
|
| 105 |
+
{"field": "CAS_numbers", "type": "list[str]", "enum_values": "", "instructions": "Extract any CAS numbers mentioned."},
|
| 106 |
+
{"field": "Study_type", "type": "enum", "enum_values": "in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other", "instructions": "Choose the best match."},
|
| 107 |
+
{"field": "Exposure_route", "type": "enum", "enum_values": "oral,inhalation,dermal,parenteral,multiple,not_reported", "instructions": "Choose best match."},
|
| 108 |
+
{"field": "Species", "type": "enum", "enum_values": "human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported", "instructions": "Choose best match."},
|
| 109 |
+
{"field": "Dose_metrics", "type": "list[str]", "enum_values": "", "instructions": "Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available."},
|
| 110 |
+
{"field": "Key_findings", "type": "str", "enum_values": "", "instructions": "2-4 bullet-like sentences summarizing the main findings."},
|
| 111 |
+
{"field": "Conclusion", "type": "str", "enum_values": "", "instructions": "What does the paper conclude about safety/risk?"},
|
| 112 |
+
]
|
| 113 |
+
|
| 114 |
+
PRESET_NAMS_INSILICO = [
|
| 115 |
+
{"field": "Approach", "type": "enum", "enum_values": "in_vivo,in_vitro,in_silico,nams,mixed,not_reported", "instructions": "Identify if results are in silico or NAMs; use 'mixed' if multiple."},
|
| 116 |
+
{"field": "In_silico_methods", "type": "list[enum]", "enum_values": "qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported", "instructions": "If in_silico, list methods used (can be multiple)."},
|
| 117 |
+
{"field": "NAMs_methods", "type": "list[enum]", "enum_values": "high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported", "instructions": "If NAMs, list methods used (can be multiple)."},
|
| 118 |
+
]
|
| 119 |
+
|
| 120 |
+
PRESET_GENOTOX_OECD = [
|
| 121 |
+
{"field": "Genotox_OECD_TG_in_vitro", "type": "list[enum]", "enum_values": "OECD_TG_471_Bacterial Reverse mutation test(AMES test),OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),not_reported", "instructions": "If in vitro genotox tests are reported, select TGs. Otherwise not_reported."},
|
| 122 |
+
{"field": "Genotox_OECD_TG_in_vivo", "type": "list[enum]", "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported", "instructions": "If in vivo genotox tests are reported, select TGs. Otherwise not_reported."},
|
| 123 |
+
{"field": "Genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify based on reported results. If unclear, not_reported."},
|
| 124 |
+
{"field": "Genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to the paper’s wording + test context."},
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
PRESET_MAP = {
|
| 128 |
+
"Core (recommended)": PRESET_CORE,
|
| 129 |
+
"NAMs + In Silico": PRESET_NAMS_INSILICO,
|
| 130 |
+
"Genotox (OECD TGs)": PRESET_GENOTOX_OECD,
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
|
| 134 |
# =============================
|
| 135 |
# PDF extraction (text-based PDFs only)
|
|
|
|
| 187 |
return chunks
|
| 188 |
|
| 189 |
|
| 190 |
+
def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
|
| 191 |
+
joined = " ".join([clean_text(t) for _, t in pages if clean_text(t)])
|
| 192 |
+
return len(joined.strip()) < 200
|
| 193 |
+
|
| 194 |
+
|
| 195 |
# =============================
|
| 196 |
+
# Lightweight retrieval (TF-IDF)
|
| 197 |
# =============================
|
| 198 |
def select_relevant_chunks(
|
| 199 |
chunks: List[Dict[str, Any]],
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
# =============================
|
| 242 |
+
# Spec -> JSON schema
|
| 243 |
# =============================
|
| 244 |
def slugify_field(name: str) -> str:
|
| 245 |
name = name.strip()
|
|
|
|
| 251 |
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 252 |
"""
|
| 253 |
spec lines: Field Name | type | instructions
|
| 254 |
+
types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
|
| 255 |
"""
|
| 256 |
props: Dict[str, Any] = {}
|
| 257 |
instr: Dict[str, str] = {}
|
|
|
|
| 301 |
|
| 302 |
|
| 303 |
def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
|
| 304 |
+
risk_enum = vocab.get("risk_stance_enum", ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
all_field_keys = list(field_props.keys())
|
| 306 |
|
| 307 |
+
return {
|
| 308 |
"type": "object",
|
| 309 |
"additionalProperties": False,
|
| 310 |
"properties": {
|
|
|
|
| 332 |
}
|
| 333 |
}
|
| 334 |
},
|
| 335 |
+
"required": ["paper_title","risk_stance","risk_confidence","risk_summary","extracted","evidence"]
|
| 336 |
}
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
# =============================
|
|
|
|
| 354 |
field_instructions: Dict[str, str],
|
| 355 |
context: str
|
| 356 |
) -> Dict[str, Any]:
|
| 357 |
+
field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
|
|
|
|
|
|
|
|
|
|
| 358 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 359 |
|
| 360 |
system_msg = (
|
|
|
|
| 365 |
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 366 |
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 367 |
"5) Prefer controlled vocab terms when applicable.\n"
|
| 368 |
+
"6) For OECD TG fields, only populate if explicitly stated or clearly described; otherwise use not_reported.\n"
|
| 369 |
+
"7) For NAMs/in_silico fields, only populate if explicitly described; otherwise not_reported.\n"
|
|
|
|
|
|
|
| 370 |
)
|
| 371 |
|
| 372 |
user_msg = (
|
|
|
|
| 404 |
"Base strictly on the provided extracted JSON (which is evidence-backed).\n"
|
| 405 |
)
|
| 406 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 407 |
+
resp = client.responses.create(model=model, input=[{"role":"system","content":system_msg},{"role":"user","content":user_msg}])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
return resp.output_text
|
| 409 |
|
| 410 |
|
| 411 |
# =============================
|
| 412 |
+
# UI helpers: vertical view + evidence
|
| 413 |
# =============================
|
| 414 |
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 415 |
if not records or not file_name:
|
|
|
|
| 439 |
header = "### Evidence (grounding)\n"
|
| 440 |
if not lines:
|
| 441 |
lines = ["- (no evidence returned)"]
|
| 442 |
+
return header + "\n".join(lines)
|
| 443 |
|
| 444 |
|
| 445 |
+
def _overview_df_from_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
|
| 446 |
+
if not records:
|
| 447 |
+
return pd.DataFrame(columns=["file","paper_title","risk_stance","risk_confidence"])
|
| 448 |
+
df = pd.DataFrame(records)
|
| 449 |
+
cols = ["file","paper_title","risk_stance","risk_confidence"]
|
| 450 |
+
# Include chemicals if present
|
| 451 |
+
for c in ["chemicals", "chemical_s", "chemical", "chemical_s_"]:
|
| 452 |
+
if c in df.columns and c not in cols:
|
| 453 |
+
cols.append(c)
|
| 454 |
+
break
|
| 455 |
+
cols = [c for c in cols if c in df.columns]
|
| 456 |
+
return df[cols].copy() if cols else df.head(50)
|
| 457 |
+
|
| 458 |
+
def _filter_terms_df(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 459 |
+
if df is None or df.empty:
|
| 460 |
+
return pd.DataFrame(columns=["term"])
|
| 461 |
+
q = (query or "").strip().lower()
|
| 462 |
+
if not q:
|
| 463 |
+
return df[["term"]].copy()
|
| 464 |
+
mask = df["term"].astype(str).str.lower().str.contains(q, na=False)
|
| 465 |
+
out = df.loc[mask, ["term"]].copy()
|
| 466 |
+
return out
|
| 467 |
+
|
| 468 |
+
# =============================
|
| 469 |
+
# Controlled vocab guided editor (lists only)
|
| 470 |
+
# =============================
|
| 471 |
+
vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
|
| 472 |
+
|
| 473 |
+
vocab_terms_filtered = gr.Dataframe(
|
| 474 |
+
headers=["term"],
|
| 475 |
+
label="Filtered preview (read-only)",
|
| 476 |
+
interactive=False,
|
| 477 |
+
wrap=True
|
| 478 |
+
)
|
| 479 |
+
|
| 480 |
+
def vocab_init_state(vocab_json: str):
|
| 481 |
+
try:
|
| 482 |
+
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 483 |
+
except Exception:
|
| 484 |
+
vocab = json.loads(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 485 |
+
|
| 486 |
+
list_keys = sorted([k for k, v in vocab.items() if isinstance(v, list)])
|
| 487 |
+
default_key = list_keys[0] if list_keys else None
|
| 488 |
+
terms = vocab.get(default_key, []) if default_key else []
|
| 489 |
+
terms_df = pd.DataFrame({"term": terms})
|
| 490 |
+
return vocab, list_keys, default_key, terms_df, json.dumps(vocab, indent=2), "✅ Vocab loaded."
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
def vocab_load_category(vocab_state: Dict[str, Any], category: str, search: str):
    """Load a list category from the vocab dict into the editor tables.

    Returns (full terms df, search-filtered df, status message). Non-list
    or unknown categories yield empty tables with an explanatory message.
    """
    no_terms = pd.DataFrame(columns=["term"])
    if not category or category not in vocab_state:
        return no_terms, no_terms, "Select a category."
    entries = vocab_state.get(category, [])
    if not isinstance(entries, list):
        return no_terms, no_terms, "This category is not a list."
    full = pd.DataFrame({"term": entries})
    return full, _filter_terms_df(full, search), f"Editing: {category}"
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def vocab_add_term(vocab_state: Dict[str, Any], category: str, term: str, search: str):
    """Append *term* to the selected list category if not already present.

    Returns (full terms df, filtered df, cleared textbox value, status
    message). Invalid input leaves the tables untouched via gr.update().
    """
    term = (term or "").strip()
    if not term:
        return gr.update(), gr.update(), "", "Enter a term to add."
    if not category or category not in vocab_state or not isinstance(vocab_state.get(category), list):
        return gr.update(), gr.update(), "", "Pick a list category first."

    # Bug fix: previously the status always said "Added" even when the term
    # was a duplicate and nothing changed — report duplicates honestly.
    if term in vocab_state[category]:
        msg = f"Already present: {term}"
    else:
        vocab_state[category].append(term)
        msg = f"Added: {term}"

    full = pd.DataFrame({"term": vocab_state[category]})
    filtered = _filter_terms_df(full, search)
    return full, filtered, "", msg
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def vocab_remove_term(vocab_state: Dict[str, Any], category: str, term: str, search: str):
    """Drop every occurrence of *term* from the selected list category.

    Returns (full terms df, filtered df, cleared textbox value, status
    message). Invalid input leaves the tables untouched via gr.update().
    """
    term = (term or "").strip()
    if not term:
        return gr.update(), gr.update(), "", "Enter a term to remove."
    invalid = (
        not category
        or category not in vocab_state
        or not isinstance(vocab_state.get(category), list)
    )
    if invalid:
        return gr.update(), gr.update(), "", "Pick a list category first."

    kept = [entry for entry in vocab_state[category] if entry != term]
    vocab_state[category] = kept
    full = pd.DataFrame({"term": kept})
    return full, _filter_terms_df(full, search), "", f"Removed: {term}"
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def vocab_apply_df(vocab_state: Dict[str, Any], category: str, terms_df: Any, search: str):
    """Overwrite a list category with the (possibly hand-edited) terms table.

    Blank rows are discarded and duplicates keep their first occurrence.
    Returns (raw vocab JSON, filtered preview df, status message).
    """
    if not category or category not in vocab_state or not isinstance(vocab_state.get(category), list):
        return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Pick a list category first."

    try:
        df = terms_df if isinstance(terms_df, pd.DataFrame) else pd.DataFrame(terms_df, columns=["term"])
    except Exception:
        return json.dumps(vocab_state, indent=2), pd.DataFrame(columns=["term"]), "Could not parse terms table."

    # De-duplicate while preserving first-seen order; blanks are dropped.
    cleaned: List[str] = []
    for raw in df.get("term", []).tolist():
        text = (str(raw) if raw is not None else "").strip()
        if text and text not in cleaned:
            cleaned.append(text)

    vocab_state[category] = cleaned
    preview = _filter_terms_df(pd.DataFrame({"term": cleaned}), search)
    return json.dumps(vocab_state, indent=2), preview, f"✅ Applied {len(cleaned)} terms to {category}."
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def vocab_reset_defaults():
    """Re-initialize the vocab editor from DEFAULT_CONTROLLED_VOCAB_JSON.

    Returns the same tuple as vocab_init_state (vocab dict, category keys,
    default key, terms df, raw JSON, status message).
    """
    return vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 561 |
+
|
| 562 |
+
def vocab_filter_preview(terms_df, search):
    """Recompute the read-only filtered preview from the editable terms table."""
    if isinstance(terms_df, pd.DataFrame):
        df = terms_df
    else:
        try:
            df = pd.DataFrame(terms_df, columns=["term"])
        except Exception:
            df = pd.DataFrame(columns=["term"])
    return _filter_terms_df(df, search)
|
| 568 |
+
|
| 569 |
+
# Keep the read-only preview in sync with the search box.
# NOTE(review): `vocab_terms_df` is created much later in the file (inside
# the Blocks layout, new-file line ~945); referencing it here looks like it
# would raise NameError at import time — confirm the ordering, or move this
# wiring after the component is defined.
vocab_search.change(
    fn=vocab_filter_preview,
    inputs=[vocab_terms_df, vocab_search],
    outputs=[vocab_terms_filtered]
)
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
# =============================
# Field builder (type dropdown + presets)
# =============================
# Types offered in the builder UI. The pseudo-types "enum"/"list[enum]" are
# expanded into enum[a,b,c] / list[enum[a,b,c]] when the raw spec text is
# generated from the builder table.
TYPE_CHOICES = ["str", "num", "bool", "list[str]", "list[num]", "enum", "list[enum]"]
|
| 580 |
+
|
| 581 |
+
def fields_init_state():
    """Seed the field builder with the default presets (Core + NAMs/in-silico + Genotox OECD).

    Returns (row dicts, builder DataFrame, generated raw spec, status message).
    """
    # Copy each preset row so later edits never mutate the preset constants.
    rows = [dict(preset_row) for preset_row in (PRESET_CORE + PRESET_NAMS_INSILICO + PRESET_GENOTOX_OECD)]
    df = pd.DataFrame(rows, columns=["field", "type", "enum_values", "instructions"])
    return rows, df, build_spec_from_field_df(df), "✅ Field builder loaded."
|
| 589 |
+
|
| 590 |
+
def build_spec_from_field_df(df: pd.DataFrame) -> str:
    """Render the builder table as the pipe-delimited extraction-spec text.

    Each row becomes ``Field | type | instructions``. The UI pseudo-types
    "enum"/"list[enum]" are expanded using the row's comma-separated
    enum_values; with no values they degrade to "str"/"list[str]". Rows
    missing a field name or type are skipped.
    """
    lines = [
        "# One field per line: Field Name | type | instructions",
        "# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]",
        ""
    ]
    for _, r in df.iterrows():
        field = str(r.get("field", "")).strip()
        ftype = str(r.get("type", "")).strip()
        enums = str(r.get("enum_values", "")).strip()
        instr = str(r.get("instructions", "")).strip()

        if not field or not ftype:
            continue  # incomplete row — nothing useful to emit

        # (Removed a dead `if ftype not in TYPE_CHOICES: pass` branch — it
        # was a no-op and only coupled this function to the UI constant.)

        # Expand the UI pseudo-types into the concrete spec syntax.
        if ftype == "enum":
            vals = [v.strip() for v in enums.split(",") if v.strip()]
            type_str = f"enum[{','.join(vals)}]" if vals else "str"
        elif ftype == "list[enum]":
            vals = [v.strip() for v in enums.split(",") if v.strip()]
            type_str = f"list[enum[{','.join(vals)}]]" if vals else "list[str]"
        else:
            # Unknown types pass through verbatim; downstream spec parsing
            # is responsible for rejecting invalid ones.
            type_str = ftype

        lines.append(f"{field} | {type_str} | {instr}")

    return "\n".join(lines).strip() + "\n"
|
| 622 |
+
|
| 623 |
+
def fields_add_or_update(field_name: str, ftype: str, enum_values: str, instructions: str, field_rows: List[Dict[str, Any]]):
    """Insert a new builder row, or overwrite the row with the same name
    (case-insensitive).

    Returns (rows, builder df, regenerated spec, status message).
    """
    field_name = (field_name or "").strip()
    ftype = (ftype or "").strip()
    enum_values = (enum_values or "").strip()
    instructions = (instructions or "").strip()

    columns = ["field", "type", "enum_values", "instructions"]
    if not field_name or not ftype:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Field name and type are required."

    target = field_name.lower()
    existing = next(
        (row for row in field_rows if str(row.get("field", "")).strip().lower() == target),
        None,
    )
    if existing is not None:
        existing["type"] = ftype
        existing["enum_values"] = enum_values
        existing["instructions"] = instructions
    else:
        field_rows.append({
            "field": field_name,
            "type": ftype,
            "enum_values": enum_values,
            "instructions": instructions,
        })

    df = pd.DataFrame(field_rows, columns=columns)
    msg = "Updated field." if existing is not None else "Added field."
    return field_rows, df, build_spec_from_field_df(df), msg
|
| 649 |
+
|
| 650 |
+
def fields_remove(field_to_remove: str, field_rows: List[Dict[str, Any]]):
    """Delete the named field (case-insensitive match) from the builder rows.

    Returns (rows, builder df, regenerated spec, status message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    target = (field_to_remove or "").strip().lower()
    if not target:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Pick a field to remove."
    remaining = [row for row in field_rows if str(row.get("field", "")).strip().lower() != target]
    df = pd.DataFrame(remaining, columns=columns)
    return remaining, df, build_spec_from_field_df(df), "Removed."
|
| 659 |
+
|
| 660 |
+
def fields_apply_df(field_rows: List[Dict[str, Any]], df_in: Any):
    """Replace the builder rows with the (possibly hand-edited) table.

    Rows missing a name or type are dropped; duplicate names
    (case-insensitive) keep their first occurrence.
    Returns (rows, builder df, regenerated spec, status message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    try:
        df = df_in if isinstance(df_in, pd.DataFrame) else pd.DataFrame(df_in, columns=columns)
    except Exception:
        fallback = pd.DataFrame(field_rows, columns=columns)
        return field_rows, fallback, build_spec_from_field_df(fallback), "Could not parse builder table."

    cleaned: List[Dict[str, Any]] = []
    seen_names = set()
    for _, row in df.iterrows():
        field = str(row.get("field", "")).strip()
        ftype = str(row.get("type", "")).strip()
        if not field or not ftype:
            continue
        name_key = field.lower()
        if name_key in seen_names:
            continue
        seen_names.add(name_key)
        cleaned.append({
            "field": field,
            "type": ftype,
            "enum_values": str(row.get("enum_values", "")).strip(),
            "instructions": str(row.get("instructions", "")).strip(),
        })

    df2 = pd.DataFrame(cleaned, columns=columns)
    return cleaned, df2, build_spec_from_field_df(df2), f"✅ Applied builder table ({len(cleaned)} fields)."
|
| 689 |
+
|
| 690 |
+
def fields_load_preset(preset_name: str, mode: str, field_rows: List[Dict[str, Any]]):
    """Load a named preset into the builder.

    mode "Replace" discards the current rows; any other mode appends,
    overwriting rows whose field name matches (case-insensitive).
    Returns (rows, builder df, regenerated spec, status message).
    """
    columns = ["field", "type", "enum_values", "instructions"]
    preset = PRESET_MAP.get(preset_name)
    if not preset:
        df = pd.DataFrame(field_rows, columns=columns)
        return field_rows, df, build_spec_from_field_df(df), "Unknown preset."

    if mode == "Replace":
        merged = [dict(row) for row in preset]
    else:
        merged = [dict(row) for row in field_rows]
        # Index the FIRST row per name (matches the original first-match scan).
        first_by_name: Dict[str, Dict[str, Any]] = {}
        for row in merged:
            first_by_name.setdefault(str(row.get("field", "")).strip().lower(), row)
        for preset_row in preset:
            key = str(preset_row.get("field", "")).strip().lower()
            if key in first_by_name:
                first_by_name[key].update(preset_row)
            else:
                fresh = dict(preset_row)
                merged.append(fresh)
                first_by_name[key] = fresh

    df = pd.DataFrame(merged, columns=columns)
    return merged, df, build_spec_from_field_df(df), f"✅ Loaded preset: {preset_name} ({mode})."
|
| 714 |
|
| 715 |
|
| 716 |
# =============================
|
| 717 |
+
# Extraction handler
|
| 718 |
# =============================
|
| 719 |
def run_extraction(
|
| 720 |
files,
|
|
|
|
| 727 |
max_context_chars
|
| 728 |
):
|
| 729 |
if not files:
|
| 730 |
+
return pd.DataFrame(), None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 731 |
|
|
|
|
| 732 |
try:
|
| 733 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 734 |
except Exception as e:
|
| 735 |
+
return pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 736 |
|
|
|
|
| 737 |
field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
|
| 738 |
if not field_props:
|
| 739 |
+
return pd.DataFrame(), None, None, "Extraction spec produced no fields.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 740 |
|
| 741 |
schema = build_extraction_schema(field_props, vocab)
|
| 742 |
|
|
|
|
| 743 |
try:
|
| 744 |
client = get_openai_client(api_key)
|
| 745 |
except Exception as e:
|
| 746 |
+
return pd.DataFrame(), None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 747 |
|
| 748 |
results: List[Dict[str, Any]] = []
|
| 749 |
flat_rows: List[Dict[str, Any]] = []
|
|
|
|
| 756 |
|
| 757 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 758 |
|
|
|
|
| 759 |
if _text_based_pdf_warning(pages):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
results.append({
|
| 761 |
"_file": filename,
|
| 762 |
"_pages_in_pdf": page_count,
|
|
|
|
| 767 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 768 |
"evidence": []
|
| 769 |
})
|
| 770 |
+
else:
|
| 771 |
+
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 772 |
+
|
| 773 |
+
queries = ["regulatory acceptability risk hazard concern conclusion adverse effect uncertainty noael loael bmd bmdl"]
|
| 774 |
+
for k, ins in field_instr.items():
|
| 775 |
+
queries.append(ins if ins else k)
|
| 776 |
+
|
| 777 |
+
selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
|
| 778 |
+
context = build_context(selected, max_chars=int(max_context_chars))
|
| 779 |
+
|
| 780 |
+
extracted = openai_structured_extract(
|
| 781 |
+
client=client,
|
| 782 |
+
model=model,
|
| 783 |
+
schema=schema,
|
| 784 |
+
controlled_vocab=vocab,
|
| 785 |
+
field_instructions=field_instr,
|
| 786 |
+
context=context
|
| 787 |
+
)
|
| 788 |
+
extracted["_file"] = filename
|
| 789 |
+
extracted["_pages_in_pdf"] = page_count
|
| 790 |
+
results.append(extracted)
|
|
|
|
|
|
|
|
|
|
| 791 |
|
| 792 |
+
# flatten to internal records for vertical view + review/export
|
| 793 |
+
ex = results[-1]
|
| 794 |
row = {
|
| 795 |
"file": filename,
|
| 796 |
+
"paper_title": ex.get("paper_title",""),
|
| 797 |
+
"risk_stance": ex.get("risk_stance",""),
|
| 798 |
+
"risk_confidence": ex.get("risk_confidence",""),
|
| 799 |
+
"risk_summary": ex.get("risk_summary","")
|
| 800 |
}
|
| 801 |
+
ext = ex.get("extracted") or {}
|
|
|
|
| 802 |
for k in field_props.keys():
|
| 803 |
v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
|
| 804 |
if isinstance(v, list):
|
| 805 |
row[k] = "; ".join([str(x) for x in v])
|
| 806 |
else:
|
| 807 |
row[k] = v
|
|
|
|
| 808 |
flat_rows.append(row)
|
| 809 |
|
| 810 |
df = pd.DataFrame(flat_rows)
|
| 811 |
+
records = df.to_dict("records")
|
| 812 |
|
| 813 |
csv_path = tmpdir / "extraction_table.csv"
|
| 814 |
json_path = tmpdir / "extraction_details.json"
|
| 815 |
df.to_csv(csv_path, index=False)
|
| 816 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 817 |
|
|
|
|
| 818 |
choices = [r["file"] for r in records if "file" in r]
|
| 819 |
default = choices[0] if choices else None
|
| 820 |
vertical = _make_vertical(records, default)
|
| 821 |
evidence = _render_evidence(results, default)
|
| 822 |
|
| 823 |
+
overview = _overview_df_from_records(records)
|
| 824 |
|
| 825 |
+
status = "Done. Use the vertical view + evidence for review. Export reviewed CSV when ready."
|
| 826 |
return (
|
| 827 |
+
overview,
|
| 828 |
str(csv_path),
|
| 829 |
str(json_path),
|
| 830 |
status,
|
|
|
|
| 842 |
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the vertical record view and evidence panel for the picked file."""
    vertical = _make_vertical(records, file_name)
    evidence = _render_evidence(details, file_name)
    return vertical, evidence
|
| 844 |
|
|
|
|
| 845 |
def toggle_review_mode(is_on: bool):
    """Enable/disable editing of the vertical view when review mode toggles."""
    return gr.update(interactive=bool(is_on))
|
| 847 |
|
|
|
|
| 848 |
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
if not file_name or not records:
|
| 850 |
+
return pd.DataFrame(), records, "Nothing to save."
|
| 851 |
|
|
|
|
| 852 |
try:
|
| 853 |
if isinstance(vertical_df, pd.DataFrame):
|
| 854 |
dfv = vertical_df
|
| 855 |
else:
|
|
|
|
| 856 |
dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
|
| 857 |
except Exception:
|
| 858 |
+
return _overview_df_from_records(records), records, "Could not parse edited vertical table."
|
| 859 |
|
| 860 |
dfv = dfv.dropna(subset=["Field"])
|
| 861 |
updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
|
| 862 |
|
|
|
|
| 863 |
new_records = []
|
| 864 |
updated = False
|
| 865 |
for r in records:
|
|
|
|
| 872 |
else:
|
| 873 |
new_records.append(r)
|
| 874 |
|
| 875 |
+
msg = "Saved changes into session data. Export reviewed CSV to download." if updated else "Record not found."
|
| 876 |
+
return _overview_df_from_records(new_records), new_records, msg
|
|
|
|
|
|
|
| 877 |
|
| 878 |
def export_reviewed_csv(records: List[Dict[str, Any]]):
|
| 879 |
if not records:
|
|
|
|
| 885 |
|
| 886 |
|
| 887 |
# =============================
|
| 888 |
+
# Synthesis
|
| 889 |
# =============================
|
| 890 |
def run_synthesis(api_key, model, extraction_json_file):
|
| 891 |
if extraction_json_file is None:
|
| 892 |
+
return "Upload the extraction_details.json from the Extract tab first."
|
| 893 |
|
| 894 |
try:
|
| 895 |
client = get_openai_client(api_key)
|
|
|
|
| 903 |
# =============================
|
| 904 |
# Gradio UI
|
| 905 |
# =============================
|
| 906 |
+
with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
| 907 |
gr.Markdown(
|
| 908 |
+
"# Toxicology PDF → Grounded Extractor (GPT-4o)\n\n"
|
| 909 |
+
"**Important:** Text-based PDFs only (not scanned/image PDFs). If no extractable text is found, the record is marked `insufficient_data`.\n\n"
|
| 910 |
+
"This UI is optimized for non-JSON users: **Controlled vocab editor** + **Field Builder**.\n"
|
| 911 |
+
"Raw JSON/spec are available under **Advanced**."
|
| 912 |
)
|
| 913 |
|
| 914 |
+
# State
|
| 915 |
+
state_records = gr.State([]) # list[dict]
|
| 916 |
+
state_details = gr.State([]) # list[dict]
|
| 917 |
+
vocab_state = gr.State({}) # dict
|
| 918 |
+
field_rows_state = gr.State([]) # list[dict]
|
| 919 |
|
| 920 |
+
with gr.Tab("Extract"):
|
| 921 |
+
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 922 |
|
| 923 |
with gr.Row():
|
| 924 |
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 925 |
+
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 926 |
|
| 927 |
with gr.Row():
|
| 928 |
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 929 |
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 930 |
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 931 |
|
| 932 |
+
gr.Markdown("## Controlled Vocabulary (guided editor)")
|
| 933 |
+
vocab_mode = gr.Radio(choices=["Guided", "Advanced (Raw JSON)"], value="Guided", label="Vocab editor mode")
|
| 934 |
+
|
| 935 |
+
with gr.Row():
|
| 936 |
+
vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
|
| 937 |
+
vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
|
| 938 |
+
vocab_add_btn = gr.Button("Add")
|
| 939 |
+
with gr.Row():
|
| 940 |
+
vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
|
| 941 |
+
vocab_remove_btn = gr.Button("Remove")
|
| 942 |
+
vocab_apply_btn = gr.Button("Apply table changes to category")
|
| 943 |
+
vocab_reset_btn = gr.Button("Reset vocab to defaults")
|
| 944 |
+
|
| 945 |
+
vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (edit directly)", interactive=True, wrap=True)
|
| 946 |
+
vocab_status = gr.Textbox(label="Vocab status", interactive=False)
|
| 947 |
+
|
| 948 |
+
with gr.Accordion("Advanced: Raw vocab JSON (auto-generated)", open=False):
|
| 949 |
+
vocab_json = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
|
| 950 |
+
|
| 951 |
+
gr.Markdown("## Extraction Spec (Field Builder)")
|
| 952 |
+
with gr.Row():
|
| 953 |
+
preset_name = gr.Dropdown(label="Preset", choices=list(PRESET_MAP.keys()), value="Core (recommended)")
|
| 954 |
+
preset_mode = gr.Radio(label="Preset mode", choices=["Replace", "Append"], value="Append")
|
| 955 |
+
preset_btn = gr.Button("Load preset")
|
| 956 |
+
|
| 957 |
+
with gr.Row():
|
| 958 |
+
field_name_in = gr.Textbox(label="Field name", placeholder="e.g., Genotoxicity_result")
|
| 959 |
+
field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
|
| 960 |
+
enum_values_in = gr.Textbox(label="Enum values (comma-separated; used for enum/list[enum])", placeholder="a,b,c", lines=2)
|
| 961 |
+
instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
|
| 962 |
+
|
| 963 |
+
with gr.Row():
|
| 964 |
+
add_update_field_btn = gr.Button("Add/Update field")
|
| 965 |
+
remove_field_name = gr.Dropdown(label="Remove field", choices=[], value=None)
|
| 966 |
+
remove_field_btn = gr.Button("Remove")
|
| 967 |
+
|
| 968 |
+
fields_df = gr.Dataframe(
|
| 969 |
+
label="Fields (edit if needed, then click Apply)",
|
| 970 |
+
headers=["field","type","enum_values","instructions"],
|
| 971 |
+
interactive=True,
|
| 972 |
+
wrap=True
|
| 973 |
+
)
|
| 974 |
+
fields_apply_btn = gr.Button("Apply builder table")
|
| 975 |
+
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 976 |
+
|
| 977 |
+
with gr.Accordion("Advanced: Raw extraction spec (auto-generated)", open=False):
|
| 978 |
+
field_spec = gr.Textbox(label="Extraction spec", lines=12, interactive=False)
|
| 979 |
|
| 980 |
extract_btn = gr.Button("Run Extraction (Grounded)")
|
| 981 |
status = gr.Textbox(label="Status", interactive=False)
|
| 982 |
|
| 983 |
+
# Replace wide table with a compact overview (not duplicate)
|
| 984 |
+
overview_df = gr.Dataframe(
|
| 985 |
+
label="Batch Overview (compact)",
|
| 986 |
interactive=False,
|
| 987 |
wrap=True,
|
| 988 |
show_row_numbers=True,
|
| 989 |
buttons=["fullscreen", "copy"]
|
| 990 |
)
|
| 991 |
+
|
| 992 |
with gr.Row():
|
| 993 |
out_csv = gr.File(label="Download: extraction_table.csv")
|
| 994 |
out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
|
|
|
|
| 998 |
|
| 999 |
with gr.Row():
|
| 1000 |
review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
|
| 1001 |
+
save_btn = gr.Button("Save edits")
|
| 1002 |
export_btn = gr.Button("Export reviewed CSV")
|
| 1003 |
|
| 1004 |
review_status = gr.Textbox(label="Review status", interactive=False)
|
|
|
|
| 1011 |
label="Vertical record view (Field → Value)"
|
| 1012 |
)
|
| 1013 |
evidence_md = gr.Markdown()
|
|
|
|
| 1014 |
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1015 |
|
| 1016 |
+
# -------------------------
|
| 1017 |
+
# INIT vocab + fields on load (via a button-less trick: use .load)
|
| 1018 |
+
# -------------------------
|
| 1019 |
+
def _init_all():
|
| 1020 |
+
v, keys, k0, df_terms, vjson, vmsg = vocab_init_state(DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 1021 |
+
frows, fdf, fspec, fmsg = fields_init_state()
|
| 1022 |
+
remove_choices = [r["field"] for r in frows]
|
| 1023 |
+
return (
|
| 1024 |
+
v, gr.update(choices=keys, value=k0), df_terms, vjson, vmsg,
|
| 1025 |
+
frows, fdf, fspec, fmsg, gr.update(choices=remove_choices, value=(remove_choices[0] if remove_choices else None))
|
| 1026 |
+
)
|
| 1027 |
+
|
| 1028 |
+
demo.load(
|
| 1029 |
+
_init_all,
|
| 1030 |
+
inputs=None,
|
| 1031 |
+
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status,
|
| 1032 |
+
field_rows_state, fields_df, field_spec, fields_status, remove_field_name]
|
| 1033 |
+
)
|
| 1034 |
+
|
| 1035 |
+
# Vocab events
|
| 1036 |
+
vocab_category.change(
|
| 1037 |
+
fn=vocab_load_category,
|
| 1038 |
+
inputs=[vocab_state, vocab_category, vocab_search],
|
| 1039 |
+
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_status]
|
| 1040 |
+
)
|
| 1041 |
+
vocab_add_btn.click(
|
| 1042 |
+
fn=vocab_add_term,
|
| 1043 |
+
inputs=[vocab_state, vocab_category, vocab_term_add, vocab_search],
|
| 1044 |
+
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_add, vocab_status]
|
| 1045 |
+
)
|
| 1046 |
+
|
| 1047 |
+
vocab_remove_btn.click(
|
| 1048 |
+
fn=vocab_remove_term,
|
| 1049 |
+
inputs=[vocab_state, vocab_category, vocab_term_remove, vocab_search],
|
| 1050 |
+
outputs=[vocab_terms_df, vocab_terms_filtered, vocab_term_remove, vocab_status]
|
| 1051 |
+
)
|
| 1052 |
+
|
| 1053 |
+
vocab_apply_btn.click(
|
| 1054 |
+
fn=vocab_apply_df,
|
| 1055 |
+
inputs=[vocab_state, vocab_category, vocab_terms_df, vocab_search],
|
| 1056 |
+
outputs=[vocab_json, vocab_terms_filtered, vocab_status]
|
| 1057 |
+
)
|
| 1058 |
+
|
| 1059 |
+
vocab_reset_btn.click(
|
| 1060 |
+
fn=vocab_reset_defaults,
|
| 1061 |
+
inputs=None,
|
| 1062 |
+
outputs=[vocab_state, vocab_category, vocab_terms_df, vocab_json, vocab_status]
|
| 1063 |
+
)
|
| 1064 |
+
|
| 1065 |
+
# Field builder events
|
| 1066 |
+
preset_btn.click(
|
| 1067 |
+
fn=fields_load_preset,
|
| 1068 |
+
inputs=[preset_name, preset_mode, field_rows_state],
|
| 1069 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1070 |
+
).then(
|
| 1071 |
+
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1072 |
+
inputs=[field_rows_state],
|
| 1073 |
+
outputs=[remove_field_name]
|
| 1074 |
+
)
|
| 1075 |
+
|
| 1076 |
+
add_update_field_btn.click(
|
| 1077 |
+
fn=fields_add_or_update,
|
| 1078 |
+
inputs=[field_name_in, field_type_in, enum_values_in, instructions_in, field_rows_state],
|
| 1079 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1080 |
+
).then(
|
| 1081 |
+
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1082 |
+
inputs=[field_rows_state],
|
| 1083 |
+
outputs=[remove_field_name]
|
| 1084 |
+
)
|
| 1085 |
+
|
| 1086 |
+
remove_field_btn.click(
|
| 1087 |
+
fn=fields_remove,
|
| 1088 |
+
inputs=[remove_field_name, field_rows_state],
|
| 1089 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1090 |
+
).then(
|
| 1091 |
+
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1092 |
+
inputs=[field_rows_state],
|
| 1093 |
+
outputs=[remove_field_name]
|
| 1094 |
+
)
|
| 1095 |
+
|
| 1096 |
+
fields_apply_btn.click(
|
| 1097 |
+
fn=fields_apply_df,
|
| 1098 |
+
inputs=[field_rows_state, fields_df],
|
| 1099 |
+
outputs=[field_rows_state, fields_df, field_spec, fields_status]
|
| 1100 |
+
).then(
|
| 1101 |
+
fn=lambda rows: gr.update(choices=[r["field"] for r in rows], value=None),
|
| 1102 |
+
inputs=[field_rows_state],
|
| 1103 |
+
outputs=[remove_field_name]
|
| 1104 |
+
)
|
| 1105 |
+
|
| 1106 |
+
# Extraction
|
| 1107 |
extract_btn.click(
|
| 1108 |
fn=run_extraction,
|
| 1109 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 1110 |
+
outputs=[overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
+
# Vertical view selection
|
| 1114 |
record_pick.change(
|
| 1115 |
fn=on_pick,
|
| 1116 |
inputs=[record_pick, state_records, state_details],
|
| 1117 |
outputs=[vertical_view, evidence_md]
|
| 1118 |
)
|
| 1119 |
|
| 1120 |
+
# Review mode
|
| 1121 |
+
review_mode.change(fn=toggle_review_mode, inputs=[review_mode], outputs=[vertical_view])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1122 |
|
|
|
|
| 1123 |
save_btn.click(
|
| 1124 |
fn=save_review_changes,
|
| 1125 |
inputs=[record_pick, vertical_view, state_records],
|
| 1126 |
+
outputs=[overview_df, state_records, review_status]
|
| 1127 |
)
|
| 1128 |
|
|
|
|
| 1129 |
export_btn.click(
|
| 1130 |
fn=export_reviewed_csv,
|
| 1131 |
inputs=[state_records],
|
|
|
|
| 1133 |
)
|
| 1134 |
|
| 1135 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1136 |
+
gr.Markdown("Upload `extraction_details.json` from Extract. Synthesis is based strictly on grounded extractions.")
|
| 1137 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1138 |
+
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1139 |
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
| 1140 |
synth_btn = gr.Button("Synthesize Across Papers")
|
| 1141 |
synth_md = gr.Markdown()
|
| 1142 |
+
synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
|
| 1144 |
with gr.Tab("Pending tasks"):
|
| 1145 |
gr.Markdown(
|
| 1146 |
+
"## Pending tasks\n\n"
|
| 1147 |
+
"1) One row per chemical–endpoint pair\n"
|
| 1148 |
+
"- Change schema to output `records[]` and flatten into multiple rows per paper\n\n"
|
| 1149 |
+
"2) Evidence verification\n"
|
| 1150 |
+
"- If evidence quote not found in context → blank value + flag UNVERIFIED\n\n"
|
| 1151 |
+
"3) Taxonomy mapping\n"
|
| 1152 |
+
"- Synonyms + preferred terms for FDA / OECD / MedDRA-like structure\n\n"
|
| 1153 |
+
"4) Column transforms\n"
|
| 1154 |
+
"- Parse NOAEL/LOAEL etc into structured {metric,value,unit,route,duration}\n\n"
|
| 1155 |
+
"5) Compare mode\n"
|
| 1156 |
+
"- Compare across papers by chemical/endpoint, output consensus + disagreements table\n\n"
|
| 1157 |
+
"6) OCR (optional)\n"
|
| 1158 |
+
"- Currently: text-based PDFs only; OCR adds heavy deps"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1159 |
)
|
| 1160 |
|
| 1161 |
if __name__ == "__main__":
|