Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -146,6 +146,7 @@ APP_CSS = """
|
|
| 146 |
# =============================
|
| 147 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 148 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
|
|
|
| 149 |
|
| 150 |
"approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
|
| 151 |
|
|
@@ -217,6 +218,12 @@ PRESET_GENOTOX_OECD = [
|
|
| 217 |
"enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
|
| 218 |
"instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
|
| 219 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
{"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
|
| 221 |
{"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
|
| 222 |
]
|
|
@@ -276,7 +283,7 @@ ENDPOINT_PRESETS: Dict[str, List[str]] = {
|
|
| 276 |
}
|
| 277 |
|
| 278 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 279 |
-
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
|
| 280 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
| 281 |
"Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
|
| 282 |
"Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
|
|
@@ -523,6 +530,7 @@ def openai_structured_extract(
|
|
| 523 |
) -> Dict[str, Any]:
|
| 524 |
field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
|
| 525 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
|
|
|
| 526 |
|
| 527 |
system_msg = (
|
| 528 |
"You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
|
|
@@ -532,16 +540,34 @@ def openai_structured_extract(
|
|
| 532 |
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 533 |
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 534 |
"5) Prefer controlled vocab terms when applicable.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
)
|
| 536 |
|
| 537 |
user_msg = (
|
| 538 |
"CONTROLLED VOCAB (JSON):\n"
|
| 539 |
f"{vocab_text}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
"FIELD INSTRUCTIONS:\n"
|
| 541 |
+ "\n".join(field_instr_lines)
|
| 542 |
+ "\n\n"
|
| 543 |
"EXCERPTS (with page ranges):\n"
|
| 544 |
-
f"{context}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
)
|
| 546 |
|
| 547 |
resp = client.responses.create(
|
|
|
|
| 146 |
# =============================
|
| 147 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 148 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
| 149 |
+
"fda_ctp_tier_enum": ["Tier_1_high_priority","Tier_2_moderate_priority","Tier_3_lower_priority","enough data is not available"],
|
| 150 |
|
| 151 |
"approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
|
| 152 |
|
|
|
|
| 218 |
"enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
|
| 219 |
"instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
|
| 220 |
},
|
| 221 |
+
{
|
| 222 |
+
"field": "fda_ctp_carcinogenicity_tier",
|
| 223 |
+
"type": "enum",
|
| 224 |
+
"enum_values": "Tier_1_high_priority,Tier_2_moderate_priority,Tier_3_lower_priority,enough data is not available",
|
| 225 |
+
"instructions": "Assign FDA CTP carcinogenicity/genotoxicity tier based strictly on provided evidence. If decision cannot be made from excerpts, use exactly: enough data is not available."
|
| 226 |
+
},
|
| 227 |
{"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
|
| 228 |
{"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
|
| 229 |
]
|
|
|
|
| 283 |
}
|
| 284 |
|
| 285 |
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
|
| 286 |
+
"Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489", "carcinogenicity tiering", "FDA CTP tier"],
|
| 287 |
"NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
|
| 288 |
"Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
|
| 289 |
"Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
|
|
|
|
| 530 |
) -> Dict[str, Any]:
|
| 531 |
field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
|
| 532 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 533 |
+
has_fda_tier_field = "fda_ctp_carcinogenicity_tier" in field_instructions
|
| 534 |
|
| 535 |
system_msg = (
|
| 536 |
"You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
|
|
|
|
| 540 |
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 541 |
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 542 |
"5) Prefer controlled vocab terms when applicable.\n"
|
| 543 |
+
"6) Use an INTERNAL Tree-of-Thought process before finalizing JSON:\n"
|
| 544 |
+
" - Branch evidence by endpoint/theme.\n"
|
| 545 |
+
" - Test competing interpretations.\n"
|
| 546 |
+
" - Prune branches that are not directly supported by excerpts.\n"
|
| 547 |
+
" - Select the most evidence-grounded branch only.\n"
|
| 548 |
+
" - Do NOT output reasoning traces; output JSON only.\n"
|
| 549 |
+
"7) If the FDA CTP tier field is requested but evidence is insufficient, output exactly: 'enough data is not available'.\n"
|
| 550 |
)
|
| 551 |
|
| 552 |
user_msg = (
|
| 553 |
"CONTROLLED VOCAB (JSON):\n"
|
| 554 |
f"{vocab_text}\n\n"
|
| 555 |
+
"TREE-OF-THOUGHT EXECUTION FRAMEWORK (internal only, do not output):\n"
|
| 556 |
+
"A) Build evidence map: claims -> quotes -> page ranges.\n"
|
| 557 |
+
"B) Generate candidate interpretations per endpoint.\n"
|
| 558 |
+
"C) Eliminate candidates lacking direct quote support.\n"
|
| 559 |
+
"D) Select final grounded interpretation and populate schema fields.\n"
|
| 560 |
+
"E) For uncertain fields, use explicit fallback values from enum/instructions.\n\n"
|
| 561 |
"FIELD INSTRUCTIONS:\n"
|
| 562 |
+ "\n".join(field_instr_lines)
|
| 563 |
+ "\n\n"
|
| 564 |
"EXCERPTS (with page ranges):\n"
|
| 565 |
+
f"{context}\n\n"
|
| 566 |
+
+ (
|
| 567 |
+
"IMPORTANT: `fda_ctp_carcinogenicity_tier` must be one of "
|
| 568 |
+
"[Tier_1_high_priority, Tier_2_moderate_priority, Tier_3_lower_priority, enough data is not available].\n"
|
| 569 |
+
if has_fda_tier_field else ""
|
| 570 |
+
)
|
| 571 |
)
|
| 572 |
|
| 573 |
resp = client.responses.create(
|