hchevva committed on
Commit
40a8012
·
verified ·
1 Parent(s): ddfd78b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -2
app.py CHANGED
@@ -146,6 +146,7 @@ APP_CSS = """
146
  # =============================
147
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
148
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
 
149
 
150
  "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
151
 
@@ -217,6 +218,12 @@ PRESET_GENOTOX_OECD = [
217
  "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
218
  "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
219
  },
 
 
 
 
 
 
220
  {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
221
  {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
222
  ]
@@ -276,7 +283,7 @@ ENDPOINT_PRESETS: Dict[str, List[str]] = {
276
  }
277
 
278
  ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
279
- "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
280
  "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
281
  "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
282
  "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
@@ -523,6 +530,7 @@ def openai_structured_extract(
523
  ) -> Dict[str, Any]:
524
  field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
525
  vocab_text = json.dumps(controlled_vocab, indent=2)
 
526
 
527
  system_msg = (
528
  "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
@@ -532,16 +540,34 @@ def openai_structured_extract(
532
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
533
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
534
  "5) Prefer controlled vocab terms when applicable.\n"
 
 
 
 
 
 
 
535
  )
536
 
537
  user_msg = (
538
  "CONTROLLED VOCAB (JSON):\n"
539
  f"{vocab_text}\n\n"
 
 
 
 
 
 
540
  "FIELD INSTRUCTIONS:\n"
541
  + "\n".join(field_instr_lines)
542
  + "\n\n"
543
  "EXCERPTS (with page ranges):\n"
544
- f"{context}\n"
 
 
 
 
 
545
  )
546
 
547
  resp = client.responses.create(
 
146
  # =============================
147
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
148
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
149
+ "fda_ctp_tier_enum": ["Tier_1_high_priority","Tier_2_moderate_priority","Tier_3_lower_priority","enough data is not available"],
150
 
151
  "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
152
 
 
218
  "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
219
  "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
220
  },
221
+ {
222
+ "field": "fda_ctp_carcinogenicity_tier",
223
+ "type": "enum",
224
+ "enum_values": "Tier_1_high_priority,Tier_2_moderate_priority,Tier_3_lower_priority,enough data is not available",
225
+ "instructions": "Assign FDA CTP carcinogenicity/genotoxicity tier based strictly on provided evidence. If decision cannot be made from excerpts, use exactly: enough data is not available."
226
+ },
227
  {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
228
  {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
229
  ]
 
283
  }
284
 
285
  ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
286
+ "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489", "carcinogenicity tiering", "FDA CTP tier"],
287
  "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
288
  "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
289
  "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
 
530
  ) -> Dict[str, Any]:
531
  field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
532
  vocab_text = json.dumps(controlled_vocab, indent=2)
533
+ has_fda_tier_field = "fda_ctp_carcinogenicity_tier" in field_instructions
534
 
535
  system_msg = (
536
  "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
 
540
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
541
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
542
  "5) Prefer controlled vocab terms when applicable.\n"
543
+ "6) Use an INTERNAL Tree-of-Thought process before finalizing JSON:\n"
544
+ " - Branch evidence by endpoint/theme.\n"
545
+ " - Test competing interpretations.\n"
546
+ " - Prune branches that are not directly supported by excerpts.\n"
547
+ " - Select the most evidence-grounded branch only.\n"
548
+ " - Do NOT output reasoning traces; output JSON only.\n"
549
+ "7) If the FDA CTP tier field is requested but evidence is insufficient, output exactly: 'enough data is not available'.\n"
550
  )
551
 
552
  user_msg = (
553
  "CONTROLLED VOCAB (JSON):\n"
554
  f"{vocab_text}\n\n"
555
+ "TREE-OF-THOUGHT EXECUTION FRAMEWORK (internal only, do not output):\n"
556
+ "A) Build evidence map: claims -> quotes -> page ranges.\n"
557
+ "B) Generate candidate interpretations per endpoint.\n"
558
+ "C) Eliminate candidates lacking direct quote support.\n"
559
+ "D) Select final grounded interpretation and populate schema fields.\n"
560
+ "E) For uncertain fields, use explicit fallback values from enum/instructions.\n\n"
561
  "FIELD INSTRUCTIONS:\n"
562
  + "\n".join(field_instr_lines)
563
  + "\n\n"
564
  "EXCERPTS (with page ranges):\n"
565
+ f"{context}\n\n"
566
+ + (
567
+ "IMPORTANT: `fda_ctp_carcinogenicity_tier` must be one of "
568
+ "[Tier_1_high_priority, Tier_2_moderate_priority, Tier_3_lower_priority, enough data is not available].\n"
569
+ if has_fda_tier_field else ""
570
+ )
571
  )
572
 
573
  resp = client.responses.create(