Spaces:

hchevva
/

NLP_Project

Runtime error

App Files Community

hchevva committed on Feb 14

Commit

40a8012

verified ·

1 Parent(s): ddfd78b

Upload app.py

Browse files

Files changed (1) hide show

app.py +28 -2

app.py CHANGED Viewed

@@ -146,6 +146,7 @@ APP_CSS = """
 # =============================
 DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
   "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
@@ -217,6 +218,12 @@ PRESET_GENOTOX_OECD = [
         "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
         "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
     },
     {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
     {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
 ]
@@ -276,7 +283,7 @@ ENDPOINT_PRESETS: Dict[str, List[str]] = {
 }
 ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
-    "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489"],
     "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
     "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
     "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
@@ -523,6 +530,7 @@ def openai_structured_extract(
 ) -> Dict[str, Any]:
     field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
     vocab_text = json.dumps(controlled_vocab, indent=2)
     system_msg = (
         "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
@@ -532,16 +540,34 @@ def openai_structured_extract(
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
     )
     user_msg = (
         "CONTROLLED VOCAB (JSON):\n"
         f"{vocab_text}\n\n"
         "FIELD INSTRUCTIONS:\n"
         + "\n".join(field_instr_lines)
         + "\n\n"
         "EXCERPTS (with page ranges):\n"
-        f"{context}\n"
     )
     resp = client.responses.create(

 # =============================
 DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
+  "fda_ctp_tier_enum": ["Tier_1_high_priority","Tier_2_moderate_priority","Tier_3_lower_priority","enough data is not available"],
   "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
         "enum_values": "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,not_reported",
         "instructions": "Select all in vivo OECD TGs explicitly reported (or clearly described). If none, use not_reported."
     },
+    {
+        "field": "fda_ctp_carcinogenicity_tier",
+        "type": "enum",
+        "enum_values": "Tier_1_high_priority,Tier_2_moderate_priority,Tier_3_lower_priority,enough data is not available",
+        "instructions": "Assign FDA CTP carcinogenicity/genotoxicity tier based strictly on provided evidence. If decision cannot be made from excerpts, use exactly: enough data is not available."
+    },
     {"field": "genotoxicity_result", "type": "enum", "enum_values": "positive,negative,equivocal,not_reported", "instructions": "Classify overall genotoxicity outcome as reported. If unclear, not_reported."},
     {"field": "genotoxicity_result_notes", "type": "str", "enum_values": "", "instructions": "Short explanation grounded to text + test context (e.g., AMES, micronucleus)."},
 ]
 }
 ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
+    "Genotoxicity (OECD TG)": ["genotoxicity", "mutagenicity", "AMES", "micronucleus", "comet assay", "chromosomal aberration", "OECD TG 471 473 476 487 490 474 489", "carcinogenicity tiering", "FDA CTP tier"],
     "NAMs / In Silico": ["in silico", "QSAR", "read-across", "AOP", "PBPK", "high-throughput", "omics", "organ-on-chip", "microphysiological"],
     "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
     "Repeated dose toxicity": ["repeated dose", "subchronic", "chronic", "NOAEL", "LOAEL", "target organ", "90-day", "28-day"],
 ) -> Dict[str, Any]:
     field_instr_lines = [f"- {k}: {v if v else '(no extra instructions)'}" for k, v in field_instructions.items()]
     vocab_text = json.dumps(controlled_vocab, indent=2)
+    has_fda_tier_field = "fda_ctp_carcinogenicity_tier" in field_instructions
     system_msg = (
         "You are a toxicology research paper data-extraction assistant for an industry safety assessor.\n"
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
+        "6) Use an INTERNAL Tree-of-Thought process before finalizing JSON:\n"
+        "   - Branch evidence by endpoint/theme.\n"
+        "   - Test competing interpretations.\n"
+        "   - Prune branches that are not directly supported by excerpts.\n"
+        "   - Select the most evidence-grounded branch only.\n"
+        "   - Do NOT output reasoning traces; output JSON only.\n"
+        "7) If the FDA CTP tier field is requested but evidence is insufficient, output exactly: 'enough data is not available'.\n"
     )
     user_msg = (
         "CONTROLLED VOCAB (JSON):\n"
         f"{vocab_text}\n\n"
+        "TREE-OF-THOUGHT EXECUTION FRAMEWORK (internal only, do not output):\n"
+        "A) Build evidence map: claims -> quotes -> page ranges.\n"
+        "B) Generate candidate interpretations per endpoint.\n"
+        "C) Eliminate candidates lacking direct quote support.\n"
+        "D) Select final grounded interpretation and populate schema fields.\n"
+        "E) For uncertain fields, use explicit fallback values from enum/instructions.\n\n"
         "FIELD INSTRUCTIONS:\n"
         + "\n".join(field_instr_lines)
         + "\n\n"
         "EXCERPTS (with page ranges):\n"
+        f"{context}\n\n"
+        + (
+            "IMPORTANT: `fda_ctp_carcinogenicity_tier` must be one of "
+            "[Tier_1_high_priority, Tier_2_moderate_priority, Tier_3_lower_priority, enough data is not available].\n"
+            if has_fda_tier_field else ""
+        )
     )
     resp = client.responses.create(