Spaces:

hchevva
/

NLP_Project

Running

App Files Community

hchevva committed 11 days ago

Commit

594d25a

verified ·

1 Parent(s): 79b39e6

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -13

app.py CHANGED Viewed

@@ -21,40 +21,112 @@ from openai import OpenAI
 DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
   "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
   "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
   "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
-  "oecd_endpoints": [
-    "acute_toxicity","subacute_toxicity","subchronic_toxicity","chronic_toxicity",
-    "carcinogenicity","genotoxicity","reproductive_toxicity","developmental_toxicity",
-    "neurotoxicity","immunotoxicity","endocrine_activity","sensitization","irritation_corrosion"
-  ],
-  "meddra_like_terms": [
-    "hepatic_disorder","renal_disorder","nervous_system_disorder","respiratory_disorder",
-    "skin_and_subcutaneous_tissue_disorder","reproductive_system_disorder",
-    "immune_system_disorder","blood_and_lymphatic_system_disorder"
   ],
-  "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"]
 }"""
 DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
-# types: str, num, bool, list[str], list[num], enum[a,b,c]
 Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
 CAS_numbers | list[str] | Extract any CAS numbers mentioned.
 Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
 Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
 Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
-OECD_endpoints | list[str] | Extract endpoints; prefer controlled vocab 'oecd_endpoints' when applicable.
-MedDRA_like_terms | list[str] | Extract effects; prefer controlled vocab 'meddra_like_terms' when applicable.
 Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
 Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
 Conclusion | str | What does the paper conclude about safety/risk?
 """
 # =============================
 # PDF extraction (text-based PDFs only)
 # =============================
@@ -199,6 +271,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
             schema = {"type": "number"}
         elif ftype == "bool":
             schema = {"type": "boolean"}
         elif ftype.startswith("list[str]"):
             schema = {"type": "array", "items": {"type": "string"}}
         elif ftype.startswith("list[num]"):
@@ -292,6 +368,10 @@ def openai_structured_extract(
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
     )
     user_msg = (

 DEFAULT_CONTROLLED_VOCAB_JSON = """{
   "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
+  "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
   "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
+  "in_silico_method_enum": [
+    "qsar",
+    "read_across",
+    "molecular_docking",
+    "molecular_dynamics",
+    "pbpk_pbtK",
+    "aop_based",
+    "ml_model",
+    "other",
+    "not_reported"
+  ],
+  "nams_method_enum": [
+    "high_throughput_screening_hts",
+    "omics_transcriptomics",
+    "omics_proteomics",
+    "omics_metabolomics",
+    "organ_on_chip",
+    "microphysiological_system_mps",
+    "3d_tissue_model",
+    "in_chemico_assay",
+    "in_silico_as_nams",
+    "other",
+    "not_reported"
+  ],
   "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
   "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
+  "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
+  "genotoxicity_oecd_tg_in_vitro_enum": [
+    "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
+    "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
+    "OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt)",
+    "OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test",
+    "OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase)",
+    "not_reported"
+  ],
+  "genotoxicity_oecd_tg_in_vivo_enum": [
+    "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test",
+    "OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test",
+    "OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays",
+    "OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay",
+    "not_reported"
   ],
+  "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"],
+  "genotoxicity_result_keywords": {
+    "positive": [
+      "genotoxic","mutagenic","clastogenic","statistically_significant_increase",
+      "significant_increase_in_mutations","induced_dna_damage","dose_dependent_increase"
+    ],
+    "negative": [
+      "non_genotoxic","not_genotoxic","not_mutagenic","no_evidence_of_genotoxicity",
+      "no_statistically_significant_increase","negative_result"
+    ],
+    "equivocal": ["equivocal","inconclusive"]
+  }
 }"""
 DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
+# types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
 Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
 CAS_numbers | list[str] | Extract any CAS numbers mentioned.
+Approach | enum[in_vivo,in_vitro,in_silico,nams,mixed,not_reported] | Identify if results are in silico or NAMs; use 'mixed' if multiple.
+In_silico_methods | list[enum[qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported]] | If in_silico, list methods used (can be multiple).
+NAMs_methods | list[enum[high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported]] | If NAMs, list methods used (can be multiple).
 Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
 Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
 Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
+Genotox_OECD_TG_in_vitro | list[enum[
+  OECD_TG_471_Bacterial Reverse mutation test(AMES test),
+  OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,
+  OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),
+  OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,
+  OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),
+  not_reported
+]] | If genotoxicity in vitro tests are reported, select all applicable TGs. Otherwise not_reported.
+Genotox_OECD_TG_in_vivo | list[enum[
+  OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,
+  OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,
+  OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,
+  OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,
+  not_reported
+]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
+Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results language (see genotoxicity_result_keywords in vocab).
+Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
 Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
 Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
 Conclusion | str | What does the paper conclude about safety/risk?
 """
 # =============================
 # PDF extraction (text-based PDFs only)
 # =============================
             schema = {"type": "number"}
         elif ftype == "bool":
             schema = {"type": "boolean"}
+        elif ftype.startswith("list[enum[") and ftype.endswith("]]"):
+            inside = ftype[len("list[enum["):-2].strip()
+            vals = [v.strip() for v in inside.split(",") if v.strip()]
+            schema = {"type": "array", "items": {"type": "string", "enum": vals}}
         elif ftype.startswith("list[str]"):
             schema = {"type": "array", "items": {"type": "string"}}
         elif ftype.startswith("list[num]"):
         "3) Provide evidence quotes + page ranges for extracted fields.\n"
         "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
         "5) Prefer controlled vocab terms when applicable.\n"
+        "6) For Genotoxicity_result, use genotoxicity_result_keywords (positive/negative/equivocal) as guidance, but ONLY if the paper explicitly reports results.\n"
+        "7) For OECD TG fields, only populate if the TG is explicitly stated or clearly described; otherwise use not_reported.\n"
+        "8) For NAMs/in_silico fields, only populate if methods are explicitly described; otherwise not_reported.\n"
     )
     user_msg = (