hchevva committed on
Commit
594d25a
·
verified ·
1 Parent(s): 79b39e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -13
app.py CHANGED
@@ -21,40 +21,112 @@ from openai import OpenAI
21
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
22
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
23
 
 
 
24
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
26
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
27
 
28
- "oecd_endpoints": [
29
- "acute_toxicity","subacute_toxicity","subchronic_toxicity","chronic_toxicity",
30
- "carcinogenicity","genotoxicity","reproductive_toxicity","developmental_toxicity",
31
- "neurotoxicity","immunotoxicity","endocrine_activity","sensitization","irritation_corrosion"
32
- ],
33
 
34
- "meddra_like_terms": [
35
- "hepatic_disorder","renal_disorder","nervous_system_disorder","respiratory_disorder",
36
- "skin_and_subcutaneous_tissue_disorder","reproductive_system_disorder",
37
- "immune_system_disorder","blood_and_lymphatic_system_disorder"
 
 
 
 
 
 
 
 
 
 
38
  ],
39
 
40
- "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"]
 
 
 
 
 
 
 
 
 
 
 
 
41
  }"""
42
 
 
43
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
44
- # types: str, num, bool, list[str], list[num], enum[a,b,c]
 
45
  Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
46
  CAS_numbers | list[str] | Extract any CAS numbers mentioned.
 
 
 
 
 
47
  Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
48
  Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
49
  Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
50
- OECD_endpoints | list[str] | Extract endpoints; prefer controlled vocab 'oecd_endpoints' when applicable.
51
- MedDRA_like_terms | list[str] | Extract effects; prefer controlled vocab 'meddra_like_terms' when applicable.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
53
  Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
54
  Conclusion | str | What does the paper conclude about safety/risk?
55
  """
56
 
57
 
 
58
  # =============================
59
  # PDF extraction (text-based PDFs only)
60
  # =============================
@@ -199,6 +271,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
199
  schema = {"type": "number"}
200
  elif ftype == "bool":
201
  schema = {"type": "boolean"}
 
 
 
 
202
  elif ftype.startswith("list[str]"):
203
  schema = {"type": "array", "items": {"type": "string"}}
204
  elif ftype.startswith("list[num]"):
@@ -292,6 +368,10 @@ def openai_structured_extract(
292
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
293
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
294
  "5) Prefer controlled vocab terms when applicable.\n"
 
 
 
 
295
  )
296
 
297
  user_msg = (
 
21
  DEFAULT_CONTROLLED_VOCAB_JSON = """{
22
  "risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
23
 
24
+ "approach_enum": ["in_vivo","in_vitro","in_silico","nams","mixed","not_reported"],
25
+
26
  "study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
27
+ "in_silico_method_enum": [
28
+ "qsar",
29
+ "read_across",
30
+ "molecular_docking",
31
+ "molecular_dynamics",
32
+ "pbpk_pbtK",
33
+ "aop_based",
34
+ "ml_model",
35
+ "other",
36
+ "not_reported"
37
+ ],
38
+ "nams_method_enum": [
39
+ "high_throughput_screening_hts",
40
+ "omics_transcriptomics",
41
+ "omics_proteomics",
42
+ "omics_metabolomics",
43
+ "organ_on_chip",
44
+ "microphysiological_system_mps",
45
+ "3d_tissue_model",
46
+ "in_chemico_assay",
47
+ "in_silico_as_nams",
48
+ "other",
49
+ "not_reported"
50
+ ],
51
+
52
  "exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
53
  "species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
54
 
55
+ "dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"],
 
 
 
 
56
 
57
+ "genotoxicity_oecd_tg_in_vitro_enum": [
58
+ "OECD_TG_471_Bacterial Reverse mutation test(AMES test)",
59
+ "OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test",
60
+ "OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt)",
61
+ "OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test",
62
+ "OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase)",
63
+ "not_reported"
64
+ ],
65
+ "genotoxicity_oecd_tg_in_vivo_enum": [
66
+ "OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test",
67
+ "OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test",
68
+ "OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays",
69
+ "OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay",
70
+ "not_reported"
71
  ],
72
 
73
+ "genotoxicity_result_enum": ["positive","negative","equivocal","not_reported"],
74
+
75
+ "genotoxicity_result_keywords": {
76
+ "positive": [
77
+ "genotoxic","mutagenic","clastogenic","statistically_significant_increase",
78
+ "significant_increase_in_mutations","induced_dna_damage","dose_dependent_increase"
79
+ ],
80
+ "negative": [
81
+ "non_genotoxic","not_genotoxic","not_mutagenic","no_evidence_of_genotoxicity",
82
+ "no_statistically_significant_increase","negative_result"
83
+ ],
84
+ "equivocal": ["equivocal","inconclusive"]
85
+ }
86
  }"""
87
 
88
+
89
  DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
90
+ # types: str, num, bool, list[str], list[num], enum[a,b,c], list[enum[a,b,c]]
91
+
92
  Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
93
  CAS_numbers | list[str] | Extract any CAS numbers mentioned.
94
+
95
+ Approach | enum[in_vivo,in_vitro,in_silico,nams,mixed,not_reported] | Identify if results are in silico or NAMs; use 'mixed' if multiple.
96
+ In_silico_methods | list[enum[qsar,read_across,molecular_docking,molecular_dynamics,pbpk_pbtK,aop_based,ml_model,other,not_reported]] | If in_silico, list methods used (can be multiple).
97
+ NAMs_methods | list[enum[high_throughput_screening_hts,omics_transcriptomics,omics_proteomics,omics_metabolomics,organ_on_chip,microphysiological_system_mps,3d_tissue_model,in_chemico_assay,in_silico_as_nams,other,not_reported]] | If NAMs, list methods used (can be multiple).
98
+
99
  Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
100
  Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
101
  Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
102
+
103
+ Genotox_OECD_TG_in_vitro | list[enum[
104
+ OECD_TG_471_Bacterial Reverse mutation test(AMES test),
105
+ OECD_TG_473_In Vitro Mammalian Chromosomal Aberration Test,
106
+ OECD_TG_476_In Vitro Mammalian Cell Gene Mutation Tests (Hprt & xprt),
107
+ OECD_TG_487_In Vitro Mammalian Cell Micronucleus Test,
108
+ OECD_TG_490_In Vitro Mammalian Cell Gene Mutation Tests (Thymidine Kinase),
109
+ not_reported
110
+ ]] | If genotoxicity in vitro tests are reported, select all applicable TGs. Otherwise not_reported.
111
+
112
+ Genotox_OECD_TG_in_vivo | list[enum[
113
+ OECD_TG_474_In Vivo Mammalian Erythrocyte Micronucleus Test,
114
+ OECD_TG_475_Mammalian Bone Marrow Chromosomal Aberration Test,
115
+ OECD_TG_488_Transgenic Rodent Somatic & Germ Cell Gene Mutation Assays,
116
+ OECD_TG_489_In Vivo Mammalian Alkaline Comet Assay,
117
+ not_reported
118
+ ]] | If genotoxicity in vivo tests are reported, select all applicable TGs. Otherwise not_reported.
119
+
120
+ Genotoxicity_result | enum[positive,negative,equivocal,not_reported] | Classify based on reported results language (see genotoxicity_result_keywords in vocab).
121
+ Genotoxicity_result_notes | str | Short explanation grounded to the paper’s wording + what test context it applies to.
122
+
123
  Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
124
  Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
125
  Conclusion | str | What does the paper conclude about safety/risk?
126
  """
127
 
128
 
129
+
130
  # =============================
131
  # PDF extraction (text-based PDFs only)
132
  # =============================
 
271
  schema = {"type": "number"}
272
  elif ftype == "bool":
273
  schema = {"type": "boolean"}
274
+ elif ftype.startswith("list[enum[") and ftype.endswith("]]"):
275
+ inside = ftype[len("list[enum["):-2].strip()
276
+ vals = [v.strip() for v in inside.split(",") if v.strip()]
277
+ schema = {"type": "array", "items": {"type": "string", "enum": vals}}
278
  elif ftype.startswith("list[str]"):
279
  schema = {"type": "array", "items": {"type": "string"}}
280
  elif ftype.startswith("list[num]"):
 
368
  "3) Provide evidence quotes + page ranges for extracted fields.\n"
369
  "4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
370
  "5) Prefer controlled vocab terms when applicable.\n"
371
+ "6) For Genotoxicity_result, use genotoxicity_result_keywords (positive/negative/equivocal) as guidance, but ONLY if the paper explicitly reports results.\n"
372
+ "7) For OECD TG fields, only populate if the TG is explicitly stated or clearly described; otherwise use not_reported.\n"
373
+ "8) For NAMs/in_silico fields, only populate if methods are explicitly described; otherwise not_reported.\n"
374
+
375
  )
376
 
377
  user_msg = (