EphAsad committed on
Commit
8f3c0ec
·
verified ·
1 Parent(s): e11bc4c

Update engine/parser_ext.py

Browse files
Files changed (1) hide show
  1. engine/parser_ext.py +142 -56
engine/parser_ext.py CHANGED
@@ -1,14 +1,45 @@
1
  # engine/parser_ext.py
2
  # ======================================================================
3
  # Extended test parser — Stage 12C-fix2
 
 
 
 
 
 
 
 
 
 
 
4
  # ======================================================================
5
 
6
  from __future__ import annotations
7
- import os, re
8
- from typing import Dict, Any
 
 
9
 
10
  UNKNOWN = "Unknown"
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # ======================================================================
13
  # Helpers
14
  # ======================================================================
@@ -25,113 +56,155 @@ def _set_if_stronger(parsed: Dict[str,str], field: str, value: str):
25
  if field not in parsed or parsed[field] == UNKNOWN:
26
  parsed[field] = value
27
 
28
- def _parse_pnv_after_anchor(text: str, parsed: Dict[str,str], field: str):
 
 
 
 
 
29
  m = re.search(
30
- rf"\b{re.escape(field.lower())}\b\s*(positive|negative|variable|unknown)",
31
- text.lower()
 
32
  )
33
  if m:
34
  _set_if_stronger(parsed, field, m.group(1).capitalize())
35
 
 
 
 
 
 
 
 
 
 
 
36
  # ======================================================================
37
  # 1. Gas Production
38
  # ======================================================================
39
 
40
  def _parse_gas_production(text: str, parsed: Dict[str,str]):
41
  t = text.lower()
42
- if any(x in t for x in [
 
43
  "produces gas","gas produced","with gas",
44
  "gas production positive","gas producer",
45
- "production of gas","ferments glucose with gas"
46
- ]):
47
- _set_if_stronger(parsed,"Gas Production","Positive")
48
- elif any(x in t for x in [
49
  "does not produce gas","no gas",
50
- "absence of gas","gas production negative"
51
- ]):
 
 
 
 
52
  _set_if_stronger(parsed,"Gas Production","Negative")
53
 
54
  # ======================================================================
55
- # 2. Motility Type (fixed)
56
  # ======================================================================
57
 
58
- MOTILITY_TYPES = {
59
  "Peritrichous","Monotrichous","Polytrichous","Polar",
60
- "Swarming","Tumbling","Gliding","Corkscrew","Axial"
61
- }
62
 
63
  def _parse_motility_type(text: str, parsed: Dict[str,str]):
64
  t = text.lower()
65
 
66
- # Explicit negative
67
- if re.search(r"\bmotility type\b\s*(negative|none)", t):
68
- _set_if_stronger(parsed,"Motility Type","Negative")
 
 
69
  return
70
 
71
- # Anchor-based
72
- m = re.search(r"\bmotility type\b\s*[:\-]?\s*([a-z]+)", t)
 
 
 
73
  if m:
74
  val = m.group(1).capitalize()
75
  if val in MOTILITY_TYPES:
76
  _set_if_stronger(parsed,"Motility Type",val)
77
  return
78
 
79
- # Free word
80
  for mt in MOTILITY_TYPES:
81
  if re.search(rf"\b{mt.lower()}\b", t):
82
  _set_if_stronger(parsed,"Motility Type",mt)
83
  return
84
 
85
  # ======================================================================
86
- # 3. Pigment (unchanged)
87
  # ======================================================================
88
 
 
 
 
 
 
 
89
  def _parse_pigment(text: str, parsed: Dict[str,str]):
90
  t = text.lower()
91
- if not re.search(r"\b(pigment|pigmentation)\b", t):
92
- return
93
 
94
- if "no pigmentation" in t or "pigment none" in t:
95
- _set_if_stronger(parsed,"Pigment","None")
96
  return
97
 
98
- pigments = []
99
- for p in [
100
- "pyocyanin","pyoverdine","pyovacin",
101
- "green","yellow","pink","red","orange",
102
- "brown","black","violet","cream"
103
- ]:
104
  if re.search(rf"\b{p}\b", t):
105
- pigments.append(p.capitalize())
106
 
107
- if pigments:
108
- _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(pigments))))
 
 
109
 
110
  # ======================================================================
111
  # 4. Colony Pattern (explicit only)
112
  # ======================================================================
113
 
 
 
 
 
 
 
114
  def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
115
  t = text.lower()
 
116
  if not re.search(r"\bcolony pattern\b", t):
117
  return
118
 
119
- m = re.search(r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)", t)
 
 
 
120
  if m:
121
- _set_if_stronger(parsed,"Colony Pattern",m.group(1).capitalize())
 
 
122
 
123
  # ======================================================================
124
  # 5. Odor (anchor-based)
125
  # ======================================================================
126
 
127
  def _parse_odor(text: str, parsed: Dict[str,str]):
 
 
128
  m = re.search(
129
  r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
130
- text.lower()
131
  )
132
  if not m:
133
  return
134
- vals = [v.strip().capitalize() for v in m.group(2).split(";") if v.strip()]
 
 
135
  if vals:
136
  _set_if_stronger(parsed,"Odor","; ".join(vals))
137
 
@@ -141,6 +214,7 @@ def _parse_odor(text: str, parsed: Dict[str,str]):
141
 
142
  def _parse_tsi(text: str, parsed: Dict[str,str]):
143
  t = text.upper()
 
144
  if "TSI" in t and "UNKNOWN" in t:
145
  _set_if_stronger(parsed,"TSI Pattern","Unknown")
146
  return
@@ -148,38 +222,48 @@ def _parse_tsi(text: str, parsed: Dict[str,str]):
148
  m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
149
  if m:
150
  base = m.group(1)
151
- _set_if_stronger(parsed,"TSI Pattern", base + ("+H2S" if m.group(2) else ""))
 
 
 
152
 
153
  # ======================================================================
154
  # 7. NaCl Tolerant (>=6%)
155
  # ======================================================================
156
 
157
  def _parse_nacl(text: str, parsed: Dict[str,str]):
158
- _parse_pnv_after_anchor(text, parsed, "NaCl Tolerant (>=6%)")
 
 
 
 
159
 
160
  # ======================================================================
161
- # 8. Haemolysis Type (fixed)
162
  # ======================================================================
163
 
164
  def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
165
  m = re.search(
166
  r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)",
167
- text.lower()
 
168
  )
169
  if m:
170
  _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
171
 
172
  # ======================================================================
173
- # 9. Ornithine / Ornitihine Decarboxylase alias sync
174
  # ======================================================================
175
 
176
- def _sync_ornithine(parsed: Dict[str,str]):
177
- if "Ornitihine Decarboxylase" in parsed:
178
- _set_if_stronger(
179
- parsed,
180
- "Ornithine Decarboxylase",
181
- parsed["Ornitihine Decarboxylase"]
182
- )
 
 
183
 
184
  # ======================================================================
185
  # MAIN
@@ -201,11 +285,13 @@ def parse_text_extended(text: str) -> Dict[str,Any]:
201
  _parse_tsi(cleaned, parsed)
202
  _parse_nacl(cleaned, parsed)
203
  _parse_haemolysis_type(cleaned, parsed)
204
- _sync_ornithine(parsed)
205
 
206
- # If Motility is explicitly negative anywhere → Motility Type negative
207
- if parsed.get("Motility") == "Negative":
208
- _set_if_stronger(parsed,"Motility Type","Negative")
 
 
209
 
210
  return {
211
  "parsed_fields": parsed,
 
1
  # engine/parser_ext.py
2
  # ======================================================================
3
  # Extended test parser — Stage 12C-fix2
4
+ #
5
+ # Fixes added after eval_parsers (~0.9045 accuracy):
6
+ # ✔ Haemolysis Type: supports "None"
7
+ # ✔ Ornithine Decarboxylase: supports correct spelling + typo alias sync
8
+ # ✔ Motility Type: supports "Negative"/"None" when explicitly stated
9
+ #
10
+ # GOAL:
11
+ # • Explicit-only parsing
12
+ # • ML-safe
13
+ # • Deterministic
14
+ # • No inference
15
  # ======================================================================
16
 
17
  from __future__ import annotations
18
+ import os, re, json
19
+ from typing import Dict, Any, List
20
+
# Sentinel stored for a field whose value has not been determined. The helper
# that writes into the parsed dict lets any later value overwrite it.
UNKNOWN = "Unknown"

# Location of the extended-schema JSON file, relative to the working directory.
EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
24
 
25
+ # ======================================================================
26
+ # Fields NOT parsed here
27
+ # ======================================================================
# Names of the fields that this module does not parse.
CORE_FIELDS = {
    "Genus",
    "Species",
    "Gram Stain",
    "Shape",
    "Colony Morphology",
    "Haemolysis",
    "Motility",
    "Capsule",
    "Spore Formation",
    "Growth Temperature",
    "Oxygen Requirement",
    "Media Grown On",
    "Catalase",
    "Oxidase",
    "Indole",
    "Urease",
    "Citrate",
    "Methyl Red",
    "VP",
    "H2S",
    "DNase",
    "ONPG",
    "Coagulase",
    "Lipase Test",
    "Nitrate Reduction",
    "Lysine Decarboxylase",
    "Arginine dihydrolase",
    "Gelatin Hydrolysis",
    "Esculin Hydrolysis",
    "Glucose Fermentation",
    "Lactose Fermentation",
    "Sucrose Fermentation",
    "Mannitol Fermentation",
    "Sorbitol Fermentation",
    "Maltose Fermentation",
    "Xylose Fermentation",
    "Rhamnose Fermentation",
    "Arabinose Fermentation",
    "Raffinose Fermentation",
    "Trehalose Fermentation",
    "Inositol Fermentation",
}
42
+
43
  # ======================================================================
44
  # Helpers
45
  # ======================================================================
 
56
  if field not in parsed or parsed[field] == UNKNOWN:
57
  parsed[field] = value
58
 
def _parse_pnv_after_anchor(
    text: str,
    parsed: Dict[str, str],
    field: str,
    anchor: str,
):
    """Record a positive/negative/variable/unknown result that follows an anchor.

    Searches ``text`` case-insensitively for ``anchor`` followed, after
    optional whitespace and an optional ``:`` or ``-`` separator, by one of
    the words ``positive``, ``negative``, ``variable`` or ``unknown``. On a
    match the capitalised word is passed to ``_set_if_stronger`` for ``field``.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place.
        field: Key under which the result is stored.
        anchor: Literal label to look for in ``text`` (regex-escaped here).
    """
    # Look-arounds replace \b around the anchor: \b only exists next to a word
    # character, so an anchor that ends in punctuation such as
    # "NaCl Tolerant (>=6%)" could never match when followed by a space.
    # They still reject an anchor glued to a neighbouring word character.
    pattern = (
        rf"(?<!\w){re.escape(anchor)}(?!\w)"
        r"\s*[:\-]?\s*(positive|negative|variable|unknown)\b"
    )
    m = re.search(pattern, text, re.IGNORECASE)
    if m:
        _set_if_stronger(parsed, field, m.group(1).capitalize())
72
 
def _load_extended_schema(path: str) -> Dict[str, Any]:
    """Load the extended schema from a JSON file, best-effort.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded top-level JSON value when it is a dict. An empty dict is
        returned when the file is missing or unreadable, when its content is
        not valid UTF-8 JSON, or when the top-level value is not an object.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            obj = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing, unreadable or non-file path, so no
        # separate existence check is needed. ValueError covers malformed
        # JSON (JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
        return {}
    return obj if isinstance(obj, dict) else {}
82
+
83
  # ======================================================================
84
  # 1. Gas Production
85
  # ======================================================================
86
 
def _parse_gas_production(text: str, parsed: Dict[str, str]):
    """Set "Gas Production" to Positive or Negative from stock phrases.

    Matching is by lower-cased substring. Negated phrases are blanked out
    before the positive phrases are looked for, so wording such as
    "no gas produced" is not misread as the positive phrase "gas produced".
    When a positive phrase remains after that step the result is Positive;
    otherwise, if any negated phrase was present, the result is Negative.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    lowered = text.lower()

    positive_phrases = (
        "produces gas", "gas produced", "with gas",
        "gas production positive", "gas producer",
        "production of gas", "ferments glucose with gas",
    )
    negative_phrases = (
        "does not produce gas", "no gas",
        "absence of gas", "gas production negative",
        # Negations that contain one of the positive phrases verbatim.
        "no production of gas", "not a gas producer",
    )

    # Remove every negated phrase; the "|" keeps the surrounding words from
    # joining up into a new phrase.
    remainder = lowered
    negated = False
    for phrase in negative_phrases:
        if phrase in remainder:
            negated = True
            remainder = remainder.replace(phrase, " | ")

    if any(phrase in remainder for phrase in positive_phrases):
        _set_if_stronger(parsed, "Gas Production", "Positive")
    elif negated:
        _set_if_stronger(parsed, "Gas Production", "Negative")
104
 
105
  # ======================================================================
106
+ # 2. Motility Type (explicit)
107
  # ======================================================================
108
 
# Accepted "Motility Type" values, spelled as they are stored.
# Order matters: the free-word scan stops at the first entry found in the text.
MOTILITY_TYPES = [
    "Peritrichous",
    "Monotrichous",
    "Polytrichous",
    "Polar",
    "Swarming",
    "Tumbling",
    "Gliding",
    "Corkscrew",
    "Axial",
]
113
 
def _parse_motility_type(text: str, parsed: Dict[str, str]):
    """Set "Motility Type" from the lower-cased text.

    Three attempts are made in order, and the first that succeeds wins:
      1. "motility type" followed by "negative" or "none".
      2. "motility type" followed by a word that is one of MOTILITY_TYPES.
      3. Any MOTILITY_TYPES entry appearing as a whole word, in list order.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    lowered = text.lower()

    # 1. Explicit "Motility Type Negative" / "Motility Type None".
    explicit_none = re.search(
        r"\bmotility type\b\s*[:\-]?\s*(negative|none)\b", lowered
    )
    if explicit_none is not None:
        _set_if_stronger(
            parsed, "Motility Type", explicit_none.group(1).capitalize()
        )
        return

    # 2. Word directly after the anchor, e.g. "Motility Type Swarming".
    anchored = re.search(r"\bmotility type\b\s*[:\-]?\s*([a-z]+)", lowered)
    candidate = anchored.group(1).capitalize() if anchored else None
    if candidate in MOTILITY_TYPES:
        _set_if_stronger(parsed, "Motility Type", candidate)
        return

    # 3. First known type that appears anywhere as a whole word.
    first_hit = next(
        (
            name
            for name in MOTILITY_TYPES
            if re.search(rf"\b{name.lower()}\b", lowered)
        ),
        None,
    )
    if first_hit is not None:
        _set_if_stronger(parsed, "Motility Type", first_hit)
140
 
141
  # ======================================================================
142
+ # 3. Pigment (explicit only)
143
  # ======================================================================
144
 
# Lower-case pigment and colour words that are searched for as whole words.
PIGMENT_TERMS = [
    "pyocyanin",
    "pyoverdine",
    "pyovacin",
    "green",
    "yellow",
    "pink",
    "red",
    "orange",
    "brown",
    "black",
    "violet",
    "cream",
]
150
+
def _parse_pigment(text: str, parsed: Dict[str, str]):
    """Set "Pigment" when the text mentions "pigment" or "pigmentation".

    "no pigmentation" or "pigment none" yields "None". Otherwise every
    PIGMENT_TERMS entry found as a whole word is capitalised, de-duplicated,
    sorted and joined with "; ". Nothing is set when no term is found.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    lowered = text.lower()

    # Without the anchor word, colour words are not treated as pigments.
    if re.search(r"\b(pigment|pigmentation)\b", lowered) is None:
        return

    # An explicit absence takes precedence over any colour word.
    if "no pigmentation" in lowered or "pigment none" in lowered:
        _set_if_stronger(parsed, "Pigment", "None")
        return

    hits = {
        term.capitalize()
        for term in PIGMENT_TERMS
        if re.search(rf"\b{term}\b", lowered)
    }
    if hits:
        _set_if_stronger(parsed, "Pigment", "; ".join(sorted(hits)))
166
 
167
  # ======================================================================
168
  # 4. Colony Pattern (explicit only)
169
  # ======================================================================
170
 
# Accepted "Colony Pattern" values, spelled as str.capitalize() renders them.
COLONY_PATTERNS = [
    "Mucoid",
    "Smooth",
    "Rough",
    "Filamentous",
    "Spreading",
    "Swarming",
    "Sticky",
    "Ground-glass",
    "Molar-tooth",
    "Chalky",
    "Corroding",
]
176
+
def _parse_colony_pattern(text: str, parsed: Dict[str, str]):
    """Set "Colony Pattern" from the word that follows "colony pattern".

    The word after the anchor (letters and hyphens, with an optional ":" or
    "-" separator) is capitalised and stored only when it is one of
    COLONY_PATTERNS.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    lowered = text.lower()

    if re.search(r"\bcolony pattern\b", lowered) is None:
        return

    anchored = re.search(
        r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)", lowered
    )
    if anchored is None:
        return

    label = anchored.group(1).capitalize()
    if label in COLONY_PATTERNS:
        _set_if_stronger(parsed, "Colony Pattern", label)
191
 
192
  # ======================================================================
193
  # 5. Odor (anchor-based)
194
  # ======================================================================
195
 
def _parse_odor(text: str, parsed: Dict[str, str]):
    """Set "Odor" from the words that follow "odor", "odour" or "smell".

    The run of lower-case letters, spaces and semicolons after the anchor is
    split on ";". Each non-empty piece is stripped and capitalised, and the
    pieces are stored joined with "; ".

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    found = re.search(
        r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
        text.lower(),
    )
    if found is None:
        return

    labels = []
    for piece in found.group(2).split(";"):
        piece = piece.strip()
        if piece:
            labels.append(piece.capitalize())

    if labels:
        _set_if_stronger(parsed, "Odor", "; ".join(labels))
210
 
 
214
 
def _parse_tsi(text: str, parsed: Dict[str, str]):
    """Set "TSI Pattern" from the upper-cased text.

    "TSI" (optionally "TSI PATTERN") directly followed by "UNKNOWN" yields
    "Unknown". Otherwise the first slant/butt pattern K/A, A/A, K/K or A/K
    is stored, with "+H2S" appended when "+ H2S" follows it.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    t = text.upper()

    # Only an "Unknown" attached to the TSI label counts. Testing for "TSI"
    # and "UNKNOWN" independently let another field's "Unknown" trigger the
    # early return, so a real pattern in the same text was never recorded.
    if re.search(r"\bTSI\b(?:\s+PATTERN\b)?\s*[:\-]?\s*UNKNOWN\b", t):
        _set_if_stronger(parsed, "TSI Pattern", "Unknown")
        return

    m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
    if m:
        suffix = "+H2S" if m.group(2) else ""
        _set_if_stronger(parsed, "TSI Pattern", m.group(1) + suffix)
229
 
230
  # ======================================================================
231
  # 7. NaCl Tolerant (>=6%)
232
  # ======================================================================
233
 
def _parse_nacl(text: str, parsed: Dict[str, str]):
    """Parse the "NaCl Tolerant (>=6%)" result.

    Delegates to ``_parse_pnv_after_anchor``; the field name is also used as
    the text anchor.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place.
    """
    label = "NaCl Tolerant (>=6%)"
    _parse_pnv_after_anchor(text, parsed, label, label)
240
 
241
  # ======================================================================
242
+ # 8. Haemolysis Type override (supports None)
243
  # ======================================================================
244
 
def _parse_haemolysis_type(text: str, parsed: Dict[str, str]):
    """Set "Haemolysis Type" to Alpha, Beta, Gamma or None.

    Looks, case-insensitively, for "haemolysis type" (the spelling
    "hemolysis type" is accepted too) followed by an optional ":" or "-"
    separator and one of the four values as a whole word.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place via ``_set_if_stronger``.
    """
    m = re.search(
        # "a?" admits the US spelling. The trailing \b stops a value from
        # matching as the prefix of a longer word.
        r"\bha?emolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)\b",
        text,
        re.IGNORECASE,
    )
    if m:
        _set_if_stronger(parsed, "Haemolysis Type", m.group(1).capitalize())
253
 
254
  # ======================================================================
255
+ # 9. Ornithine Decarboxylase: accept both spellings + alias sync
256
  # ======================================================================
257
 
def _parse_ornithine_dec(text: str, parsed: Dict[str, str]):
    """Parse "Ornithine Decarboxylase" under both spellings and sync them.

    The correctly spelled label and the legacy misspelling
    "Ornitihine Decarboxylase" are each parsed into their own field. A value
    found under the misspelling is then offered to the correctly spelled
    field.

    Args:
        text: Text to search.
        parsed: Dict of parsed fields, updated in place.
    """
    correct = "Ornithine Decarboxylase"
    legacy = "Ornitihine Decarboxylase"  # misspelling kept as a legacy field name

    _parse_pnv_after_anchor(text, parsed, correct, correct)
    _parse_pnv_after_anchor(text, parsed, legacy, legacy)

    # _set_if_stronger already refuses to overwrite a real value, so no extra
    # "not in parsed" guard is used. Such a guard also blocked replacing an
    # existing "Unknown" with the value parsed under the legacy spelling.
    if legacy in parsed:
        _set_if_stronger(parsed, correct, parsed[legacy])
267
 
268
  # ======================================================================
269
  # MAIN
 
285
  _parse_tsi(cleaned, parsed)
286
  _parse_nacl(cleaned, parsed)
287
  _parse_haemolysis_type(cleaned, parsed)
288
+ _parse_ornithine_dec(cleaned, parsed)
289
 
290
+ # NOTE:
291
+ # A "Motility Negative" result found by another parser is deliberately NOT propagated to "Motility Type" here.
292
+ # parser_ext only sees the raw text, and parser_rules is intentionally left unchanged.
293
+ # "Motility Type" is therefore set to Negative/None only when the raw text explicitly says "Motility Type Negative/None".
294
+ # (No cross-parser inference here, to stay ML-safe.)
295
 
296
  return {
297
  "parsed_fields": parsed,