EphAsad committed on
Commit
e11bc4c
·
verified ·
1 Parent(s): 28b5c1c

Update engine/parser_ext.py

Browse files
Files changed (1) hide show
  1. engine/parser_ext.py +65 -115
engine/parser_ext.py CHANGED
@@ -1,47 +1,14 @@
1
  # engine/parser_ext.py
2
  # ======================================================================
3
- # Extended test parser — Stage 12C-fix1
4
- #
5
- # Fixes added after eval_parsers (0.882 accuracy):
6
- # ✔ Odor: parse value following odor/smell anchor
7
- # ✔ Ornithine Decarboxylase spelling corrected
8
- # ✔ Motility Type: parse value following anchor
9
- # ✔ NaCl Tolerant (>=6%) explicit rule
10
- # ✔ Haemolysis Type explicit override rule
11
- #
12
- # GOAL:
13
- # • Explicit-only parsing
14
- # • ML-safe
15
- # • Deterministic
16
- # • No inference
17
  # ======================================================================
18
 
19
  from __future__ import annotations
20
- import os, re, json
21
- from typing import Dict, Any, List
22
-
23
- EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
24
 
25
  UNKNOWN = "Unknown"
26
 
27
- # ======================================================================
28
- # Fields NOT parsed here
29
- # ======================================================================
30
- CORE_FIELDS = {
31
- "Genus","Species",
32
- "Gram Stain","Shape","Colony Morphology",
33
- "Haemolysis","Motility","Capsule","Spore Formation",
34
- "Growth Temperature","Oxygen Requirement","Media Grown On",
35
- "Catalase","Oxidase","Indole","Urease","Citrate","Methyl Red","VP",
36
- "H2S","DNase","ONPG","Coagulase","Lipase Test","Nitrate Reduction",
37
- "Lysine Decarboxylase","Arginine dihydrolase",
38
- "Gelatin Hydrolysis","Esculin Hydrolysis",
39
- "Glucose Fermentation","Lactose Fermentation","Sucrose Fermentation",
40
- "Mannitol Fermentation","Sorbitol Fermentation","Maltose Fermentation",
41
- "Xylose Fermentation","Rhamnose Fermentation","Arabinose Fermentation",
42
- "Raffinose Fermentation","Trehalose Fermentation","Inositol Fermentation",
43
- }
44
-
45
  # ======================================================================
46
  # Helpers
47
  # ======================================================================
@@ -58,16 +25,10 @@ def _set_if_stronger(parsed: Dict[str,str], field: str, value: str):
58
  if field not in parsed or parsed[field] == UNKNOWN:
59
  parsed[field] = value
60
 
61
- def _parse_pnv_after_anchor(
62
- text: str,
63
- parsed: Dict[str,str],
64
- field: str,
65
- anchor: str
66
- ):
67
  m = re.search(
68
- rf"\b{re.escape(anchor)}\b\s*(positive|negative|variable|unknown)",
69
- text,
70
- re.IGNORECASE,
71
  )
72
  if m:
73
  _set_if_stronger(parsed, field, m.group(1).capitalize())
@@ -78,118 +39,99 @@ def _parse_pnv_after_anchor(
78
 
79
  def _parse_gas_production(text: str, parsed: Dict[str,str]):
80
  t = text.lower()
81
-
82
- POS = [
83
  "produces gas","gas produced","with gas",
84
  "gas production positive","gas producer",
85
- "production of gas","ferments glucose with gas",
86
- ]
87
- NEG = [
88
- "does not produce gas","no gas",
89
- "absence of gas","gas production negative",
90
- ]
91
-
92
- if any(p in t for p in POS):
93
  _set_if_stronger(parsed,"Gas Production","Positive")
94
- elif any(n in t for n in NEG):
 
 
 
95
  _set_if_stronger(parsed,"Gas Production","Negative")
96
 
97
  # ======================================================================
98
- # 2. Motility Type (explicit)
99
  # ======================================================================
100
 
101
- MOTILITY_TYPES = [
102
  "Peritrichous","Monotrichous","Polytrichous","Polar",
103
- "Swarming","Tumbling","Gliding","Corkscrew","Axial",
104
- ]
105
 
106
  def _parse_motility_type(text: str, parsed: Dict[str,str]):
107
  t = text.lower()
108
 
109
- # Anchor-based: "Motility Type Swarming"
110
- m = re.search(
111
- r"\bmotility type\b\s*[:\-]?\s*([a-z]+)",
112
- t
113
- )
 
 
114
  if m:
115
  val = m.group(1).capitalize()
116
  if val in MOTILITY_TYPES:
117
  _set_if_stronger(parsed,"Motility Type",val)
118
  return
119
 
120
- # Free explicit words (only these)
121
  for mt in MOTILITY_TYPES:
122
  if re.search(rf"\b{mt.lower()}\b", t):
123
  _set_if_stronger(parsed,"Motility Type",mt)
124
  return
125
 
126
  # ======================================================================
127
- # 3. Pigment (explicit only)
128
  # ======================================================================
129
 
130
- PIGMENT_TERMS = [
131
- "pyocyanin","pyoverdine","pyovacin",
132
- "green","yellow","pink","red","orange",
133
- "brown","black","violet","cream",
134
- ]
135
-
136
  def _parse_pigment(text: str, parsed: Dict[str,str]):
137
  t = text.lower()
138
-
139
  if not re.search(r"\b(pigment|pigmentation)\b", t):
140
  return
141
 
142
- found = []
143
- for p in PIGMENT_TERMS:
144
- if re.search(rf"\b{p}\b", t):
145
- found.append(p.capitalize())
146
-
147
  if "no pigmentation" in t or "pigment none" in t:
148
  _set_if_stronger(parsed,"Pigment","None")
149
- elif found:
150
- _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(found))))
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # ======================================================================
153
  # 4. Colony Pattern (explicit only)
154
  # ======================================================================
155
 
156
- COLONY_PATTERNS = [
157
- "Mucoid","Smooth","Rough","Filamentous",
158
- "Spreading","Swarming","Sticky",
159
- "Ground-glass","Molar-tooth","Chalky","Corroding",
160
- ]
161
-
162
  def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
163
  t = text.lower()
164
-
165
  if not re.search(r"\bcolony pattern\b", t):
166
  return
167
 
168
- m = re.search(
169
- r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)",
170
- t
171
- )
172
  if m:
173
- val = m.group(1).capitalize()
174
- if val in COLONY_PATTERNS:
175
- _set_if_stronger(parsed,"Colony Pattern",val)
176
 
177
  # ======================================================================
178
  # 5. Odor (anchor-based)
179
  # ======================================================================
180
 
181
  def _parse_odor(text: str, parsed: Dict[str,str]):
182
- t = text.lower()
183
-
184
  m = re.search(
185
  r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
186
- t
187
  )
188
  if not m:
189
  return
190
-
191
- raw = m.group(2)
192
- vals = [v.strip().capitalize() for v in raw.split(";") if v.strip()]
193
  if vals:
194
  _set_if_stronger(parsed,"Odor","; ".join(vals))
195
 
@@ -199,7 +141,6 @@ def _parse_odor(text: str, parsed: Dict[str,str]):
199
 
200
  def _parse_tsi(text: str, parsed: Dict[str,str]):
201
  t = text.upper()
202
-
203
  if "TSI" in t and "UNKNOWN" in t:
204
  _set_if_stronger(parsed,"TSI Pattern","Unknown")
205
  return
@@ -207,35 +148,39 @@ def _parse_tsi(text: str, parsed: Dict[str,str]):
207
  m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
208
  if m:
209
  base = m.group(1)
210
- if m.group(2):
211
- _set_if_stronger(parsed,"TSI Pattern",f"{base}+H2S")
212
- else:
213
- _set_if_stronger(parsed,"TSI Pattern",base)
214
 
215
  # ======================================================================
216
  # 7. NaCl Tolerant (>=6%)
217
  # ======================================================================
218
 
219
  def _parse_nacl(text: str, parsed: Dict[str,str]):
220
- _parse_pnv_after_anchor(
221
- text, parsed,
222
- "NaCl Tolerant (>=6%)",
223
- "NaCl Tolerant (>=6%)"
224
- )
225
 
226
  # ======================================================================
227
- # 8. Haemolysis Type override
228
  # ======================================================================
229
 
230
  def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
231
  m = re.search(
232
- r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma)",
233
- text,
234
- re.IGNORECASE,
235
  )
236
  if m:
237
  _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
238
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # ======================================================================
240
  # MAIN
241
  # ======================================================================
@@ -256,6 +201,11 @@ def parse_text_extended(text: str) -> Dict[str,Any]:
256
  _parse_tsi(cleaned, parsed)
257
  _parse_nacl(cleaned, parsed)
258
  _parse_haemolysis_type(cleaned, parsed)
 
 
 
 
 
259
 
260
  return {
261
  "parsed_fields": parsed,
 
1
  # engine/parser_ext.py
2
  # ======================================================================
3
+ # Extended test parser — Stage 12C-fix2
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  # ======================================================================
5
 
6
  from __future__ import annotations
7
+ import os, re
8
+ from typing import Dict, Any
 
 
9
 
10
  UNKNOWN = "Unknown"
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # ======================================================================
13
  # Helpers
14
  # ======================================================================
 
25
  if field not in parsed or parsed[field] == UNKNOWN:
26
  parsed[field] = value
27
 
def _parse_pnv_after_anchor(text: str, parsed: Dict[str,str], field: str):
    """
    Record an explicit Positive / Negative / Variable / Unknown value that
    directly follows the field name in `text`.

    The lower-cased field name is used as the anchor, optionally followed
    by a ":" or "-" separator. The match is case-insensitive and the value
    is stored capitalized through `_set_if_stronger`.

    Args:
        text:   free text to scan.
        parsed: dict of already-parsed fields; updated in place.
        field:  schema field name, also used as the search anchor.
    """
    # `\b` cannot delimit the anchor: a name such as "NaCl Tolerant (>=6%)"
    # ends in ")" and "\b" between ")" and a following space is never a
    # word boundary, so the rule would silently never match. Explicit
    # "not adjacent to a word character" guards work for any field name.
    m = re.search(
        rf"(?<!\w){re.escape(field.lower())}(?!\w)\s*[:\-]?\s*"
        r"(positive|negative|variable|unknown)",
        text.lower(),
    )
    if m:
        _set_if_stronger(parsed, field, m.group(1).capitalize())
 
39
 
def _parse_gas_production(text: str, parsed: Dict[str,str]):
    """
    Set "Gas Production" from explicit gas phrases found in `text`.

    A positive phrase counts as positive only when it is not directly
    preceded by a negator ("no", "not", "not a", "non", "without").
    Otherwise "no gas produced" or "not a gas producer" would be read as
    Positive, because they contain "gas produced" / "gas producer".

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    t = text.lower()

    positive_phrases = (
        "produces gas","gas produced","with gas",
        "gas production positive","gas producer",
        "production of gas","ferments glucose with gas",
    )
    negative_phrases = (
        "does not produce gas","no gas",
        "absence of gas","gas production negative",
    )

    # Matches a negator that ends exactly where a positive phrase begins.
    negator = re.compile(r"\b(?:no|not(?:\s+an?)?|non|without)[\s\-]+$")

    affirmed = False
    negated = False
    for phrase in positive_phrases:
        for hit in re.finditer(re.escape(phrase), t):
            if negator.search(t[:hit.start()]):
                negated = True
            else:
                affirmed = True

    if affirmed:
        _set_if_stronger(parsed,"Gas Production","Positive")
    elif negated or any(phrase in t for phrase in negative_phrases):
        _set_if_stronger(parsed,"Gas Production","Negative")
53
 
54
  # ======================================================================
55
+ # 2. Motility Type (fixed)
56
  # ======================================================================
57
 
MOTILITY_TYPES = {
    "Peritrichous","Monotrichous","Polytrichous","Polar",
    "Swarming","Tumbling","Gliding","Corkscrew","Axial"
}

def _parse_motility_type(text: str, parsed: Dict[str,str]):
    """
    Set "Motility Type" from explicit wording in `text`.

    Rules are tried in order and the first one that fires wins:
      1. "motility type [:|-] negative|none"  -> "Negative"
      2. "motility type [:|-] <word>" where <word> is in MOTILITY_TYPES
      3. any MOTILITY_TYPES word appearing on its own in the text

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    t = text.lower()

    # Explicit negative. The optional separator mirrors the anchor rule
    # below, so "Motility Type: Negative" is recognised as well.
    if re.search(r"\bmotility type\b\s*[:\-]?\s*(negative|none)", t):
        _set_if_stronger(parsed,"Motility Type","Negative")
        return

    # Anchor-based
    m = re.search(r"\bmotility type\b\s*[:\-]?\s*([a-z]+)", t)
    if m:
        val = m.group(1).capitalize()
        if val in MOTILITY_TYPES:
            _set_if_stronger(parsed,"Motility Type",val)
            return

    # Free word. MOTILITY_TYPES is a set, whose iteration order for
    # strings can differ between interpreter runs; iterate in sorted
    # order so the result is deterministic when several types appear.
    for mt in sorted(MOTILITY_TYPES):
        if re.search(rf"\b{mt.lower()}\b", t):
            _set_if_stronger(parsed,"Motility Type",mt)
            return
84
 
85
  # ======================================================================
86
+ # 3. Pigment (unchanged)
87
  # ======================================================================
88
 
 
 
 
 
 
 
def _parse_pigment(text: str, parsed: Dict[str,str]):
    """
    Set "Pigment" when the text explicitly talks about pigment.

    Nothing is parsed unless the word "pigment" or "pigmentation" occurs.
    "no pigmentation" / "pigment none" yields "None"; otherwise every known
    pigment or colour word found in the text is collected, de-duplicated,
    sorted and joined with "; ".

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    lowered = text.lower()

    # Guard: only act on text that mentions pigment at all.
    if re.search(r"\b(pigment|pigmentation)\b", lowered) is None:
        return

    # Explicit absence wins over any colour word.
    if any(marker in lowered for marker in ("no pigmentation", "pigment none")):
        _set_if_stronger(parsed,"Pigment","None")
        return

    known_terms = (
        "pyocyanin","pyoverdine","pyovacin",
        "green","yellow","pink","red","orange",
        "brown","black","violet","cream",
    )
    hits = {
        term.capitalize()
        for term in known_terms
        if re.search(rf"\b{term}\b", lowered)
    }

    if hits:
        _set_if_stronger(parsed,"Pigment","; ".join(sorted(hits)))
109
 
110
  # ======================================================================
111
  # 4. Colony Pattern (explicit only)
112
  # ======================================================================
113
 
 
 
 
 
 
 
def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
    """
    Set "Colony Pattern" from the token that follows the "colony pattern"
    anchor (an optional ":" or "-" separator is allowed).

    The captured token (letters and hyphens) is stored capitalized; it is
    not checked against a list of known patterns.

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    lowered = text.lower()

    # Guard: the anchor must be present at all.
    if re.search(r"\bcolony pattern\b", lowered) is None:
        return

    hit = re.search(r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)", lowered)
    if hit is not None:
        _set_if_stronger(parsed,"Colony Pattern",hit.group(1).capitalize())
 
 
122
 
123
  # ======================================================================
124
  # 5. Odor (anchor-based)
125
  # ======================================================================
126
 
def _parse_odor(text: str, parsed: Dict[str,str]):
    """
    Set "Odor" from the words that follow an odor / odour / smell anchor.

    The captured run (letters, spaces and ";") is split on ";"; each
    non-empty piece is stripped and capitalized, and the pieces are joined
    back with "; ".

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    hit = re.search(
        r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
        text.lower()
    )
    if hit is None:
        return

    vals = []
    for piece in hit.group(2).split(";"):
        piece = piece.strip()
        if piece:
            vals.append(piece.capitalize())

    if vals:
        _set_if_stronger(parsed,"Odor","; ".join(vals))
137
 
 
141
 
def _parse_tsi(text: str, parsed: Dict[str,str]):
    """
    Set "TSI Pattern" from an explicit slant/butt code such as "K/A",
    optionally followed by "+ H2S" (stored as e.g. "K/A+H2S").

    The explicit pattern is looked for first. "Unknown" is recorded only
    when no pattern is present and the text mentions both "TSI" and
    "UNKNOWN". Checking for "UNKNOWN" first would let an unrelated
    "Unknown" elsewhere in the text hide a real TSI pattern.

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    t = text.upper()

    m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
    if m:
        base = m.group(1)
        _set_if_stronger(parsed,"TSI Pattern", base + ("+H2S" if m.group(2) else ""))
        return

    # Fallback: TSI is mentioned but no readable pattern was given.
    if "TSI" in t and "UNKNOWN" in t:
        _set_if_stronger(parsed,"TSI Pattern","Unknown")
 
 
 
152
 
153
  # ======================================================================
154
  # 7. NaCl Tolerant (>=6%)
155
  # ======================================================================
156
 
def _parse_nacl(text: str, parsed: Dict[str,str]):
    """
    Parse the "NaCl Tolerant (>=6%)" field by delegating to
    `_parse_pnv_after_anchor`, which uses the field name as the anchor.
    """
    nacl_field = "NaCl Tolerant (>=6%)"
    _parse_pnv_after_anchor(text, parsed, nacl_field)
 
 
 
 
159
 
160
  # ======================================================================
161
+ # 8. Haemolysis Type (fixed)
162
  # ======================================================================
163
 
def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
    """
    Set "Haemolysis Type" from an explicit "haemolysis type <value>"
    statement, where <value> is alpha, beta, gamma or none.

    Both the "haemolysis" and "hemolysis" spellings of the anchor are
    accepted; an optional ":" or "-" separator may follow it. The value
    is stored capitalized.

    Args:
        text:   free text to scan (matched case-insensitively).
        parsed: dict of already-parsed fields; updated in place through
                `_set_if_stronger`.
    """
    m = re.search(
        r"\bha?emolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)",
        text.lower()
    )
    if m:
        _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
171
 
172
+ # ======================================================================
173
+ # 9. Ornithine / Ornitihine Decarboxylase alias sync
174
+ # ======================================================================
175
+
def _sync_ornithine(parsed: Dict[str,str]):
    """
    Copy a value stored under the misspelled key
    "Ornitihine Decarboxylase" to the correctly spelled
    "Ornithine Decarboxylase" key.

    The copy goes through `_set_if_stronger`. The misspelled key itself is
    left in place, and nothing happens when it is absent.
    """
    misspelled_key = "Ornitihine Decarboxylase"
    if misspelled_key not in parsed:
        return
    _set_if_stronger(parsed, "Ornithine Decarboxylase", parsed[misspelled_key])
183
+
184
  # ======================================================================
185
  # MAIN
186
  # ======================================================================
 
201
  _parse_tsi(cleaned, parsed)
202
  _parse_nacl(cleaned, parsed)
203
  _parse_haemolysis_type(cleaned, parsed)
204
+ _sync_ornithine(parsed)
205
+
206
+ # If Motility is explicitly negative anywhere → Motility Type negative
207
+ if parsed.get("Motility") == "Negative":
208
+ _set_if_stronger(parsed,"Motility Type","Negative")
209
 
210
  return {
211
  "parsed_fields": parsed,