EphAsad committed on
Commit
b6dd794
·
verified ·
1 Parent(s): db752eb

Update engine/parser_ext.py

Browse files
Files changed (1) hide show
  1. engine/parser_ext.py +324 -111
engine/parser_ext.py CHANGED
@@ -1,19 +1,28 @@
1
  # engine/parser_ext.py
2
  # ------------------------------------------------------------
3
- # Extended test parser (Stage 11C)
4
  #
5
- # - Focuses on *extended* tests (disc tests, rare biochemicals, etc.)
6
- # - Uses extended_schema.json dynamically
7
- # - Ignores core DB fields (those are handled by parser_rules)
8
- # - Adds robust patterns for:
9
- # CAMP, PYR, Optochin, Bacitracin, Novobiocin
 
 
 
 
 
 
 
 
 
10
  #
11
- # Returns:
12
- # {
13
- # "parsed_fields": { ... },
14
- # "source": "extended_parser",
15
- # "raw": original_text
16
- # }
17
  # ------------------------------------------------------------
18
 
19
  from __future__ import annotations
@@ -23,6 +32,10 @@ import os
23
  import re
24
  from typing import Dict, Any, List
25
 
 
 
 
 
26
  EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
27
  ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
28
 
@@ -77,9 +90,146 @@ CORE_FIELDS = {
77
  "Inositol Fermentation",
78
  }
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  # ------------------------------------------------------------
82
- # Helpers: load extended schema & alias maps
83
  # ------------------------------------------------------------
84
 
85
  def _load_extended_schema(path: str = EXTENDED_SCHEMA_PATH) -> Dict[str, Any]:
@@ -97,6 +247,7 @@ def _load_alias_map(path: str = ALIAS_MAP_PATH) -> Dict[str, str]:
97
  """
98
  alias_maps.json is assumed to be a simple dict like:
99
  { "Field:raw_value": "canonical_value", ... }
 
100
  We keep this optional and conservative.
101
  """
102
  if not os.path.exists(path):
@@ -126,32 +277,60 @@ def _apply_field_value_alias(field: str, value: str, alias_map: Dict[str, str])
126
 
127
 
128
  # ------------------------------------------------------------
129
- # Value normalisation helpers
130
  # ------------------------------------------------------------
131
 
132
  def _bool_from_tokens(tokens: List[str]) -> str:
133
  """
134
- Map "positive/sensitive/susceptible" vs "negative/resistant"
135
- into Positive / Negative where appropriate.
 
 
 
 
136
  """
137
  t = " ".join(tokens).lower()
138
 
139
  # Strong negative signals
140
  neg_tokens = [
141
- "negative", "no", "not", "resistant", "no zone",
142
- "no growth", "fails to", "does not"
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  ]
144
  if any(nt in t for nt in neg_tokens):
145
  return "Negative"
146
 
147
  # Strong positive signals
148
  pos_tokens = [
149
- "positive", "pos", "sensitive", "susceptible",
150
- "clear zone", "zone of inhibition"
 
 
 
 
 
 
151
  ]
152
  if any(pt in t for pt in pos_tokens):
153
  return "Positive"
154
 
 
 
 
 
 
155
  return UNKNOWN
156
 
157
 
@@ -160,6 +339,7 @@ def _disc_result_from_phrase(phrase: str) -> str:
160
  For disc tests like Novobiocin / Optochin / Bacitracin, interpret:
161
  - 'sensitive', 'susceptible' as Positive
162
  - 'resistant', 'no zone' as Negative
 
163
  - default -> Unknown
164
  """
165
  ph = phrase.lower()
@@ -170,7 +350,6 @@ def _disc_result_from_phrase(phrase: str) -> str:
170
  if any(w in ph for w in ["sensitive", "susceptible", "zone of inhibition", "clear zone"]):
171
  return "Positive"
172
 
173
- # If explicit 'positive'/'negative' appears, handle that
174
  if "positive" in ph:
175
  return "Positive"
176
  if "negative" in ph:
@@ -180,117 +359,150 @@ def _disc_result_from_phrase(phrase: str) -> str:
180
 
181
 
182
  # ------------------------------------------------------------
183
- # Core pattern logic for extended tests
184
  # ------------------------------------------------------------
185
 
186
  def _parse_disc_tests(text: str, parsed: Dict[str, str]) -> None:
187
  """
188
- Handle disc tests:
189
  - Optochin
190
  - Bacitracin
191
  - Novobiocin
192
- with phrasing like 'optochin sensitive', 'bacitracin resistant', etc.
 
 
 
193
  """
194
  lower = text.lower()
195
 
196
- disc_fields = ["Optochin", "Bacitracin", "Novobiocin"]
197
-
198
- for test_name in disc_fields:
199
  key = test_name.lower()
200
- # Find segments surrounding the keyword
201
- for m in re.finditer(rf"\b{re.escape(key)}\b[^\.,;]*", lower):
 
202
  segment = lower[m.start():m.end()]
203
  val = _disc_result_from_phrase(segment)
204
  if val != UNKNOWN:
205
- parsed[test_name] = val
206
 
207
- # Also handle "<test> test positive/negative"
208
- for m in re.finditer(rf"\b{re.escape(key)}\s+test[^\.,;]*", lower):
209
  segment = lower[m.start():m.end()]
210
  val = _disc_result_from_phrase(segment)
211
  if val != UNKNOWN:
212
- parsed[test_name] = val
213
 
214
 
215
- def _parse_simple_PNV_test(
216
- text: str,
217
- test_name: str,
218
- parsed: Dict[str, str],
219
- extra_keywords: List[str] | None = None,
220
- ) -> None:
221
  """
222
- Generic P/N/V parser for named tests (e.g. CAMP, PYR, Hippurate).
223
- Looks for patterns like:
224
- 'CAMP positive', 'PYR test negative'
225
- and maps to Positive / Negative / Variable.
 
 
 
226
  """
227
- if extra_keywords is None:
228
- extra_keywords = []
229
 
230
- label = test_name.lower()
231
- lower = text.lower()
 
 
 
 
 
 
232
 
233
- # Basic patterns: "<name> positive/negative/variable"
234
- pat_direct = rf"\b{re.escape(label)}\b[^\.,;]*"
235
- for m in re.finditer(pat_direct, lower):
236
- segment = lower[m.start():m.end()]
237
- val = _bool_from_tokens(segment.split())
238
- if val != UNKNOWN:
239
- parsed[test_name] = val
240
-
241
- # Patterns like "<name> test positive/negative"
242
- pat_test = rf"\b{re.escape(label)}\s+test[^\.,;]*"
243
- for m in re.finditer(pat_test, lower):
244
- segment = lower[m.start():m.end()]
245
- val = _bool_from_tokens(segment.split())
246
- if val != UNKNOWN:
247
- parsed[test_name] = val
248
-
249
- # Extra synonyms if any (e.g. "CAMP reaction", "PYR activity")
250
- for kw in extra_keywords:
251
- k = kw.lower()
252
- pat_kw = rf"\b{re.escape(k)}\b[^\.,;]*"
253
- for m in re.finditer(pat_kw, lower):
254
- segment = lower[m.start():m.end()]
255
- val = _bool_from_tokens(segment.split())
256
- if val != UNKNOWN:
257
- parsed[test_name] = val
258
 
 
 
 
 
 
 
 
 
259
 
260
- def _parse_extended_from_schema(
 
 
 
 
 
261
  text: str,
262
  ext_schema: Dict[str, Any],
263
  alias_map: Dict[str, str],
264
  parsed: Dict[str, str],
265
  ) -> None:
266
  """
267
- Generic extended parser driven by extended_schema.json.
268
 
269
- For each field where value_type == "enum_PNV" and not in CORE_FIELDS:
270
- - looks for '<field> positive/negative/variable' style patterns.
271
- - applies alias map for (field, value).
 
 
272
  """
273
  lower = text.lower()
274
-
275
- for field_name, meta in ext_schema.items():
276
- if not isinstance(meta, dict):
277
- continue
278
- if meta.get("value_type") != "enum_PNV":
279
- continue
280
- if field_name in CORE_FIELDS:
281
- # We never treat core DB tests as "extended"
282
- continue
283
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  label = field_name.lower()
285
- # Very simple pattern: "<field_name> positive/negative/variable"
286
- pat = rf"\b{re.escape(label)}\b[^\.,;]*"
287
- for m in re.finditer(pat, lower):
288
- segment = lower[m.start():m.end()]
289
- val = _bool_from_tokens(segment.split())
290
- if val == UNKNOWN:
291
- continue
292
- val = _apply_field_value_alias(field_name, val, alias_map)
293
- parsed[field_name] = val
294
 
295
 
296
  # ------------------------------------------------------------
@@ -301,10 +513,13 @@ def parse_text_extended(text: str) -> Dict[str, Any]:
301
  """
302
  Parse extended-only tests from the description.
303
 
304
- This is intentionally conservative:
305
- - Only sets a field if reasonably confident from text
306
- - Never overwrites core parser behaviour directly
307
- - Plays nicely with alias maps and extended_schema
 
 
 
308
 
309
  Returns:
310
  {
@@ -313,31 +528,29 @@ def parse_text_extended(text: str) -> Dict[str, Any]:
313
  "raw": original_text
314
  }
315
  """
316
- if not text:
 
317
  return {
318
  "parsed_fields": {},
319
  "source": "extended_parser",
320
- "raw": text or "",
321
  }
322
 
 
 
323
  ext_schema = _load_extended_schema(EXTENDED_SCHEMA_PATH)
324
  alias_map = _load_alias_map(ALIAS_MAP_PATH)
325
 
326
  parsed: Dict[str, str] = {}
327
 
328
  # 1) Disc tests (Novobiocin / Optochin / Bacitracin) with rich language
329
- _parse_disc_tests(text, parsed)
330
-
331
- # 2) CAMP & PYR & Hippurate (if present in schema/gold tests)
332
- _parse_simple_PNV_test(text, "CAMP", parsed, extra_keywords=["CAMP reaction"])
333
- _parse_simple_PNV_test(text, "PYR", parsed, extra_keywords=["PYR activity"])
334
- _parse_simple_PNV_test(text, "Hippurate Hydrolysis", parsed, extra_keywords=["hippurate"])
335
 
336
- # 3) Any other enum_PNV extended tests from extended_schema.json
337
- _parse_extended_from_schema(text, ext_schema, alias_map, parsed)
338
 
339
  return {
340
  "parsed_fields": parsed,
341
  "source": "extended_parser",
342
- "raw": text,
343
  }
 
1
  # engine/parser_ext.py
2
  # ------------------------------------------------------------
3
+ # Extended test parser Stage 11G (Option A: high-recall, SOTA-style)
4
  #
5
+ # Goals:
6
+ # - Focus ONLY on *extended* tests (NOT in the core DB schema).
7
+ # - Drive behaviour entirely from extended_schema.json
8
+ # value_type == "enum_PNV"
9
+ # honour per-field aliases
10
+ # • additional hard-coded microbiology aliases where useful
11
+ # - Aggressive, high-recall extraction:
12
+ # • "<test> positive/negative/variable"
13
+ # • "<test> test positive/negative"
14
+ # • disc tests: "novobiocin sensitive", "bacitracin resistant", etc.
15
+ # • "non-<test>" / "no <test>" / "non acid-fast" → Negative
16
+ # • "sensitive/susceptible" vs "resistant" → mapped to P/N
17
+ # - Never touch core DB fields (delegate to parser_rules).
18
+ # - Respect alias_maps.json for field/value canonicalisation.
19
  #
20
+ # Output:
21
+ # {
22
+ # "parsed_fields": { field: value, ... },
23
+ # "source": "extended_parser",
24
+ # "raw": original_text
25
+ # }
26
  # ------------------------------------------------------------
27
 
28
  from __future__ import annotations
 
32
  import re
33
  from typing import Dict, Any, List
34
 
35
+ # ------------------------------------------------------------
36
+ # Paths & constants
37
+ # ------------------------------------------------------------
38
+
39
  EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
40
  ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
41
 
 
90
  "Inositol Fermentation",
91
  }
92
 
93
+ # Extra microbiology-driven aliases that are too domain-specific to live
94
+ # directly in extended_schema.json, but very useful for parsing.
95
+ EXTRA_FIELD_ALIASES: Dict[str, List[str]] = {
96
+ "Hippurate Hydrolysis": [
97
+ "hippurate",
98
+ "hippurate test",
99
+ ],
100
+ "CAMP": [
101
+ "camp test",
102
+ "camp reaction",
103
+ ],
104
+ "PYR": [
105
+ "pyr test",
106
+ "pyr reaction",
107
+ "pyr activity",
108
+ ],
109
+ "Acid Fast": [
110
+ "acid-fast",
111
+ "acid fast",
112
+ "acid-fast stain",
113
+ "acid fast stain",
114
+ ],
115
+ "Gas Production": [
116
+ "gas production",
117
+ "gas-producing",
118
+ "gas producing",
119
+ "gas producer",
120
+ ],
121
+ "Bile Solubility": [
122
+ "bile solubility",
123
+ "bile soluble",
124
+ ],
125
+ "Bile Resistance": [
126
+ "bile resistance",
127
+ "bile resistant",
128
+ ],
129
+ "Glucose Oxidation": [
130
+ "oxidation of glucose",
131
+ "glucose oxidation",
132
+ ],
133
+ "Mannose Fermentation": [
134
+ "mannose positive",
135
+ "mannose negative",
136
+ "ferments mannose",
137
+ "does not ferment mannose",
138
+ ],
139
+ "Fructose Fermentation": [
140
+ "fructose positive",
141
+ "fructose negative",
142
+ "ferments fructose",
143
+ "does not ferment fructose",
144
+ ],
145
+ "Inulin Fermentation": [
146
+ "inulin positive",
147
+ "inulin negative",
148
+ "ferments inulin",
149
+ "does not ferment inulin",
150
+ ],
151
+ "Glycerol Fermentation": [
152
+ "glycerol positive",
153
+ "glycerol negative",
154
+ "ferments glycerol",
155
+ "does not ferment glycerol",
156
+ ],
157
+ "Cellobiose Fermentation": [
158
+ "cellobiose positive",
159
+ "cellobiose negative",
160
+ "ferments cellobiose",
161
+ "does not ferment cellobiose",
162
+ ],
163
+ "Casein Hydrolysis": [
164
+ "caseinase",
165
+ "casein hydrolysis",
166
+ ],
167
+ "Tyrosine Hydrolysis": [
168
+ "tyrosine hydrolysis",
169
+ ],
170
+ "Iron Oxidation": [
171
+ "iron oxidation",
172
+ ],
173
+ "Sulfur Utilization": [
174
+ "sulfur utilization",
175
+ "sulphur utilization",
176
+ "sulfur utilisation",
177
+ "sulphur utilisation",
178
+ ],
179
+ "Antibiotic Resistance": [
180
+ "antibiotic resistance",
181
+ "antibiotic-resistant",
182
+ "antibiotic resistant",
183
+ ],
184
+ }
185
+
186
+ # Disc tests we treat with special "sensitive / resistant" logic
187
+ DISC_TEST_FIELDS = {"Optochin", "Bacitracin", "Novobiocin"}
188
+
189
+
190
+ # ------------------------------------------------------------
191
+ # Basic helpers
192
+ # ------------------------------------------------------------
193
+
194
+ def _clean_text(text: str) -> str:
195
+ """
196
+ Normalise a few unicode oddities and collapse whitespace.
197
+ We keep case sensitive content for matching, but most
198
+ logic will run on .lower() views.
199
+
200
+ Also:
201
+ - strip degree symbols (not vital for extended tests, but harmless)
202
+ - normalise subscript ₂ → 2 if ever encountered
203
+ """
204
+ if not text:
205
+ return ""
206
+ s = text.replace("°", "").replace("º", "")
207
+ s = s.replace("₂", "2")
208
+ # collapse whitespace
209
+ return " ".join(s.split())
210
+
211
+
212
+ def _norm(s: str) -> str:
213
+ return s.strip().lower()
214
+
215
+
216
+ def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
217
+ """
218
+ Write value to parsed[field] if:
219
+ - field not present, or
220
+ - we are replacing Unknown with a concrete value.
221
+
222
+ Extended parser only ever writes extended tests, so this is mostly
223
+ about avoiding weaker overwrites (e.g. UNKNOWN over Positive).
224
+ """
225
+ if not value:
226
+ return
227
+ if field not in parsed or parsed[field] == UNKNOWN:
228
+ parsed[field] = value
229
+
230
 
231
  # ------------------------------------------------------------
232
+ # Loading schema & alias maps
233
  # ------------------------------------------------------------
234
 
235
  def _load_extended_schema(path: str = EXTENDED_SCHEMA_PATH) -> Dict[str, Any]:
 
247
  """
248
  alias_maps.json is assumed to be a simple dict like:
249
  { "Field:raw_value": "canonical_value", ... }
250
+
251
  We keep this optional and conservative.
252
  """
253
  if not os.path.exists(path):
 
277
 
278
 
279
  # ------------------------------------------------------------
280
+ # Value P/N/V logic
281
  # ------------------------------------------------------------
282
 
283
  def _bool_from_tokens(tokens: List[str]) -> str:
284
  """
285
+ Map text tokens into Positive / Negative / Variable.
286
+
287
+ This is intentionally high-recall and slightly aggressive.
288
+ We consider a phrase "negative" if any strong negative token appears,
289
+ and "positive" if any strong positive token appears, with a bias:
290
+ - negative has priority when both appear.
291
  """
292
  t = " ".join(tokens).lower()
293
 
294
  # Strong negative signals
295
  neg_tokens = [
296
+ "negative",
297
+ "neg",
298
+ "-",
299
+ "no",
300
+ "not",
301
+ "non",
302
+ "resistant",
303
+ "no zone",
304
+ "no growth",
305
+ "fails to",
306
+ "does not",
307
+ "doesn't",
308
+ "without",
309
+ "lacks",
310
+ "absent",
311
  ]
312
  if any(nt in t for nt in neg_tokens):
313
  return "Negative"
314
 
315
  # Strong positive signals
316
  pos_tokens = [
317
+ "positive",
318
+ "pos",
319
+ "+",
320
+ "sensitive",
321
+ "susceptible",
322
+ "clear zone",
323
+ "zone of inhibition",
324
+ "with growth",
325
  ]
326
  if any(pt in t for pt in pos_tokens):
327
  return "Positive"
328
 
329
+ # Variable
330
+ var_tokens = ["variable", "var", "v"]
331
+ if any(vt in t for vt in var_tokens):
332
+ return "Variable"
333
+
334
  return UNKNOWN
335
 
336
 
 
339
  For disc tests like Novobiocin / Optochin / Bacitracin, interpret:
340
  - 'sensitive', 'susceptible' as Positive
341
  - 'resistant', 'no zone' as Negative
342
+ - explicit 'positive/negative' if present
343
  - default -> Unknown
344
  """
345
  ph = phrase.lower()
 
350
  if any(w in ph for w in ["sensitive", "susceptible", "zone of inhibition", "clear zone"]):
351
  return "Positive"
352
 
 
353
  if "positive" in ph:
354
  return "Positive"
355
  if "negative" in ph:
 
359
 
360
 
361
  # ------------------------------------------------------------
362
+ # Disc test parsing
363
  # ------------------------------------------------------------
364
 
365
def _parse_disc_tests(text: str, parsed: Dict[str, str]) -> None:
    """
    Parse disc-diffusion tests (Optochin / Bacitracin / Novobiocin).

    Scans for segments starting at the test name and running to the next
    phrase boundary (".", ";", "," or newline) — e.g. "optochin sensitive",
    "bacitracin resistant", "novobiocin test positive" — and interprets
    each segment via _disc_result_from_phrase.

    Note: a single pattern per test suffices.  "<name> test positive" is
    already captured by the "<name> ... boundary" pattern, so the previous
    second pass over "<name>\\s+test..." was redundant and has been removed
    (no behaviour change: it could only re-derive the same verdicts).

    Results are written with _set_if_stronger, so a concrete value is
    never overwritten by a later, weaker match.
    """
    lower = text.lower()

    for test_name in DISC_TEST_FIELDS:
        key = re.escape(test_name.lower())
        # Segment: the test name up to the next sentence/clause boundary.
        for match in re.finditer(rf"\b{key}\b[^\.;,\n]*", lower):
            verdict = _disc_result_from_phrase(match.group(0))
            if verdict != UNKNOWN:
                _set_if_stronger(parsed, test_name, verdict)
394
 
395
 
396
+ # ------------------------------------------------------------
397
+ # Schema-driven extended parsing
398
+ # ------------------------------------------------------------
399
+
400
+ def _build_field_keywords(ext_schema: Dict[str, Any]) -> Dict[str, List[str]]:
 
401
  """
402
+ Build a mapping:
403
+ field_name -> list of keywords/synonyms (lowercased)
404
+
405
+ Sources:
406
+ - field name itself
407
+ - 'aliases' array in extended_schema.json
408
+ - EXTRA_FIELD_ALIASES hard-coded here
409
  """
410
+ field_kw: Dict[str, List[str]] = {}
 
411
 
412
+ for field_name, meta in ext_schema.items():
413
+ if not isinstance(meta, dict):
414
+ continue
415
+ if meta.get("value_type") != "enum_PNV":
416
+ continue
417
+ if field_name in CORE_FIELDS:
418
+ # Extended parser never touches core DB fields.
419
+ continue
420
 
421
+ kws: List[str] = []
422
+
423
+ # Canonical field label
424
+ kws.append(field_name)
425
+
426
+ # schema-defined aliases
427
+ aliases = meta.get("aliases", [])
428
+ if isinstance(aliases, list):
429
+ for a in aliases:
430
+ if isinstance(a, str) and a.strip():
431
+ kws.append(a)
432
+
433
+ # extra hard-coded aliases
434
+ extra = EXTRA_FIELD_ALIASES.get(field_name, [])
435
+ for a in extra:
436
+ if isinstance(a, str) and a.strip():
437
+ kws.append(a)
 
 
 
 
 
 
 
 
438
 
439
+ # de-duplicate & normalise spacing
440
+ normed: List[str] = []
441
+ seen = set()
442
+ for k in kws:
443
+ kk = " ".join(k.strip().split())
444
+ if kk and kk.lower() not in seen:
445
+ seen.add(kk.lower())
446
+ normed.append(kk)
447
 
448
+ field_kw[field_name] = normed
449
+
450
+ return field_kw
451
+
452
+
453
def _parse_schema_enum_pnv(
    text: str,
    ext_schema: Dict[str, Any],
    alias_map: Dict[str, str],
    parsed: Dict[str, str],
) -> None:
    """
    High-recall, schema-driven parsing of enum_PNV extended tests.

    For every extended field (never CORE_FIELDS) and each of its keywords
    (from _build_field_keywords):

      - scan "<kw> ... <phrase boundary>" segments and classify them with
        _bool_from_tokens.  "<kw> test positive/negative" is captured by
        this same pattern, so the previous second pass over
        "<kw>\\s+test..." was redundant and has been removed (it could
        only re-derive identical verdicts);
      - additionally treat "non-<field>" / "non <field>" / "no <field>"
        as an explicit Negative (useful for e.g. "non acid-fast").

    Values pass through the (field, value) alias map before being stored
    via _set_if_stronger, so concrete results are never downgraded.
    """
    lower = text.lower()
    field_keywords = _build_field_keywords(ext_schema)

    for field_name, keywords in field_keywords.items():
        for kw in keywords:
            key = re.escape(kw.lower())
            # "<kw> ..." up to the next ".", ";", "," or newline.
            for m in re.finditer(rf"\b{key}\b[^\.;,\n]*", lower):
                verdict = _bool_from_tokens(m.group(0).split())
                if verdict != UNKNOWN:
                    verdict = _apply_field_value_alias(
                        field_name, verdict, alias_map
                    )
                    _set_if_stronger(parsed, field_name, verdict)

        # Aggressive negatives on the canonical field label:
        # "non-<field>", "non <field>", "no <field>"
        # (e.g. "non acid-fast", "no CAMP reaction").
        # NOTE(review): reconstructed as a per-field (not per-keyword)
        # check, matching the field-level `label` in the original.
        label = re.escape(field_name.lower())
        if re.search(rf"\bnon[- ]{label}\b", lower) or re.search(
            rf"\bno\s+{label}\b", lower
        ):
            neg = _apply_field_value_alias(field_name, "Negative", alias_map)
            _set_if_stronger(parsed, field_name, neg)
 
 
 
 
506
 
507
 
508
  # ------------------------------------------------------------
 
513
  """
514
  Parse extended-only tests from the description.
515
 
516
+ Behaviour:
517
+ - If text empty return empty parsed_fields.
518
+ - Loads extended_schema.json + alias_maps.json
519
+ - Runs:
520
+ 1) disc test parsing (Optochin / Bacitracin / Novobiocin)
521
+ 2) generic schema-driven enum_PNV parsing for all ext fields
522
+ - Never touches core DB fields.
523
 
524
  Returns:
525
  {
 
528
  "raw": original_text
529
  }
530
  """
531
+ original = text or ""
532
+ if not original.strip():
533
  return {
534
  "parsed_fields": {},
535
  "source": "extended_parser",
536
+ "raw": original,
537
  }
538
 
539
+ text_clean = _clean_text(original)
540
+
541
  ext_schema = _load_extended_schema(EXTENDED_SCHEMA_PATH)
542
  alias_map = _load_alias_map(ALIAS_MAP_PATH)
543
 
544
  parsed: Dict[str, str] = {}
545
 
546
  # 1) Disc tests (Novobiocin / Optochin / Bacitracin) with rich language
547
+ _parse_disc_tests(text_clean, parsed)
 
 
 
 
 
548
 
549
+ # 2) All other enum_PNV extended tests from extended_schema.json
550
+ _parse_schema_enum_pnv(text_clean, ext_schema, alias_map, parsed)
551
 
552
  return {
553
  "parsed_fields": parsed,
554
  "source": "extended_parser",
555
+ "raw": original,
556
  }