EphAsad committed on
Commit
bb75255
·
verified ·
1 Parent(s): 37d9a5e

Update engine/parser_rules.py

Browse files
Files changed (1) hide show
  1. engine/parser_rules.py +96 -212
engine/parser_rules.py CHANGED
@@ -2,31 +2,31 @@
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
- # Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J:
6
  #
7
  # - Always store Growth Temperature as "low//high"
8
  # • single: 37 → "37//37"
9
- # • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
10
- # - DNase robust parsing (DNase / DNase test, DNase activity, etc.)
11
- # - Non–spore-forming → Spore Formation = Negative (regex + early return)
12
  # - "non-H2S producing" → H2S = Negative
13
- # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
14
  # - NaCl tolerance phrases improved
15
- # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
16
  #
17
- # New in this version:
18
- # • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
19
- # • "<sugar> fermenter" → <Sugar> Fermentation = Positive
20
- # • "<sugar> is positive/negative" handled
21
- # • "<sugar> fermentation is positive/negative" handled
22
- # • Grouped "does not ferment lactose and sucrose" handled cleanly
23
- # (does NOT accidentally mark glucose negative when it appears after "but")
24
- # • Global non-fermenter + explicit positive sugar:
25
- # "Non-fermenter, ferments glucose weakly"
26
- # → all sugars Negative *except* Glucose = Positive
27
- # • Core tests accept "is positive/is negative/is variable"
28
- #"H2S production is positive/negative" handled
29
- # ONPG phrases like "ONPG is negative" parsed via core patterns
30
  # ------------------------------------------------------------
31
 
32
  from __future__ import annotations
@@ -34,14 +34,12 @@ from __future__ import annotations
34
  import re
35
  from typing import Dict, Any, List
36
 
37
-
38
  UNKNOWN = "Unknown"
39
 
40
  # ------------------------------------------------------------
41
  # Core fields and sugar mapping
42
  # ------------------------------------------------------------
43
 
44
- # Sugar name → core DB column
45
  SUGAR_FIELDS: Dict[str, str] = {
46
  "glucose": "Glucose Fermentation",
47
  "lactose": "Lactose Fermentation",
@@ -58,25 +56,17 @@ SUGAR_FIELDS: Dict[str, str] = {
58
  }
59
 
60
  CORE_BOOL_FIELDS: Dict[str, List[str]] = {
61
- # field: [keywords to recognise the test name]
62
  "Catalase": ["catalase"],
63
  "Oxidase": ["oxidase"],
64
  "Indole": ["indole"],
65
  "Urease": ["urease"],
66
  "Citrate": ["citrate"],
67
- # MR: include "mr"
68
  "Methyl Red": ["methyl red", "mr test", "mr"],
69
  "VP": ["voges-proskauer", "vp test", "vp"],
70
- # H2S (includes H₂S → normalised to H2S in _clean_text)
71
  "H2S": ["h2s", "hydrogen sulfide"],
72
- # DNase: broaden patterns
73
  "DNase": [
74
- "dnase",
75
- "dnase test",
76
- "dnase activity",
77
- "dnase production",
78
- "dnaase",
79
- "dna hydrolysis",
80
  ],
81
  "ONPG": ["onpg"],
82
  "Coagulase": ["coagulase"],
@@ -87,7 +77,6 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
87
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
88
  "Arginine dihydrolase": ["arginine dihydrolase"],
89
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
90
- # Esculin Hydrolysis: also match plain "esculin"
91
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
92
  }
93
 
@@ -96,41 +85,22 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
96
  # ------------------------------------------------------------
97
 
98
  def _clean_text(text: str) -> str:
99
- """
100
- Normalise unicode oddities and collapse whitespace.
101
- Also:
102
- - strip degree symbols
103
- - normalise subscript ₂ → 2 for H₂S
104
- """
105
  if not text:
106
  return ""
107
  s = text.replace("°", "").replace("º", "")
108
- # normalise subscript 2 (H₂S → H2S)
109
  s = s.replace("₂", "2")
110
- # collapse whitespace
111
  return " ".join(s.split())
112
 
113
-
114
  def _norm(s: str) -> str:
115
  return s.strip().lower()
116
 
117
-
118
  def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
119
- """
120
- Write value to parsed[field] if:
121
- - field not present, or
122
- - we are replacing Unknown with a concrete value
123
- """
124
  if not value:
125
  return
126
  if field not in parsed or parsed[field] == UNKNOWN:
127
  parsed[field] = value
128
 
129
-
130
  def _value_from_pnv_token(token: str) -> str | None:
131
- """
132
- Map a simple token to Positive / Negative / Variable.
133
- """
134
  seg = _norm(token)
135
  if seg in ["positive", "pos", "+"]:
136
  return "Positive"
@@ -140,28 +110,15 @@ def _value_from_pnv_token(token: str) -> str | None:
140
  return "Variable"
141
  return None
142
 
143
-
144
  def _value_from_pnv_context(segment: str) -> str | None:
145
- """
146
- Interpret a phrase as Positive / Negative / Variable.
147
-
148
- Handles:
149
- - "positive"
150
- - "is positive"
151
- - "+", "neg", etc.
152
- """
153
  seg = _norm(segment)
154
- # direct token first
155
  val = _value_from_pnv_token(seg)
156
  if val:
157
  return val
158
- # "... is positive"
159
  m = re.search(r"\bis\s+(positive|negative|variable|pos|neg|\+|\-)\b", seg)
160
  if m:
161
  return _value_from_pnv_token(m.group(1))
162
  return None
163
-
164
-
165
  # ------------------------------------------------------------
166
  # Gram stain and shape
167
  # ------------------------------------------------------------
@@ -176,21 +133,17 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
176
  _set_if_stronger(parsed, "Gram Stain", "Variable")
177
 
178
  # Shape
179
- # Prefer "short rods" over generic rods
180
  if "short rods" in text_lc:
181
  _set_if_stronger(parsed, "Shape", "Short Rods")
182
 
183
- # Cocci and variants (diplococci, tetracocci, etc.)
184
  if re.search(r"\bcocci\b", text_lc):
185
  _set_if_stronger(parsed, "Shape", "Cocci")
186
  if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
187
  _set_if_stronger(parsed, "Shape", "Cocci")
188
 
189
- # Rods / bacilli
190
  if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
191
  _set_if_stronger(parsed, "Shape", "Rods")
192
 
193
- # Spiral
194
  if "spiral" in text_lc or "spirochete" in text_lc:
195
  _set_if_stronger(parsed, "Shape", "Spiral")
196
 
@@ -200,11 +153,6 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
200
  # ------------------------------------------------------------
201
 
202
  def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
203
- """
204
- Handle haemolysis phrasing:
205
- - beta-haemolytic / beta hemolytic / beta-haemolysis / etc.
206
- - alpha- / gamma- / non-haemolytic
207
- """
208
  # Beta
209
  if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
210
  _set_if_stronger(parsed, "Haemolysis Type", "Beta")
@@ -219,6 +167,7 @@ def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
219
  if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
220
  _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
221
  _set_if_stronger(parsed, "Haemolysis", "Negative")
 
222
  if (
223
  "non-haemolytic" in text_lc
224
  or "non hemolytic" in text_lc
@@ -243,16 +192,19 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
243
  - "catalase positive"
244
  - "positive for catalase"
245
  - "catalase is positive"
246
- Also handles:
 
 
 
247
  - NaCl tolerance with % values
248
  - Nitrate reduction text
249
  - H2S production / non-production
250
- - DNase universal coverage
251
- - explicit gelatinase → Gelatin Hydrolysis mapping
252
  """
253
  for field, keywords in CORE_BOOL_FIELDS.items():
254
  for kw in keywords:
255
- # "... catalase positive"
256
  m1 = re.search(
257
  rf"{re.escape(kw)}[ \-]?"
258
  r"(positive|negative|variable|pos|neg|\+|\-)",
@@ -264,7 +216,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
264
  _set_if_stronger(parsed, field, val)
265
  break
266
 
267
- # "positive for catalase"
268
  m2 = re.search(
269
  rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
270
  rf"(for\s+)?{re.escape(kw)}",
@@ -276,7 +228,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
276
  _set_if_stronger(parsed, field, val)
277
  break
278
 
279
- # "<kw> is positive"
280
  m3 = re.search(
281
  rf"{re.escape(kw)}\s+is\s+"
282
  r"(positive|negative|variable|pos|neg|\+|\-)",
@@ -288,9 +240,44 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
288
  _set_if_stronger(parsed, field, val)
289
  break
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  # Special-case NaCl tolerance with explicit percentages
292
  if field == "NaCl Tolerant (>=6%)":
293
- # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
294
  for m in re.finditer(
295
  r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
296
  text_lc,
@@ -302,7 +289,6 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
302
  except Exception:
303
  pass
304
 
305
- # e.g. "NaCl tolerant up to 10%"
306
  for m in re.finditer(
307
  r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
308
  text_lc,
@@ -314,14 +300,12 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
314
  except Exception:
315
  pass
316
 
317
- # explicit negative phrasing: "does not grow in 7% NaCl"
318
  if re.search(
319
  r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
320
  text_lc,
321
  ):
322
  _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
323
 
324
- # general "in 6.5% NaCl" → assume tolerance if no explicit "no growth"
325
  for m in re.finditer(
326
  r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
327
  text_lc,
@@ -333,13 +317,13 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
333
  except Exception:
334
  pass
335
 
336
- # Nitrate: "reduces nitrate" / "does not reduce nitrate"
337
  if re.search(r"reduces nitrate", text_lc):
338
  _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
339
  if re.search(r"does (not|n't) reduce nitrate", text_lc):
340
  _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
341
 
342
- # H2S: "produces H2S", "H2S production", "H2S production is positive"
343
  if re.search(r"(produces|production of)\s+h2s", text_lc):
344
  _set_if_stronger(parsed, "H2S", "Positive")
345
  if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
@@ -353,29 +337,21 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
353
  ):
354
  _set_if_stronger(parsed, "H2S", "Negative")
355
 
356
- # --- DNase universal coverage ---
357
- # Positive forms
358
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
359
  _set_if_stronger(parsed, "DNase", "Positive")
360
-
361
  if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
362
  _set_if_stronger(parsed, "DNase", "Positive")
363
-
364
- # Negative forms
365
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
366
  _set_if_stronger(parsed, "DNase", "Negative")
367
-
368
  if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
369
  _set_if_stronger(parsed, "DNase", "Negative")
370
-
371
- # non-DNase-producing
372
  if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
373
  _set_if_stronger(parsed, "DNase", "Negative")
374
 
375
- # --- NEW: Gelatinase → Gelatin Hydrolysis ---
376
  if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
377
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
378
-
379
  if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
380
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
381
 
@@ -402,7 +378,6 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
402
  ):
403
  _set_if_stronger(parsed, "Motility", "Negative")
404
 
405
- # Specific motility phrases: tumbling, swarming, corkscrew
406
  if (
407
  "tumbling motility" in text_lc
408
  or "swarming motility" in text_lc
@@ -411,7 +386,7 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
411
  ):
412
  _set_if_stronger(parsed, "Motility", "Positive")
413
 
414
- # Capsule (including "capsule positive/negative")
415
  if (
416
  "capsulated" in text_lc
417
  or "encapsulated" in text_lc
@@ -428,15 +403,13 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
428
  _set_if_stronger(parsed, "Capsule", "Negative")
429
 
430
  # Spore formation
431
- # NEGATIVE FIRST with strict boundaries, then early-return
432
  if (
433
  re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
434
  or "no spores" in text_lc
435
  ):
436
  _set_if_stronger(parsed, "Spore Formation", "Negative")
437
- return # prevent any positive overwrite
438
 
439
- # POSITIVE (must not match the negative form)
440
  if (
441
  re.search(r"\bspore[-\s]?forming\b", text_lc)
442
  or "forms spores" in text_lc
@@ -449,17 +422,9 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
449
  # ------------------------------------------------------------
450
 
451
  def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
452
- """
453
- Robust oxygen parsing:
454
- - Handle facultative first
455
- - Avoid "aerobic" accidentally matching inside "anaerobic"
456
- - Include "aerobically" / "anaerobically"
457
- """
458
- # Facultative first
459
  if re.search(r"facultative(ly)? anaerob", text_lc):
460
  _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
461
 
462
- # Strict anaerobic (before aerobic)
463
  if (
464
  re.search(r"\bobligate anaerob", text_lc)
465
  or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
@@ -467,14 +432,10 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
467
  ):
468
  _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
469
 
470
- # Now handle purely aerobic, avoiding "anaerobic"
471
  if (
472
  re.search(r"\bobligate aerobe\b", text_lc)
473
  or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
474
- or (
475
- re.search(r"\baerobically\b", text_lc)
476
- and "anaerobically" not in text_lc
477
- )
478
  ):
479
  _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
480
 
@@ -490,14 +451,6 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
490
  # ------------------------------------------------------------
491
 
492
  def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
493
- """
494
- Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
495
- We ALWAYS store as "low//high":
496
- - true ranges: "4-45 °C" → "4//45"
497
- - two temps in text: min//max (Option A)
498
- - single temps: "37 °C" → "37//37"
499
- """
500
- # 1) Explicit ranges like "4-45 °C" or "10–40 °C"
501
  range_pattern = re.compile(
502
  r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
503
  )
@@ -508,7 +461,6 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
508
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
509
  return
510
 
511
- # 2) Option A: any two explicit temps → min//max
512
  temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
513
  if len(temps) >= 2:
514
  nums = [int(t) for t in temps]
@@ -517,7 +469,6 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
517
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
518
  return
519
 
520
- # 3) Single temps like "grows at 37 c"
521
  single_pattern = re.compile(
522
  r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*"
523
  r"(?:c|°c|degrees c|degrees celsius)"
@@ -528,14 +479,12 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
528
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
529
  return
530
 
531
- # 4) Simplified: "grows at 37" (no explicit °C)
532
  m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
533
  if m_simple_num:
534
  temp = m_simple_num.group(1)
535
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
536
  return
537
 
538
- # 5) Fallback: plain "37c" somewhere in the text
539
  m_plain = re.search(
540
  r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
541
  text_lc,
@@ -546,48 +495,21 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
546
 
547
 
548
  # ------------------------------------------------------------
549
- # Media grown on (coarse mapping)
550
  # ------------------------------------------------------------
551
 
552
  MEDIA_KEYWORDS = {
553
- "Blood Agar": [
554
- "blood agar",
555
- "blood-agar",
556
- ],
557
- "MacConkey Agar": [
558
- "macconkey agar",
559
- "mac conkey agar",
560
- "macconkey",
561
- ],
562
- "Chocolate Agar": [
563
- "chocolate agar",
564
- "chocolate-agar",
565
- ],
566
- "Nutrient Agar": [
567
- "nutrient agar",
568
- "nutrient-agar",
569
- ],
570
- "XLD Agar": [
571
- "xld agar",
572
- ],
573
- "TCBS Agar": [
574
- "tcbs agar",
575
- "tcbs",
576
- ],
577
- "ALOA": [
578
- "aloa agar",
579
- "aloa",
580
- ],
581
- "BCYE Agar": [
582
- "bcye agar",
583
- "bcye",
584
- ],
585
- "MRS Agar": [
586
- "mrs agar",
587
- ],
588
  }
589
 
590
-
591
  def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
592
  found_media: List[str] = []
593
  for media_name, patterns in MEDIA_KEYWORDS.items():
@@ -600,34 +522,21 @@ def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
600
 
601
 
602
  # ------------------------------------------------------------
603
- # Sugar fermentation parsing
604
  # ------------------------------------------------------------
605
 
606
  def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
607
- """
608
- Handles patterns like:
609
- - "glucose positive, mannitol negative"
610
- - "ferments glucose, mannitol and sucrose but not lactose"
611
- - "does not ferment lactose or sucrose"
612
- - "non-lactose fermenter"
613
- - "<sugar> fermenter" (positive unless "non-<sugar> fermenter")
614
- - "<sugar> is positive/negative"
615
- - "<sugar> fermentation is positive/negative"
616
- - global non-fermenter phrases
617
- """
618
-
619
- # 0) Simple "glucose positive / negative" style + "<sugar> is positive"
620
  for sugar_key, field in SUGAR_FIELDS.items():
621
- # "glucose positive"
622
  m_simple = re.search(
623
- rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc
 
624
  )
625
  if m_simple:
626
  val = _value_from_pnv_context(m_simple.group(1))
627
  if val:
628
  _set_if_stronger(parsed, field, val)
629
 
630
- # "<sugar> is positive"
631
  m_is = re.search(
632
  rf"{sugar_key}\s+is\s+(positive|negative|variable|pos|neg|\+|\-)",
633
  text_lc,
@@ -637,15 +546,12 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
637
  if val:
638
  _set_if_stronger(parsed, field, val)
639
 
640
- # 0b) "<sugar> fermenter" → Positive; "non-<sugar> fermenter" → Negative
641
  for sugar_key, field in SUGAR_FIELDS.items():
642
- # positive: "lactose fermenter"
643
  if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
644
  rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
645
  ):
646
  _set_if_stronger(parsed, field, "Positive")
647
-
648
- # negative: "non-lactose fermenter"
649
  if re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc):
650
  _set_if_stronger(parsed, field, "Negative")
651
 
@@ -653,24 +559,19 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
653
  ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
654
  for m in ferments_pattern.finditer(text_lc):
655
  seg = m.group(1)
656
- # Split positive vs negative part on "but not"
657
  neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
658
  pos_part = neg_split[0]
659
  neg_part = neg_split[1] if len(neg_split) > 1 else ""
660
 
661
- # Positive sugars from pos_part
662
  for sugar_key, field in SUGAR_FIELDS.items():
663
  if re.search(rf"\b{sugar_key}\b", pos_part):
664
  _set_if_stronger(parsed, field, "Positive")
665
 
666
- # Negative sugars from neg_part
667
  for sugar_key, field in SUGAR_FIELDS.items():
668
  if re.search(rf"\b{sugar_key}\b", neg_part):
669
  _set_if_stronger(parsed, field, "Negative")
670
 
671
- # 2) Grouped "does not ferment X, Y and Z" — but **stop at "but" / punctuation**
672
- # This prevents glucose being accidentally marked negative in:
673
- # "does not ferment lactose or sucrose, but glucose fermentation is positive"
674
  grouped_neg_pattern = re.compile(
675
  r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
676
  )
@@ -680,23 +581,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
680
  if re.search(rf"\b{sugar_key}\b", seg):
681
  _set_if_stronger(parsed, field, "Negative")
682
 
683
- # 3) "does not ferment X" (single sugar)
684
  for sugar_key, field in SUGAR_FIELDS.items():
685
  if re.search(
686
  rf"does\s+(?:not|n't)\s+ferment\s+{sugar_key}\b", text_lc
687
  ):
688
  _set_if_stronger(parsed, field, "Negative")
689
 
690
- # 4) "non-lactose fermenter" covered above (+ keep "non-lactose fermenting")
691
  for sugar_key, field in SUGAR_FIELDS.items():
692
  if re.search(
693
  rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc
694
  ):
695
  _set_if_stronger(parsed, field, "Negative")
696
 
697
- # 5) "X fermentation positive/negative" + "is positive"
698
  for sugar_key, field in SUGAR_FIELDS.items():
699
- # "glucose fermentation positive"
700
  m1 = re.search(
701
  rf"{sugar_key}\s+fermentation[ \-]?"
702
  r"(positive|negative|variable|pos|neg|\+|\-)",
@@ -708,7 +608,6 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
708
  _set_if_stronger(parsed, field, val)
709
  continue
710
 
711
- # "positive for glucose fermentation"
712
  m2 = re.search(
713
  rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
714
  rf"(for\s+)?{sugar_key}\s+fermentation",
@@ -720,7 +619,6 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
720
  _set_if_stronger(parsed, field, val)
721
  continue
722
 
723
- # NEW: "<sugar> fermentation is positive/negative"
724
  m3 = re.search(
725
  rf"{sugar_key}\s+fermentation\s+is\s+"
726
  r"(positive|negative|variable|pos|neg|\+|\-)",
@@ -732,9 +630,7 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
732
  _set_if_stronger(parsed, field, val)
733
  continue
734
 
735
- # 6) Global non-fermenter patterns
736
- # e.g. "non-fermenter", "does not ferment sugars"
737
- # → set all sugars Negative *unless* already set by a more specific rule.
738
  if (
739
  re.search(
740
  r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
@@ -744,20 +640,11 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
744
  for field in SUGAR_FIELDS.values():
745
  if field not in parsed or parsed[field] == UNKNOWN:
746
  _set_if_stronger(parsed, field, "Negative")
747
-
748
-
749
  # ------------------------------------------------------------
750
- # Colony morphology (coarse, optional)
751
  # ------------------------------------------------------------
752
 
753
  def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
754
- """
755
- Very coarse mapping for colony morphology. We try:
756
- - "colonies are yellow, mucoid"
757
- - "colonies dry, white and irregular on nutrient agar"
758
- - "forming green colonies", "forms mucoid colonies"
759
- """
760
- # Pattern 1: "colonies are ..."
761
  m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
762
  if m:
763
  desc = m.group(3).strip()
@@ -769,7 +656,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
769
  _set_if_stronger(parsed, "Colony Morphology", pretty)
770
  return
771
 
772
- # Pattern 2: "colonies dry, white and irregular on nutrient agar"
773
  m2 = re.search(
774
  r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
775
  text_lc,
@@ -784,7 +670,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
784
  _set_if_stronger(parsed, "Colony Morphology", pretty)
785
  return
786
 
787
- # Pattern 3: "forming green colonies", "forms mucoid colonies"
788
  m3 = re.search(
789
  r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
790
  text_lc,
@@ -831,10 +716,9 @@ def parse_text_rules(text: str) -> Dict[str, Any]:
831
  }
832
 
833
  except Exception as e:
834
- # Fail-safe: never crash the app, just report an error
835
  return {
836
  "parsed_fields": parsed,
837
  "source": "rule_parser",
838
  "raw": original,
839
  "error": f"{type(e).__name__}: {e}",
840
- }
 
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
+ # Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11K:
6
  #
7
  # - Always store Growth Temperature as "low//high"
8
  # • single: 37 → "37//37"
9
+ # • two temps in text: min//max
10
+ # - DNase robust parsing (DNase test / activity / production)
11
+ # - Non–spore-forming → Spore Formation = Negative
12
  # - "non-H2S producing" → H2S = Negative
13
+ # - Aerobic / Anaerobic including aerobically/anaerobically
14
  # - NaCl tolerance phrases improved
15
+ # - Colony morphology extraction
16
  #
17
+ # New additions:
18
+ # • "Gelatinase positive/negative" → Gelatin Hydrolysis
19
+ # • "<sugar> fermenter" → <Sugar> Fermentation
20
+ # • "<sugar> is positive/negative"
21
+ # • "<sugar> fermentation is positive/negative"
22
+ # • Grouped negative sugars, avoiding false glucose-negative
23
+ # • Global non-fermenter + explicit positive sugar handled
24
+ # • Core tests accept "is positive"
25
+ # H2S production "is positive/negative"
26
+ # "ONPG is negative" captured
27
+ # • NEW (11K): "<kw> reaction is positive/negative"
28
+ # "<kw> reaction positive/negative"
29
+ # "<kw> test reaction is positive"
30
  # ------------------------------------------------------------
31
 
32
  from __future__ import annotations
 
34
  import re
35
  from typing import Dict, Any, List
36
 
 
37
  UNKNOWN = "Unknown"
38
 
39
  # ------------------------------------------------------------
40
  # Core fields and sugar mapping
41
  # ------------------------------------------------------------
42
 
 
43
  SUGAR_FIELDS: Dict[str, str] = {
44
  "glucose": "Glucose Fermentation",
45
  "lactose": "Lactose Fermentation",
 
56
  }
57
 
58
  CORE_BOOL_FIELDS: Dict[str, List[str]] = {
 
59
  "Catalase": ["catalase"],
60
  "Oxidase": ["oxidase"],
61
  "Indole": ["indole"],
62
  "Urease": ["urease"],
63
  "Citrate": ["citrate"],
 
64
  "Methyl Red": ["methyl red", "mr test", "mr"],
65
  "VP": ["voges-proskauer", "vp test", "vp"],
 
66
  "H2S": ["h2s", "hydrogen sulfide"],
 
67
  "DNase": [
68
+ "dnase", "dnase test", "dnase activity",
69
+ "dnase production", "dnaase", "dna hydrolysis"
 
 
 
 
70
  ],
71
  "ONPG": ["onpg"],
72
  "Coagulase": ["coagulase"],
 
77
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
78
  "Arginine dihydrolase": ["arginine dihydrolase"],
79
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
 
80
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
81
  }
82
 
 
85
  # ------------------------------------------------------------
86
 
87
  def _clean_text(text: str) -> str:
 
 
 
 
 
 
88
  if not text:
89
  return ""
90
  s = text.replace("°", "").replace("º", "")
 
91
  s = s.replace("₂", "2")
 
92
  return " ".join(s.split())
93
 
 
94
  def _norm(s: str) -> str:
95
  return s.strip().lower()
96
 
 
97
  def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
 
 
 
 
 
98
  if not value:
99
  return
100
  if field not in parsed or parsed[field] == UNKNOWN:
101
  parsed[field] = value
102
 
 
103
  def _value_from_pnv_token(token: str) -> str | None:
 
 
 
104
  seg = _norm(token)
105
  if seg in ["positive", "pos", "+"]:
106
  return "Positive"
 
110
  return "Variable"
111
  return None
112
 
 
113
  def _value_from_pnv_context(segment: str) -> str | None:
 
 
 
 
 
 
 
 
114
  seg = _norm(segment)
 
115
  val = _value_from_pnv_token(seg)
116
  if val:
117
  return val
 
118
  m = re.search(r"\bis\s+(positive|negative|variable|pos|neg|\+|\-)\b", seg)
119
  if m:
120
  return _value_from_pnv_token(m.group(1))
121
  return None
 
 
122
  # ------------------------------------------------------------
123
  # Gram stain and shape
124
  # ------------------------------------------------------------
 
133
  _set_if_stronger(parsed, "Gram Stain", "Variable")
134
 
135
  # Shape
 
136
  if "short rods" in text_lc:
137
  _set_if_stronger(parsed, "Shape", "Short Rods")
138
 
 
139
  if re.search(r"\bcocci\b", text_lc):
140
  _set_if_stronger(parsed, "Shape", "Cocci")
141
  if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
142
  _set_if_stronger(parsed, "Shape", "Cocci")
143
 
 
144
  if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
145
  _set_if_stronger(parsed, "Shape", "Rods")
146
 
 
147
  if "spiral" in text_lc or "spirochete" in text_lc:
148
  _set_if_stronger(parsed, "Shape", "Spiral")
149
 
 
153
  # ------------------------------------------------------------
154
 
155
  def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
156
  # Beta
157
  if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
158
  _set_if_stronger(parsed, "Haemolysis Type", "Beta")
 
167
  if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
168
  _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
169
  _set_if_stronger(parsed, "Haemolysis", "Negative")
170
+
171
  if (
172
  "non-haemolytic" in text_lc
173
  or "non hemolytic" in text_lc
 
192
  - "catalase positive"
193
  - "positive for catalase"
194
  - "catalase is positive"
195
+ - "indole reaction is negative"
196
+ - "indole reaction negative"
197
+ - "indole test reaction is positive"
198
+ Plus:
199
  - NaCl tolerance with % values
200
  - Nitrate reduction text
201
  - H2S production / non-production
202
+ - DNase coverage
203
+ - gelatinase → Gelatin Hydrolysis
204
  """
205
  for field, keywords in CORE_BOOL_FIELDS.items():
206
  for kw in keywords:
207
+ # 1) "... catalase positive"
208
  m1 = re.search(
209
  rf"{re.escape(kw)}[ \-]?"
210
  r"(positive|negative|variable|pos|neg|\+|\-)",
 
216
  _set_if_stronger(parsed, field, val)
217
  break
218
 
219
+ # 2) "positive for catalase"
220
  m2 = re.search(
221
  rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
222
  rf"(for\s+)?{re.escape(kw)}",
 
228
  _set_if_stronger(parsed, field, val)
229
  break
230
 
231
+ # 3) "<kw> is positive"
232
  m3 = re.search(
233
  rf"{re.escape(kw)}\s+is\s+"
234
  r"(positive|negative|variable|pos|neg|\+|\-)",
 
240
  _set_if_stronger(parsed, field, val)
241
  break
242
 
243
+ # 4) NEW: "<kw> reaction is positive/negative"
244
+ m4 = re.search(
245
+ rf"{re.escape(kw)}\s+reaction\s+is\s+"
246
+ r"(positive|negative|variable|pos|neg|\+|\-)",
247
+ text_lc,
248
+ )
249
+ if m4:
250
+ val = _value_from_pnv_token(m4.group(1))
251
+ if val:
252
+ _set_if_stronger(parsed, field, val)
253
+ break
254
+
255
+ # 5) NEW: "<kw> reaction positive/negative"
256
+ m5 = re.search(
257
+ rf"{re.escape(kw)}\s+reaction\s+"
258
+ r"(positive|negative|variable|pos|neg|\+|\-)",
259
+ text_lc,
260
+ )
261
+ if m5:
262
+ val = _value_from_pnv_token(m5.group(1))
263
+ if val:
264
+ _set_if_stronger(parsed, field, val)
265
+ break
266
+
267
+ # 6) NEW: "<kw> test reaction is positive"
268
+ m6 = re.search(
269
+ rf"{re.escape(kw)}\s+test\s+reaction\s+is\s+"
270
+ r"(positive|negative|variable|pos|neg|\+|\-)",
271
+ text_lc,
272
+ )
273
+ if m6:
274
+ val = _value_from_pnv_token(m6.group(1))
275
+ if val:
276
+ _set_if_stronger(parsed, field, val)
277
+ break
278
+
279
  # Special-case NaCl tolerance with explicit percentages
280
  if field == "NaCl Tolerant (>=6%)":
 
281
  for m in re.finditer(
282
  r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
283
  text_lc,
 
289
  except Exception:
290
  pass
291
 
 
292
  for m in re.finditer(
293
  r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
294
  text_lc,
 
300
  except Exception:
301
  pass
302
 
 
303
  if re.search(
304
  r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
305
  text_lc,
306
  ):
307
  _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
308
 
 
309
  for m in re.finditer(
310
  r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
311
  text_lc,
 
317
  except Exception:
318
  pass
319
 
320
+ # Nitrate
321
  if re.search(r"reduces nitrate", text_lc):
322
  _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
323
  if re.search(r"does (not|n't) reduce nitrate", text_lc):
324
  _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
325
 
326
+ # H2S
327
  if re.search(r"(produces|production of)\s+h2s", text_lc):
328
  _set_if_stronger(parsed, "H2S", "Positive")
329
  if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
 
337
  ):
338
  _set_if_stronger(parsed, "H2S", "Negative")
339
 
340
+ # DNase
 
341
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
342
  _set_if_stronger(parsed, "DNase", "Positive")
 
343
  if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
344
  _set_if_stronger(parsed, "DNase", "Positive")
 
 
345
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
346
  _set_if_stronger(parsed, "DNase", "Negative")
 
347
  if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
348
  _set_if_stronger(parsed, "DNase", "Negative")
 
 
349
  if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
350
  _set_if_stronger(parsed, "DNase", "Negative")
351
 
352
+ # Gelatinase → Gelatin Hydrolysis
353
  if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
354
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
 
355
  if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
356
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
357
 
 
378
  ):
379
  _set_if_stronger(parsed, "Motility", "Negative")
380
 
 
381
  if (
382
  "tumbling motility" in text_lc
383
  or "swarming motility" in text_lc
 
386
  ):
387
  _set_if_stronger(parsed, "Motility", "Positive")
388
 
389
+ # Capsule
390
  if (
391
  "capsulated" in text_lc
392
  or "encapsulated" in text_lc
 
403
  _set_if_stronger(parsed, "Capsule", "Negative")
404
 
405
  # Spore formation
 
406
  if (
407
  re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
408
  or "no spores" in text_lc
409
  ):
410
  _set_if_stronger(parsed, "Spore Formation", "Negative")
411
+ return
412
 
 
413
  if (
414
  re.search(r"\bspore[-\s]?forming\b", text_lc)
415
  or "forms spores" in text_lc
 
422
  # ------------------------------------------------------------
423
 
424
  def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
425
  if re.search(r"facultative(ly)? anaerob", text_lc):
426
  _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
427
 
 
428
  if (
429
  re.search(r"\bobligate anaerob", text_lc)
430
  or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
 
432
  ):
433
  _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
434
 
 
435
  if (
436
  re.search(r"\bobligate aerobe\b", text_lc)
437
  or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
438
+ or (re.search(r"\baerobically\b", text_lc) and "anaerobically" not in text_lc)
 
 
 
439
  ):
440
  _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
441
 
 
451
  # ------------------------------------------------------------
452
 
453
  def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
 
454
  range_pattern = re.compile(
455
  r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
456
  )
 
461
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
462
  return
463
 
 
464
  temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
465
  if len(temps) >= 2:
466
  nums = [int(t) for t in temps]
 
469
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
470
  return
471
 
 
472
  single_pattern = re.compile(
473
  r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*"
474
  r"(?:c|°c|degrees c|degrees celsius)"
 
479
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
480
  return
481
 
 
482
  m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
483
  if m_simple_num:
484
  temp = m_simple_num.group(1)
485
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
486
  return
487
 
 
488
  m_plain = re.search(
489
  r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
490
  text_lc,
 
495
 
496
 
497
  # ------------------------------------------------------------
498
+ # Media grown on
499
  # ------------------------------------------------------------
500
 
501
  MEDIA_KEYWORDS = {
502
+ "Blood Agar": ["blood agar", "blood-agar"],
503
+ "MacConkey Agar": ["macconkey agar", "mac conkey agar", "macconkey"],
504
+ "Chocolate Agar": ["chocolate agar", "chocolate-agar"],
505
+ "Nutrient Agar": ["nutrient agar", "nutrient-agar"],
506
+ "XLD Agar": ["xld agar"],
507
+ "TCBS Agar": ["tcbs agar", "tcbs"],
508
+ "ALOA": ["aloa agar", "aloa"],
509
+ "BCYE Agar": ["bcye agar", "bcye"],
510
+ "MRS Agar": ["mrs agar"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  }
512
 
 
513
  def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
514
  found_media: List[str] = []
515
  for media_name, patterns in MEDIA_KEYWORDS.items():
 
522
 
523
 
524
  # ------------------------------------------------------------
525
+ # Sugar fermentation
526
  # ------------------------------------------------------------
527
 
528
  def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
529
+ # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
 
 
 
 
 
 
 
 
 
 
 
 
530
  for sugar_key, field in SUGAR_FIELDS.items():
 
531
  m_simple = re.search(
532
+ rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
533
+ text_lc,
534
  )
535
  if m_simple:
536
  val = _value_from_pnv_context(m_simple.group(1))
537
  if val:
538
  _set_if_stronger(parsed, field, val)
539
 
 
540
  m_is = re.search(
541
  rf"{sugar_key}\s+is\s+(positive|negative|variable|pos|neg|\+|\-)",
542
  text_lc,
 
546
  if val:
547
  _set_if_stronger(parsed, field, val)
548
 
549
+ # 0b) "<sugar> fermenter" vs "non-<sugar> fermenter"
550
  for sugar_key, field in SUGAR_FIELDS.items():
 
551
  if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
552
  rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
553
  ):
554
  _set_if_stronger(parsed, field, "Positive")
 
 
555
  if re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc):
556
  _set_if_stronger(parsed, field, "Negative")
557
 
 
559
  ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
560
  for m in ferments_pattern.finditer(text_lc):
561
  seg = m.group(1)
 
562
  neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
563
  pos_part = neg_split[0]
564
  neg_part = neg_split[1] if len(neg_split) > 1 else ""
565
 
 
566
  for sugar_key, field in SUGAR_FIELDS.items():
567
  if re.search(rf"\b{sugar_key}\b", pos_part):
568
  _set_if_stronger(parsed, field, "Positive")
569
 
 
570
  for sugar_key, field in SUGAR_FIELDS.items():
571
  if re.search(rf"\b{sugar_key}\b", neg_part):
572
  _set_if_stronger(parsed, field, "Negative")
573
 
574
+ # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
 
 
575
  grouped_neg_pattern = re.compile(
576
  r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
577
  )
 
581
  if re.search(rf"\b{sugar_key}\b", seg):
582
  _set_if_stronger(parsed, field, "Negative")
583
 
584
+ # 3) Single "does not ferment X"
585
  for sugar_key, field in SUGAR_FIELDS.items():
586
  if re.search(
587
  rf"does\s+(?:not|n't)\s+ferment\s+{sugar_key}\b", text_lc
588
  ):
589
  _set_if_stronger(parsed, field, "Negative")
590
 
591
+ # 4) "non-lactose fermenter"
592
  for sugar_key, field in SUGAR_FIELDS.items():
593
  if re.search(
594
  rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc
595
  ):
596
  _set_if_stronger(parsed, field, "Negative")
597
 
598
+ # 5) "<sugar> fermentation positive/negative" + "is positive"
599
  for sugar_key, field in SUGAR_FIELDS.items():
 
600
  m1 = re.search(
601
  rf"{sugar_key}\s+fermentation[ \-]?"
602
  r"(positive|negative|variable|pos|neg|\+|\-)",
 
608
  _set_if_stronger(parsed, field, val)
609
  continue
610
 
 
611
  m2 = re.search(
612
  rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
613
  rf"(for\s+)?{sugar_key}\s+fermentation",
 
619
  _set_if_stronger(parsed, field, val)
620
  continue
621
 
 
622
  m3 = re.search(
623
  rf"{sugar_key}\s+fermentation\s+is\s+"
624
  r"(positive|negative|variable|pos|neg|\+|\-)",
 
630
  _set_if_stronger(parsed, field, val)
631
  continue
632
 
633
+ # 6) Global non-fermenter phrases
 
 
634
  if (
635
  re.search(
636
  r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
 
640
  for field in SUGAR_FIELDS.values():
641
  if field not in parsed or parsed[field] == UNKNOWN:
642
  _set_if_stronger(parsed, field, "Negative")
 
 
643
  # ------------------------------------------------------------
644
+ # Colony morphology
645
  # ------------------------------------------------------------
646
 
647
  def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
648
  m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
649
  if m:
650
  desc = m.group(3).strip()
 
656
  _set_if_stronger(parsed, "Colony Morphology", pretty)
657
  return
658
 
 
659
  m2 = re.search(
660
  r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
661
  text_lc,
 
670
  _set_if_stronger(parsed, "Colony Morphology", pretty)
671
  return
672
 
 
673
  m3 = re.search(
674
  r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
675
  text_lc,
 
716
  }
717
 
718
  except Exception as e:
 
719
  return {
720
  "parsed_fields": parsed,
721
  "source": "rule_parser",
722
  "raw": original,
723
  "error": f"{type(e).__name__}: {e}",
724
+ }