EphAsad committed on
Commit
82eb007
·
verified ·
1 Parent(s): 596952c

Update engine/parser_rules.py

Browse files
Files changed (1) hide show
  1. engine/parser_rules.py +116 -210
engine/parser_rules.py CHANGED
@@ -2,7 +2,8 @@
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
- # Stage 11F (Option A ranges + fixes) + Stage 11H additions:
 
6
  # - Always store Growth Temperature as "low//high"
7
  # • single: 37 → "37//37"
8
  # • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
@@ -12,9 +13,19 @@
12
  # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
13
  # - NaCl tolerance phrases improved
14
  # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
15
- # - NEW (11H):
 
16
  # • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
17
  # • "<sugar> fermenter" → <Sugar> Fermentation = Positive
 
 
 
 
 
 
 
 
 
18
  # ------------------------------------------------------------
19
 
20
  from __future__ import annotations
@@ -29,7 +40,6 @@ UNKNOWN = "Unknown"
29
  # Core fields and sugar mapping
30
  # ------------------------------------------------------------
31
 
32
- # Sugar name → core DB column
33
  SUGAR_FIELDS: Dict[str, str] = {
34
  "glucose": "Glucose Fermentation",
35
  "lactose": "Lactose Fermentation",
@@ -46,25 +56,17 @@ SUGAR_FIELDS: Dict[str, str] = {
46
  }
47
 
48
  CORE_BOOL_FIELDS: Dict[str, List[str]] = {
49
- # field: [keywords to recognise the test name]
50
  "Catalase": ["catalase"],
51
  "Oxidase": ["oxidase"],
52
  "Indole": ["indole"],
53
  "Urease": ["urease"],
54
  "Citrate": ["citrate"],
55
- # MR: include "mr"
56
  "Methyl Red": ["methyl red", "mr test", "mr"],
57
  "VP": ["voges-proskauer", "vp test", "vp"],
58
- # H2S (includes H₂S → normalised to H2S in _clean_text)
59
  "H2S": ["h2s", "hydrogen sulfide"],
60
- # DNase: broaden patterns
61
  "DNase": [
62
- "dnase",
63
- "dnase test",
64
- "dnase activity",
65
- "dnase production",
66
- "dnaase",
67
- "dna hydrolysis",
68
  ],
69
  "ONPG": ["onpg"],
70
  "Coagulase": ["coagulase"],
@@ -75,7 +77,6 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
75
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
76
  "Arginine dihydrolase": ["arginine dihydrolase"],
77
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
78
- # Esculin Hydrolysis: also match plain "esculin"
79
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
80
  }
81
 
@@ -84,42 +85,23 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
84
  # ------------------------------------------------------------
85
 
86
  def _clean_text(text: str) -> str:
87
- """
88
- Normalise a few unicode oddities and collapse whitespace.
89
- Also:
90
- - strip degree symbols
91
- - normalise subscript ₂ → 2 for H₂S
92
- """
93
  if not text:
94
  return ""
95
  s = text.replace("°", "").replace("º", "")
96
- # normalise subscript 2 (H₂S → H2S)
97
  s = s.replace("₂", "2")
98
- # keep dashes as-is; regexes handle - and – explicitly
99
  return " ".join(s.split())
100
 
101
-
102
  def _norm(s: str) -> str:
103
  return s.strip().lower()
104
 
105
-
106
  def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
107
- """
108
- Write value to parsed[field] if:
109
- - field not present, or
110
- - we are replacing Unknown with a concrete value
111
- """
112
  if not value:
113
  return
114
  if field not in parsed or parsed[field] == UNKNOWN:
115
  parsed[field] = value
116
 
117
-
118
- def _value_from_pnv_context(segment: str) -> str | None:
119
- """
120
- Interpret a short phrase as Positive / Negative / Variable.
121
- """
122
- seg = _norm(segment)
123
  if seg in ["positive", "pos", "+"]:
124
  return "Positive"
125
  if seg in ["negative", "neg", "-"]:
@@ -128,13 +110,21 @@ def _value_from_pnv_context(segment: str) -> str | None:
128
  return "Variable"
129
  return None
130
 
 
 
 
 
 
 
 
 
 
131
 
132
  # ------------------------------------------------------------
133
  # Gram stain and shape
134
  # ------------------------------------------------------------
135
 
136
  def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
137
- # Gram stain
138
  if "gram-positive" in text_lc or "gram positive" in text_lc:
139
  _set_if_stronger(parsed, "Gram Stain", "Positive")
140
  elif "gram-negative" in text_lc or "gram negative" in text_lc:
@@ -142,50 +132,37 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
142
  elif "gram variable" in text_lc:
143
  _set_if_stronger(parsed, "Gram Stain", "Variable")
144
 
145
- # Shape
146
- # Prefer "short rods" over generic rods
147
  if "short rods" in text_lc:
148
  _set_if_stronger(parsed, "Shape", "Short Rods")
149
 
150
- # Cocci and variants (diplococci, tetracocci, etc.)
151
  if re.search(r"\bcocci\b", text_lc):
152
  _set_if_stronger(parsed, "Shape", "Cocci")
153
  if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
154
  _set_if_stronger(parsed, "Shape", "Cocci")
155
 
156
- # Rods / bacilli
157
  if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
158
  _set_if_stronger(parsed, "Shape", "Rods")
159
 
160
- # Spiral
161
  if "spiral" in text_lc or "spirochete" in text_lc:
162
  _set_if_stronger(parsed, "Shape", "Spiral")
163
 
164
-
165
  # ------------------------------------------------------------
166
  # Haemolysis
167
  # ------------------------------------------------------------
168
 
169
  def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
170
- """
171
- Handle haemolysis phrasing:
172
- - beta-haemolytic / beta hemolytic / beta-haemolysis / etc.
173
- - alpha- / gamma- / non-haemolytic
174
- """
175
- # Beta
176
  if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
177
  _set_if_stronger(parsed, "Haemolysis Type", "Beta")
178
  _set_if_stronger(parsed, "Haemolysis", "Positive")
179
 
180
- # Alpha
181
  if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
182
  _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
183
  _set_if_stronger(parsed, "Haemolysis", "Positive")
184
 
185
- # Gamma / non-haemolytic
186
  if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
187
  _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
188
  _set_if_stronger(parsed, "Haemolysis", "Negative")
 
189
  if (
190
  "non-haemolytic" in text_lc
191
  or "non hemolytic" in text_lc
@@ -194,32 +171,20 @@ def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
194
  _set_if_stronger(parsed, "Haemolysis Type", "None")
195
  _set_if_stronger(parsed, "Haemolysis", "Negative")
196
 
197
- # Variable phrasing
198
  if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
199
  _set_if_stronger(parsed, "Haemolysis Type", "Variable")
200
  _set_if_stronger(parsed, "Haemolysis", "Variable")
201
 
202
-
203
  # ------------------------------------------------------------
204
- # Boolean test parser (core enzyme tests etc.)
205
  # ------------------------------------------------------------
206
 
207
  def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
208
- """
209
- For each test in CORE_BOOL_FIELDS, look for patterns like:
210
- "catalase positive", "positive for catalase", etc.
211
- Also handles:
212
- - NaCl tolerance with % values
213
- - Nitrate reduction text
214
- - H2S production / non-production
215
- - DNase universal coverage
216
- - NEW (11H): explicit gelatinase → Gelatin Hydrolysis mapping
217
- """
218
  for field, keywords in CORE_BOOL_FIELDS.items():
219
  for kw in keywords:
220
- # "... catalase positive"
221
  m1 = re.search(
222
- rf"{re.escape(kw)}[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
 
223
  text_lc,
224
  )
225
  if m1:
@@ -228,9 +193,9 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
228
  _set_if_stronger(parsed, field, val)
229
  break
230
 
231
- # "positive for catalase"
232
  m2 = re.search(
233
- rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{re.escape(kw)}",
 
234
  text_lc,
235
  )
236
  if m2:
@@ -239,9 +204,18 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
239
  _set_if_stronger(parsed, field, val)
240
  break
241
 
242
- # Special-case NaCl tolerance with explicit percentages
 
 
 
 
 
 
 
 
 
 
243
  if field == "NaCl Tolerant (>=6%)":
244
- # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
245
  for m in re.finditer(
246
  r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
247
  text_lc,
@@ -253,7 +227,6 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
253
  except Exception:
254
  pass
255
 
256
- # e.g. "NaCl tolerant up to 10%"
257
  for m in re.finditer(
258
  r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
259
  text_lc,
@@ -265,14 +238,12 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
265
  except Exception:
266
  pass
267
 
268
- # explicit negative phrasing: "does not grow in 7% NaCl"
269
  if re.search(
270
  r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
271
  text_lc,
272
  ):
273
  _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
274
 
275
- # general "in 6.5% NaCl" → assume tolerance if no explicit "no growth"
276
  for m in re.finditer(
277
  r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
278
  text_lc,
@@ -284,16 +255,19 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
284
  except Exception:
285
  pass
286
 
287
- # Nitrate: "reduces nitrate" / "does not reduce nitrate"
288
  if re.search(r"reduces nitrate", text_lc):
289
  _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
290
  if re.search(r"does (not|n't) reduce nitrate", text_lc):
291
  _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
292
 
293
- # H2S: "produces H2S", "H2S production", "does not produce H2S",
294
- # "non-H2S producing"
295
  if re.search(r"(produces|production of)\s+h2s", text_lc):
296
  _set_if_stronger(parsed, "H2S", "Positive")
 
 
 
 
297
  if (
298
  re.search(r"does (not|n't) produce\s+h2s", text_lc)
299
  or re.search(r"no h2s production", text_lc)
@@ -301,30 +275,21 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
301
  ):
302
  _set_if_stronger(parsed, "H2S", "Negative")
303
 
304
- # --- DNase universal coverage ---
305
- # Positive forms
306
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
307
  _set_if_stronger(parsed, "DNase", "Positive")
308
-
309
  if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
310
  _set_if_stronger(parsed, "DNase", "Positive")
311
-
312
- # Negative forms
313
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
314
  _set_if_stronger(parsed, "DNase", "Negative")
315
-
316
  if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
317
  _set_if_stronger(parsed, "DNase", "Negative")
318
-
319
- # non-DNase-producing
320
  if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
321
  _set_if_stronger(parsed, "DNase", "Negative")
322
 
323
- # --- NEW: Gelatinase → Gelatin Hydrolysis ---
324
- # Explicit mapping just in case generic patterns miss it
325
  if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
326
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
327
-
328
  if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
329
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
330
 
@@ -334,7 +299,6 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
334
  # ------------------------------------------------------------
335
 
336
  def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
337
- # Motility
338
  if (
339
  re.search(r"\bmotile\b", text_lc)
340
  and not re.search(r"\bnon[- ]?motile\b", text_lc)
@@ -351,7 +315,6 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
351
  ):
352
  _set_if_stronger(parsed, "Motility", "Negative")
353
 
354
- # Specific motility phrases: tumbling, swarming, corkscrew
355
  if (
356
  "tumbling motility" in text_lc
357
  or "swarming motility" in text_lc
@@ -360,7 +323,6 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
360
  ):
361
  _set_if_stronger(parsed, "Motility", "Positive")
362
 
363
- # Capsule (including "capsule positive/negative")
364
  if (
365
  "capsulated" in text_lc
366
  or "encapsulated" in text_lc
@@ -376,16 +338,13 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
376
  ):
377
  _set_if_stronger(parsed, "Capsule", "Negative")
378
 
379
- # Spore formation
380
- # NEGATIVE FIRST with strict boundaries, then early-return
381
  if (
382
  re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
383
  or "no spores" in text_lc
384
  ):
385
  _set_if_stronger(parsed, "Spore Formation", "Negative")
386
- return # prevent any positive overwrite
387
 
388
- # POSITIVE (must not match the negative form)
389
  if (
390
  re.search(r"\bspore[-\s]?forming\b", text_lc)
391
  or "forms spores" in text_lc
@@ -398,17 +357,9 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
398
  # ------------------------------------------------------------
399
 
400
  def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
401
- """
402
- Robust oxygen parsing:
403
- - Handle facultative first
404
- - Avoid "aerobic" accidentally matching inside "anaerobic"
405
- - Include "aerobically" / "anaerobically"
406
- """
407
- # Facultative first
408
  if re.search(r"facultative(ly)? anaerob", text_lc):
409
  _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
410
 
411
- # Strict anaerobic (before aerobic)
412
  if (
413
  re.search(r"\bobligate anaerob", text_lc)
414
  or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
@@ -416,17 +367,10 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
416
  ):
417
  _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
418
 
419
- # Now handle purely aerobic, avoiding "anaerobic"
420
  if (
421
  re.search(r"\bobligate aerobe\b", text_lc)
422
- or (
423
- re.search(r"\baerobic\b", text_lc)
424
- and "anaerobic" not in text_lc
425
- )
426
- or (
427
- re.search(r"\baerobically\b", text_lc)
428
- and "anaerobically" not in text_lc
429
- )
430
  ):
431
  _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
432
 
@@ -438,20 +382,10 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
438
 
439
 
440
  # ------------------------------------------------------------
441
- # Growth temperature
442
  # ------------------------------------------------------------
443
 
444
  def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
445
- """
446
- Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
447
- We ALWAYS store as "low//high":
448
- - true ranges: "4-45 °C" → "4//45"
449
- - two temps in text: min//max (Option A), e.g.:
450
- "grows at 4 °C but not at 45 °C" → "4//45"
451
- "grows at 42 °C but not at 25 °C" → "25//42"
452
- - single temps: "37 °C" → "37//37"
453
- """
454
- # 1) Explicit ranges like "4-45 °C" or "10–40 °C"
455
  range_pattern = re.compile(
456
  r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
457
  )
@@ -462,7 +396,6 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
462
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
463
  return
464
 
465
- # 2) Option A: any two explicit temps → min//max
466
  temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
467
  if len(temps) >= 2:
468
  nums = [int(t) for t in temps]
@@ -471,9 +404,9 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
471
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
472
  return
473
 
474
- # 3) Single temps like "grows at 37 c"
475
  single_pattern = re.compile(
476
- r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
 
477
  )
478
  m_single = single_pattern.search(text_lc)
479
  if m_single:
@@ -481,14 +414,12 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
481
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
482
  return
483
 
484
- # 4) Simplified: "grows at 37" (no explicit °C)
485
  m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
486
  if m_simple_num:
487
  temp = m_simple_num.group(1)
488
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
489
  return
490
 
491
- # 5) Fallback: plain "37c" somewhere in the text
492
  m_plain = re.search(
493
  r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
494
  text_lc,
@@ -499,48 +430,21 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
499
 
500
 
501
  # ------------------------------------------------------------
502
- # Media grown on (coarse mapping)
503
  # ------------------------------------------------------------
504
 
505
  MEDIA_KEYWORDS = {
506
- "Blood Agar": [
507
- "blood agar",
508
- "blood-agar",
509
- ],
510
- "MacConkey Agar": [
511
- "macconkey agar",
512
- "mac conkey agar",
513
- "macconkey",
514
- ],
515
- "Chocolate Agar": [
516
- "chocolate agar",
517
- "chocolate-agar",
518
- ],
519
- "Nutrient Agar": [
520
- "nutrient agar",
521
- "nutrient-agar",
522
- ],
523
- "XLD Agar": [
524
- "xld agar",
525
- ],
526
- "TCBS Agar": [
527
- "tcbs agar",
528
- "tcbs",
529
- ],
530
- "ALOA": [
531
- "aloa agar",
532
- "aloa",
533
- ],
534
- "BCYE Agar": [
535
- "bcye agar",
536
- "bcye",
537
- ],
538
- "MRS Agar": [
539
- "mrs agar",
540
- ],
541
  }
542
 
543
-
544
  def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
545
  found_media: List[str] = []
546
  for media_name, patterns in MEDIA_KEYWORDS.items():
@@ -553,73 +457,76 @@ def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
553
 
554
 
555
  # ------------------------------------------------------------
556
- # Sugar fermentation parsing
557
  # ------------------------------------------------------------
558
 
559
  def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
560
- """
561
- Handles patterns like:
562
- - "glucose positive, mannitol negative"
563
- - "ferments glucose, mannitol and sucrose but not lactose"
564
- - "does not ferment lactose"
565
- - "non-lactose fermenter"
566
- - "<sugar> fermenter" (positive)
567
- - global non-fermenter phrases
568
- """
569
-
570
- # 0) Simple "glucose positive / negative" style
571
  for sugar_key, field in SUGAR_FIELDS.items():
572
  m_simple = re.search(
573
- rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
574
- text_lc,
575
  )
576
  if m_simple:
577
  val = _value_from_pnv_context(m_simple.group(1))
578
  if val:
579
  _set_if_stronger(parsed, field, val)
580
 
581
- # 0b) NEW: "<sugar> fermenter" → Positive (unless "non-<sugar> fermenter")
 
 
 
 
 
 
 
 
582
  for sugar_key, field in SUGAR_FIELDS.items():
583
- # positive: "lactose fermenter"
584
  if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
585
  rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
586
  ):
587
  _set_if_stronger(parsed, field, "Positive")
 
 
588
 
589
- # 1) "ferments X, Y and Z but not A, B"
590
  ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
591
  for m in ferments_pattern.finditer(text_lc):
592
  seg = m.group(1)
593
- # Split positive vs negative part on "but not"
594
  neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
595
  pos_part = neg_split[0]
596
  neg_part = neg_split[1] if len(neg_split) > 1 else ""
597
 
598
- # Positive sugars from pos_part
599
  for sugar_key, field in SUGAR_FIELDS.items():
600
  if re.search(rf"\b{sugar_key}\b", pos_part):
601
  _set_if_stronger(parsed, field, "Positive")
602
 
603
- # Negative sugars from neg_part
604
  for sugar_key, field in SUGAR_FIELDS.items():
605
  if re.search(rf"\b{sugar_key}\b", neg_part):
606
  _set_if_stronger(parsed, field, "Negative")
607
 
608
- # 2) "does not ferment X"
 
 
 
 
 
 
 
 
609
  for sugar_key, field in SUGAR_FIELDS.items():
610
- if re.search(rf"does (not|n't) ferment {sugar_key}\b", text_lc):
 
 
611
  _set_if_stronger(parsed, field, "Negative")
612
 
613
- # 3) "non-lactose fermenter"
614
  for sugar_key, field in SUGAR_FIELDS.items():
615
- if re.search(rf"non[- ]{sugar_key} ferment(ing|er)?", text_lc):
 
 
616
  _set_if_stronger(parsed, field, "Negative")
617
 
618
- # 4) "X fermentation positive/negative"
619
  for sugar_key, field in SUGAR_FIELDS.items():
620
- # "glucose fermentation positive"
621
  m1 = re.search(
622
- rf"{sugar_key}\s+fermentation[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
 
623
  text_lc,
624
  )
625
  if m1:
@@ -628,9 +535,9 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
628
  _set_if_stronger(parsed, field, val)
629
  continue
630
 
631
- # "positive for glucose fermentation"
632
  m2 = re.search(
633
- rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{sugar_key}\s+fermentation",
 
634
  text_lc,
635
  )
636
  if m2:
@@ -639,10 +546,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
639
  _set_if_stronger(parsed, field, val)
640
  continue
641
 
642
- # 5) Global non-fermenter patterns
 
 
 
 
 
 
 
 
 
 
643
  if (
644
- re.search(r"does (not|n't) ferment (carbohydrates|sugars)", text_lc)
645
- or re.search(r"non[- ]ferment(er|ing|ative)", text_lc)
 
 
646
  ):
647
  for field in SUGAR_FIELDS.values():
648
  if field not in parsed or parsed[field] == UNKNOWN:
@@ -650,18 +569,11 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
650
 
651
 
652
  # ------------------------------------------------------------
653
- # Colony morphology (coarse, optional)
654
  # ------------------------------------------------------------
655
 
656
  def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
657
- """
658
- Very coarse mapping for colony morphology. We try:
659
- - "colonies are yellow, mucoid"
660
- - "colonies dry, white and irregular on nutrient agar"
661
- - "forming green colonies", "forms mucoid colonies"
662
- """
663
- # Pattern 1: "colonies are ..."
664
- m = re.search(r"colon(y|ies) (are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
665
  if m:
666
  desc = m.group(3).strip()
667
  if desc:
@@ -672,7 +584,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
672
  _set_if_stronger(parsed, "Colony Morphology", pretty)
673
  return
674
 
675
- # Pattern 2: "colonies dry, white and irregular on nutrient agar"
676
  m2 = re.search(
677
  r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
678
  text_lc,
@@ -687,7 +598,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
687
  _set_if_stronger(parsed, "Colony Morphology", pretty)
688
  return
689
 
690
- # Pattern 3: "forming green colonies", "forms mucoid colonies"
691
  m3 = re.search(
692
  r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
693
  text_lc,
@@ -707,9 +617,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
707
  # ------------------------------------------------------------
708
 
709
  def parse_text_rules(text: str) -> Dict[str, Any]:
710
- """
711
- Main entry point.
712
- """
713
  original = text or ""
714
  text_clean = _clean_text(original)
715
  text_lc = text_clean.lower()
@@ -734,7 +641,6 @@ def parse_text_rules(text: str) -> Dict[str, Any]:
734
  }
735
 
736
  except Exception as e:
737
- # Fail-safe: never crash the app, just report an error
738
  return {
739
  "parsed_fields": parsed,
740
  "source": "rule_parser",
 
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
+ # Stage 11F (Option A ranges + fixes) + 11H + 11I:
6
+ #
7
  # - Always store Growth Temperature as "low//high"
8
  # • single: 37 → "37//37"
9
  # • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
 
13
  # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
14
  # - NaCl tolerance phrases improved
15
  # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
16
+ #
17
+ # New in this version:
18
  # • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
19
  # • "<sugar> fermenter" → <Sugar> Fermentation = Positive
20
+ # • "<sugar> is positive/negative" handled
21
+ # • "<sugar> fermentation is positive/negative" handled
22
+ # • Grouped "does not ferment lactose and sucrose" handled
23
+ # • Global non-fermenter + explicit positive sugar:
24
+ # "Non-fermenter, ferments glucose weakly"
25
+ # → all sugars Negative *except* Glucose = Positive
26
+ # • Core tests accept "is positive/is negative/is variable"
27
+ # • "H2S production is positive/negative" handled
28
+ # • ONPG phrases like "ONPG is negative" now parsed
29
  # ------------------------------------------------------------
30
 
31
  from __future__ import annotations
 
40
  # Core fields and sugar mapping
41
  # ------------------------------------------------------------
42
 
 
43
  SUGAR_FIELDS: Dict[str, str] = {
44
  "glucose": "Glucose Fermentation",
45
  "lactose": "Lactose Fermentation",
 
56
  }
57
 
58
  CORE_BOOL_FIELDS: Dict[str, List[str]] = {
 
59
  "Catalase": ["catalase"],
60
  "Oxidase": ["oxidase"],
61
  "Indole": ["indole"],
62
  "Urease": ["urease"],
63
  "Citrate": ["citrate"],
 
64
  "Methyl Red": ["methyl red", "mr test", "mr"],
65
  "VP": ["voges-proskauer", "vp test", "vp"],
 
66
  "H2S": ["h2s", "hydrogen sulfide"],
 
67
  "DNase": [
68
+ "dnase", "dnase test", "dnase activity",
69
+ "dnase production", "dnaase", "dna hydrolysis",
 
 
 
 
70
  ],
71
  "ONPG": ["onpg"],
72
  "Coagulase": ["coagulase"],
 
77
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
78
  "Arginine dihydrolase": ["arginine dihydrolase"],
79
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
 
80
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
81
  }
82
 
 
85
  # ------------------------------------------------------------
86
 
87
  def _clean_text(text: str) -> str:
 
 
 
 
 
 
88
  if not text:
89
  return ""
90
  s = text.replace("°", "").replace("º", "")
 
91
  s = s.replace("₂", "2")
 
92
  return " ".join(s.split())
93
 
 
94
  def _norm(s: str) -> str:
95
  return s.strip().lower()
96
 
 
97
  def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
 
 
 
 
 
98
  if not value:
99
  return
100
  if field not in parsed or parsed[field] == UNKNOWN:
101
  parsed[field] = value
102
 
103
+ def _value_from_pnv_token(token: str) -> str | None:
104
+ seg = _norm(token)
 
 
 
 
105
  if seg in ["positive", "pos", "+"]:
106
  return "Positive"
107
  if seg in ["negative", "neg", "-"]:
 
110
  return "Variable"
111
  return None
112
 
113
def _value_from_pnv_context(segment: str) -> str | None:
    """
    Interpret a short phrase as Positive / Negative / Variable.

    The normalised phrase is first tried as a bare token via
    _value_from_pnv_token. If that yields nothing, the phrase is searched
    for an "is <token>" construction and that token is interpreted instead.

    Returns whatever _value_from_pnv_token returns for the recognised token,
    or None when no token is found.
    """
    seg = _norm(segment)
    val = _value_from_pnv_token(seg)
    if val:
        return val
    # The word boundary is applied to the word tokens only. A trailing \b
    # after "+" or "-" requires a following word character, so "is +" at the
    # end of the phrase (or before a space/punctuation) could never match.
    m = re.search(
        r"\bis\s+((?:positive|negative|variable|pos|neg)\b|[+\-])",
        seg,
    )
    if m:
        return _value_from_pnv_token(m.group(1))
    return None
122
 
123
  # ------------------------------------------------------------
124
  # Gram stain and shape
125
  # ------------------------------------------------------------
126
 
127
  def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
 
128
  if "gram-positive" in text_lc or "gram positive" in text_lc:
129
  _set_if_stronger(parsed, "Gram Stain", "Positive")
130
  elif "gram-negative" in text_lc or "gram negative" in text_lc:
 
132
  elif "gram variable" in text_lc:
133
  _set_if_stronger(parsed, "Gram Stain", "Variable")
134
 
 
 
135
  if "short rods" in text_lc:
136
  _set_if_stronger(parsed, "Shape", "Short Rods")
137
 
 
138
  if re.search(r"\bcocci\b", text_lc):
139
  _set_if_stronger(parsed, "Shape", "Cocci")
140
  if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
141
  _set_if_stronger(parsed, "Shape", "Cocci")
142
 
 
143
  if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
144
  _set_if_stronger(parsed, "Shape", "Rods")
145
 
 
146
  if "spiral" in text_lc or "spirochete" in text_lc:
147
  _set_if_stronger(parsed, "Shape", "Spiral")
148
 
 
149
  # ------------------------------------------------------------
150
  # Haemolysis
151
  # ------------------------------------------------------------
152
 
153
  def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
154
  if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
155
  _set_if_stronger(parsed, "Haemolysis Type", "Beta")
156
  _set_if_stronger(parsed, "Haemolysis", "Positive")
157
 
 
158
  if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
159
  _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
160
  _set_if_stronger(parsed, "Haemolysis", "Positive")
161
 
 
162
  if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
163
  _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
164
  _set_if_stronger(parsed, "Haemolysis", "Negative")
165
+
166
  if (
167
  "non-haemolytic" in text_lc
168
  or "non hemolytic" in text_lc
 
171
  _set_if_stronger(parsed, "Haemolysis Type", "None")
172
  _set_if_stronger(parsed, "Haemolysis", "Negative")
173
 
 
174
  if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
175
  _set_if_stronger(parsed, "Haemolysis Type", "Variable")
176
  _set_if_stronger(parsed, "Haemolysis", "Variable")
177
 
 
178
  # ------------------------------------------------------------
179
+ # Core enzyme test parsing
180
  # ------------------------------------------------------------
181
 
182
  def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
 
 
 
183
  for field, keywords in CORE_BOOL_FIELDS.items():
184
  for kw in keywords:
 
185
  m1 = re.search(
186
+ rf"{re.escape(kw)}[ \-]?"
187
+ r"(positive|negative|variable|pos|neg|\+|\-)",
188
  text_lc,
189
  )
190
  if m1:
 
193
  _set_if_stronger(parsed, field, val)
194
  break
195
 
 
196
  m2 = re.search(
197
+ rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
198
+ rf"(for\s+)?{re.escape(kw)}",
199
  text_lc,
200
  )
201
  if m2:
 
204
  _set_if_stronger(parsed, field, val)
205
  break
206
 
207
+ m3 = re.search(
208
+ rf"{re.escape(kw)}\s+is\s+"
209
+ r"(positive|negative|variable|pos|neg|\+|\-)",
210
+ text_lc,
211
+ )
212
+ if m3:
213
+ val = _value_from_pnv_token(m3.group(1))
214
+ if val:
215
+ _set_if_stronger(parsed, field, val)
216
+ break
217
+ # --- NaCl tolerance explicit patterns ---
218
  if field == "NaCl Tolerant (>=6%)":
 
219
  for m in re.finditer(
220
  r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
221
  text_lc,
 
227
  except Exception:
228
  pass
229
 
 
230
  for m in re.finditer(
231
  r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
232
  text_lc,
 
238
  except Exception:
239
  pass
240
 
 
241
  if re.search(
242
  r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
243
  text_lc,
244
  ):
245
  _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
246
 
 
247
  for m in re.finditer(
248
  r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
249
  text_lc,
 
255
  except Exception:
256
  pass
257
 
258
+ # Nitrate
259
  if re.search(r"reduces nitrate", text_lc):
260
  _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
261
  if re.search(r"does (not|n't) reduce nitrate", text_lc):
262
  _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
263
 
264
+ # H2S
 
265
  if re.search(r"(produces|production of)\s+h2s", text_lc):
266
  _set_if_stronger(parsed, "H2S", "Positive")
267
+ if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
268
+ _set_if_stronger(parsed, "H2S", "Positive")
269
+ if re.search(r"h2s production\s+is\s+(negative|neg|\-)", text_lc):
270
+ _set_if_stronger(parsed, "H2S", "Negative")
271
  if (
272
  re.search(r"does (not|n't) produce\s+h2s", text_lc)
273
  or re.search(r"no h2s production", text_lc)
 
275
  ):
276
  _set_if_stronger(parsed, "H2S", "Negative")
277
 
278
+ # DNase
 
279
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
280
  _set_if_stronger(parsed, "DNase", "Positive")
 
281
  if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
282
  _set_if_stronger(parsed, "DNase", "Positive")
 
 
283
  if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
284
  _set_if_stronger(parsed, "DNase", "Negative")
 
285
  if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
286
  _set_if_stronger(parsed, "DNase", "Negative")
 
 
287
  if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
288
  _set_if_stronger(parsed, "DNase", "Negative")
289
 
290
+ # NEW: Gelatinase mapping
 
291
  if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
292
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
 
293
  if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
294
  _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
295
 
 
299
  # ------------------------------------------------------------
300
 
301
  def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
 
302
  if (
303
  re.search(r"\bmotile\b", text_lc)
304
  and not re.search(r"\bnon[- ]?motile\b", text_lc)
 
315
  ):
316
  _set_if_stronger(parsed, "Motility", "Negative")
317
 
 
318
  if (
319
  "tumbling motility" in text_lc
320
  or "swarming motility" in text_lc
 
323
  ):
324
  _set_if_stronger(parsed, "Motility", "Positive")
325
 
 
326
  if (
327
  "capsulated" in text_lc
328
  or "encapsulated" in text_lc
 
338
  ):
339
  _set_if_stronger(parsed, "Capsule", "Negative")
340
 
 
 
341
  if (
342
  re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
343
  or "no spores" in text_lc
344
  ):
345
  _set_if_stronger(parsed, "Spore Formation", "Negative")
346
+ return
347
 
 
348
  if (
349
  re.search(r"\bspore[-\s]?forming\b", text_lc)
350
  or "forms spores" in text_lc
 
357
  # ------------------------------------------------------------
358
 
359
  def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
360
  if re.search(r"facultative(ly)? anaerob", text_lc):
361
  _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
362
 
 
363
  if (
364
  re.search(r"\bobligate anaerob", text_lc)
365
  or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
 
367
  ):
368
  _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
369
 
 
370
  if (
371
  re.search(r"\bobligate aerobe\b", text_lc)
372
+ or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
373
+ or (re.search(r"\baerobically\b", text_lc) and "anaerobically" not in text_lc)
 
 
 
 
 
 
374
  ):
375
  _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
376
 
 
382
 
383
 
384
  # ------------------------------------------------------------
385
+ # Growth Temperature
386
  # ------------------------------------------------------------
387
 
388
  def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
 
 
 
389
  range_pattern = re.compile(
390
  r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
391
  )
 
396
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
397
  return
398
 
 
399
  temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
400
  if len(temps) >= 2:
401
  nums = [int(t) for t in temps]
 
404
  _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
405
  return
406
 
 
407
  single_pattern = re.compile(
408
+ r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*"
409
+ r"(?:c|°c|degrees c|degrees celsius)"
410
  )
411
  m_single = single_pattern.search(text_lc)
412
  if m_single:
 
414
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
415
  return
416
 
 
417
  m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
418
  if m_simple_num:
419
  temp = m_simple_num.group(1)
420
  _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
421
  return
422
 
 
423
  m_plain = re.search(
424
  r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
425
  text_lc,
 
430
 
431
 
432
  # ------------------------------------------------------------
433
+ # Media Grown On
434
  # ------------------------------------------------------------
435
 
436
  MEDIA_KEYWORDS = {
437
+ "Blood Agar": ["blood agar", "blood-agar"],
438
+ "MacConkey Agar": ["macconkey agar", "mac conkey agar", "macconkey"],
439
+ "Chocolate Agar": ["chocolate agar", "chocolate-agar"],
440
+ "Nutrient Agar": ["nutrient agar", "nutrient-agar"],
441
+ "XLD Agar": ["xld agar"],
442
+ "TCBS Agar": ["tcbs agar", "tcbs"],
443
+ "ALOA": ["aloa agar", "aloa"],
444
+ "BCYE Agar": ["bcye agar", "bcye"],
445
+ "MRS Agar": ["mrs agar"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  }
447
 
 
448
  def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
449
  found_media: List[str] = []
450
  for media_name, patterns in MEDIA_KEYWORDS.items():
 
457
 
458
 
459
  # ------------------------------------------------------------
460
+ # Sugar fermentation
461
  # ------------------------------------------------------------
462
 
463
  def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
 
 
 
 
 
 
 
 
 
 
 
464
  for sugar_key, field in SUGAR_FIELDS.items():
465
  m_simple = re.search(
466
+ rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc
 
467
  )
468
  if m_simple:
469
  val = _value_from_pnv_context(m_simple.group(1))
470
  if val:
471
  _set_if_stronger(parsed, field, val)
472
 
473
+ m_is = re.search(
474
+ rf"{sugar_key}\s+is\s+(positive|negative|variable|pos|neg|\+|\-)",
475
+ text_lc,
476
+ )
477
+ if m_is:
478
+ val = _value_from_pnv_token(m_is.group(1))
479
+ if val:
480
+ _set_if_stronger(parsed, field, val)
481
+
482
  for sugar_key, field in SUGAR_FIELDS.items():
 
483
  if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
484
  rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
485
  ):
486
  _set_if_stronger(parsed, field, "Positive")
487
+ if re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc):
488
+ _set_if_stronger(parsed, field, "Negative")
489
 
 
490
  ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
491
  for m in ferments_pattern.finditer(text_lc):
492
  seg = m.group(1)
 
493
  neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
494
  pos_part = neg_split[0]
495
  neg_part = neg_split[1] if len(neg_split) > 1 else ""
496
 
 
497
  for sugar_key, field in SUGAR_FIELDS.items():
498
  if re.search(rf"\b{sugar_key}\b", pos_part):
499
  _set_if_stronger(parsed, field, "Positive")
500
 
 
501
  for sugar_key, field in SUGAR_FIELDS.items():
502
  if re.search(rf"\b{sugar_key}\b", neg_part):
503
  _set_if_stronger(parsed, field, "Negative")
504
 
505
+ grouped_neg_pattern = re.compile(
506
+ r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+)"
507
+ )
508
+ for m in grouped_neg_pattern.finditer(text_lc):
509
+ seg = m.group(1)
510
+ for sugar_key, field in SUGAR_FIELDS.items():
511
+ if re.search(rf"\b{sugar_key}\b", seg):
512
+ _set_if_stronger(parsed, field, "Negative")
513
+
514
  for sugar_key, field in SUGAR_FIELDS.items():
515
+ if re.search(
516
+ rf"does\s+(?:not|n't)\s+ferment\s+{sugar_key}\b", text_lc
517
+ ):
518
  _set_if_stronger(parsed, field, "Negative")
519
 
 
520
  for sugar_key, field in SUGAR_FIELDS.items():
521
+ if re.search(
522
+ rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc
523
+ ):
524
  _set_if_stronger(parsed, field, "Negative")
525
 
 
526
  for sugar_key, field in SUGAR_FIELDS.items():
 
527
  m1 = re.search(
528
+ rf"{sugar_key}\s+fermentation[ \-]?"
529
+ r"(positive|negative|variable|pos|neg|\+|\-)",
530
  text_lc,
531
  )
532
  if m1:
 
535
  _set_if_stronger(parsed, field, val)
536
  continue
537
 
 
538
  m2 = re.search(
539
+ rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
540
+ rf"(for\s+)?{sugar_key}\s+fermentation",
541
  text_lc,
542
  )
543
  if m2:
 
546
  _set_if_stronger(parsed, field, val)
547
  continue
548
 
549
+ m3 = re.search(
550
+ rf"{sugar_key}\s+fermentation\s+is\s+"
551
+ r"(positive|negative|variable|pos|neg|\+|\-)",
552
+ text_lc,
553
+ )
554
+ if m3:
555
+ val = _value_from_pnv_token(m3.group(1))
556
+ if val:
557
+ _set_if_stronger(parsed, field, val)
558
+ continue
559
+
560
  if (
561
+ re.search(
562
+ r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
563
+ )
564
+ or re.search(r"\bnon[- ]ferment(er|ing|ative)\b", text_lc)
565
  ):
566
  for field in SUGAR_FIELDS.values():
567
  if field not in parsed or parsed[field] == UNKNOWN:
 
569
 
570
 
571
  # ------------------------------------------------------------
572
+ # Colony morphology
573
  # ------------------------------------------------------------
574
 
575
  def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
576
+ m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
 
 
 
 
 
 
 
577
  if m:
578
  desc = m.group(3).strip()
579
  if desc:
 
584
  _set_if_stronger(parsed, "Colony Morphology", pretty)
585
  return
586
 
 
587
  m2 = re.search(
588
  r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
589
  text_lc,
 
598
  _set_if_stronger(parsed, "Colony Morphology", pretty)
599
  return
600
 
 
601
  m3 = re.search(
602
  r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
603
  text_lc,
 
617
  # ------------------------------------------------------------
618
 
619
  def parse_text_rules(text: str) -> Dict[str, Any]:
 
 
 
620
  original = text or ""
621
  text_clean = _clean_text(original)
622
  text_lc = text_clean.lower()
 
641
  }
642
 
643
  except Exception as e:
 
644
  return {
645
  "parsed_fields": parsed,
646
  "source": "rule_parser",