EphAsad committed on
Commit
8927925
·
verified ·
1 Parent(s): 1475dff

Update engine/parser_rules.py

Browse files
Files changed (1) hide show
  1. engine/parser_rules.py +99 -26
engine/parser_rules.py CHANGED
@@ -2,7 +2,7 @@
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
- # Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11K + 11L:
6
  #
7
  # - Always store Growth Temperature as "low//high"
8
  # • single: 37 → "37//37"
@@ -26,7 +26,9 @@
26
  # • grouped "does not ferment lactose and sucrose"
27
  # (without nuking glucose in "but glucose positive")
28
  # • global "non-fermenter" → all sugars Negative (Unknown-only)
29
- # • NEW: "asaccharolytic" → all sugars Negative (Unknown-only)
 
 
30
  #
31
  # - Core tests:
32
  # • "<kw> positive/negative"
@@ -37,14 +39,19 @@
37
  # • "<kw> test reaction is positive/negative"
38
  # • "ONPG is negative" handled via core patterns
39
  # • "H2S production is positive/negative"
40
- # • NEW: "MR and VP negative/positive" → both set
 
 
 
 
41
  #
42
  # - Decarboxylases:
43
  # • "all decarboxylases negative/positive"
44
  # → Lysine / Ornithine / Arginine dihydrolase set accordingly
45
  # (Unknown-only; explicit values can override later)
46
- # • "lysine, ornithine and arginine dihydrolase negative"
47
- # → list-based assignment for the three decarboxylases
 
48
  #
49
  # - Capsule / Motility:
50
  # • "capsule present"/"capsule is present" → Capsule Positive
@@ -55,6 +62,10 @@
55
  # - Gelatin / Esculin:
56
  # • "gelatin positive/negative" → Gelatin Hydrolysis
57
  # • "esculin positive/negative" → Esculin Hydrolysis
 
 
 
 
58
  # ------------------------------------------------------------
59
 
60
  from __future__ import annotations
@@ -111,11 +122,12 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
111
  "Lipase Test": ["lipase"],
112
  "Nitrate Reduction": ["nitrate reduction", "nitrate"],
113
  "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
 
114
  "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
115
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
116
  "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
 
117
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
118
- # Esculin Hydrolysis: also match plain "esculin"
119
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
120
  }
121
 
@@ -204,10 +216,14 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
204
  _set_if_stronger(parsed, "Gram Stain", "Variable")
205
 
206
  # Shape
207
- # Prefer "short rods" over generic rods
208
  if "short rods" in text_lc:
209
  _set_if_stronger(parsed, "Shape", "Short Rods")
210
 
 
 
 
 
211
  # Cocci and variants (diplococci, tetracocci, etc.)
212
  if re.search(r"\bcocci\b", text_lc):
213
  _set_if_stronger(parsed, "Shape", "Cocci")
@@ -282,7 +298,10 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
282
  - gelatinase / gelatin → Gelatin Hydrolysis
283
  - esculin → Esculin Hydrolysis
284
  - grouped MR/VP: "MR and VP negative"
285
- - decarboxylase global + list phrases
 
 
 
286
  """
287
  for field, keywords in CORE_BOOL_FIELDS.items():
288
  for kw in keywords:
@@ -470,9 +489,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
470
  _assign_mr_vp(name1)
471
  _assign_mr_vp(name2)
472
 
473
- # --- Decarboxylases global + list logic ---
474
-
475
- # 1) "all decarboxylases negative/positive"
476
  m_all_decarb = re.search(
477
  r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
478
  text_lc,
@@ -483,24 +500,62 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
483
  for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
484
  _set_if_stronger(parsed, f, val)
485
 
486
- # 2) List-style: "lysine, ornithine and arginine dihydrolase negative"
487
- # We look for any phrase ending in a P/N and then check for the names inside.
488
- for m in re.finditer(
489
- r"([a-z0-9 ,/]+?)\s+(?:decarboxylases?|dihydrolase[s]?)?\s*(?:are\s+)?"
490
- r"(positive|negative|variable|pos|neg|\+|\-)",
491
- text_lc,
492
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  seg = m.group(1)
494
  val = _value_from_pnv_token(m.group(2))
495
  if not val:
496
  continue
497
 
498
- if "lysine" in seg:
499
- _set_if_stronger(parsed, "Lysine Decarboxylase", val)
500
- if "ornithine" in seg:
501
- _set_if_stronger(parsed, "Ornitihine Decarboxylase", val)
502
- if "arginine" in seg:
503
- _set_if_stronger(parsed, "Arginine dihydrolase", val)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
 
506
  # ------------------------------------------------------------
@@ -755,7 +810,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
755
  - "<sugar> is positive/negative"
756
  - "<sugar> fermentation is positive/negative"
757
  - global non-fermenter phrases
758
- - NEW: "asaccharolytic" → all sugars Negative (Unknown-only)
 
759
  """
760
 
761
  # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
@@ -812,6 +868,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
812
  _set_if_stronger(parsed, field, "Negative")
813
 
814
  # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
 
 
815
  grouped_neg_pattern = re.compile(
816
  r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
817
  )
@@ -874,6 +932,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
874
  continue
875
 
876
  # 6) Global non-fermenter phrases
 
 
877
  if (
878
  re.search(
879
  r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
@@ -894,9 +954,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
894
  if field not in parsed or parsed[field] == UNKNOWN:
895
  _set_if_stronger(parsed, field, "Negative")
896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
 
898
  # ------------------------------------------------------------
899
- # Colony morphology
900
  # ------------------------------------------------------------
901
 
902
  def _normalise_colony_desc(desc: str) -> str:
 
2
  # ------------------------------------------------------------
3
  # Rule-based core parser for microbiology descriptions.
4
  #
5
+ # Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11L + 11M:
6
  #
7
  # - Always store Growth Temperature as "low//high"
8
  # • single: 37 → "37//37"
 
26
  # • grouped "does not ferment lactose and sucrose"
27
  # (without nuking glucose in "but glucose positive")
28
  # • global "non-fermenter" → all sugars Negative (Unknown-only)
29
+ # • "asaccharolytic" → all sugars Negative (Unknown-only)
30
+ # • NEW (11M): "all other sugars negative" → all remaining sugars Negative
31
+ # (Unknown-only; no hard rewrite)
32
  #
33
  # - Core tests:
34
  # • "<kw> positive/negative"
 
39
  # • "<kw> test reaction is positive/negative"
40
  # • "ONPG is negative" handled via core patterns
41
  # • "H2S production is positive/negative"
42
+ # • "MR and VP negative/positive" → both set
43
+ # • NEW (11M): grouped phrases like
44
+ # "gelatin and esculin hydrolysis negative"
45
+ # "lysine, ornithine and arginine negative"
46
+ # → all mentioned tests / sugars set to the given value
47
  #
48
  # - Decarboxylases:
49
  # • "all decarboxylases negative/positive"
50
  # → Lysine / Ornithine / Arginine dihydrolase set accordingly
51
  # (Unknown-only; explicit values can override later)
52
+ # • list-style "lysine, ornithine and arginine negative" now handled
53
+ # by the universal grouped-phrase logic (old decarb-specific
54
+ # list logic removed).
55
  #
56
  # - Capsule / Motility:
57
  # • "capsule present"/"capsule is present" → Capsule Positive
 
62
  # - Gelatin / Esculin:
63
  # • "gelatin positive/negative" → Gelatin Hydrolysis
64
  # • "esculin positive/negative" → Esculin Hydrolysis
65
+ # • grouped "gelatin and esculin hydrolysis negative" handled
66
+ #
67
+ # - Shape:
68
+ # • "coccobacilli / coccobacillus" → Shape = Short Rods
69
  # ------------------------------------------------------------
70
 
71
  from __future__ import annotations
 
122
  "Lipase Test": ["lipase"],
123
  "Nitrate Reduction": ["nitrate reduction", "nitrate"],
124
  "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
125
+ # Decarboxylases (also match plain amino acid words)
126
  "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
127
  "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
128
  "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
129
+ # Gelatin / Esculin
130
  "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
 
131
  "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
132
  }
133
 
 
216
  _set_if_stronger(parsed, "Gram Stain", "Variable")
217
 
218
  # Shape
219
+ # Prefer "short rods" / coccobacilli over generic rods
220
  if "short rods" in text_lc:
221
  _set_if_stronger(parsed, "Shape", "Short Rods")
222
 
223
+ # NEW: coccobacilli → Short Rods
224
+ if re.search(r"\bcoccobacill(?:us|i)\b", text_lc):
225
+ _set_if_stronger(parsed, "Shape", "Short Rods")
226
+
227
  # Cocci and variants (diplococci, tetracocci, etc.)
228
  if re.search(r"\bcocci\b", text_lc):
229
  _set_if_stronger(parsed, "Shape", "Cocci")
 
298
  - gelatinase / gelatin → Gelatin Hydrolysis
299
  - esculin → Esculin Hydrolysis
300
  - grouped MR/VP: "MR and VP negative"
301
+ - decarboxylase global phrases
302
+ - NEW: generic grouped phrases
303
+ "gelatin and esculin hydrolysis negative"
304
+ "lysine, ornithine and arginine negative"
305
  """
306
  for field, keywords in CORE_BOOL_FIELDS.items():
307
  for kw in keywords:
 
489
  _assign_mr_vp(name1)
490
  _assign_mr_vp(name2)
491
 
492
+ # --- Decarboxylases global "all decarboxylases negative/positive" ---
 
 
493
  m_all_decarb = re.search(
494
  r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
495
  text_lc,
 
500
  for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
501
  _set_if_stronger(parsed, f, val)
502
 
503
+ # --- NEW (11M): Generic grouped list logic for tests & sugars ---
504
+ #
505
+ # Handles things like:
506
+ # "gelatin and esculin hydrolysis negative"
507
+ # "lysine, ornithine and arginine negative"
508
+ # "indole, urease and citrate positive"
509
+ # "raffinose and inositol negative"
510
+ #
511
+ # Strategy:
512
+ # 1) Capture a list segment and a trailing P/N token.
513
+ # 2) Only keep it if the segment contains at least one known
514
+ # CORE_BOOL_FIELDS keyword or sugar name.
515
+ # 3) Apply the value to all matching tests + sugars in that segment.
516
+ #
517
+ grouped_tests_pattern = re.compile(
518
+ r"([a-z0-9 ,/&\-]+?)\s+"
519
+ r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
520
+ r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
521
+ )
522
+
523
+ for m in grouped_tests_pattern.finditer(text_lc):
524
  seg = m.group(1)
525
  val = _value_from_pnv_token(m.group(2))
526
  if not val:
527
  continue
528
 
529
+ seg_lc = seg.lower()
530
+
531
+ # Quick filter: does this segment contain any known test/sugar keyword?
532
+ has_any = False
533
+
534
+ for _, keywords in CORE_BOOL_FIELDS.items():
535
+ if any(re.search(rf"\b{re.escape(kw)}\b", seg_lc) for kw in keywords):
536
+ has_any = True
537
+ break
538
+
539
+ if not has_any:
540
+ for sugar_key in SUGAR_FIELDS.keys():
541
+ if re.search(rf"\b{sugar_key}\b", seg_lc):
542
+ has_any = True
543
+ break
544
+
545
+ if not has_any:
546
+ continue # ignore segments unrelated to tests/sugars
547
+
548
+ # Apply to all matching core boolean tests
549
+ for field, keywords in CORE_BOOL_FIELDS.items():
550
+ for kw in keywords:
551
+ if re.search(rf"\b{re.escape(kw)}\b", seg_lc):
552
+ _set_if_stronger(parsed, field, val)
553
+ break
554
+
555
+ # Apply to all matching sugars
556
+ for sugar_key, field in SUGAR_FIELDS.items():
557
+ if re.search(rf"\b{sugar_key}\b", seg_lc):
558
+ _set_if_stronger(parsed, field, val)
559
 
560
 
561
  # ------------------------------------------------------------
 
810
  - "<sugar> is positive/negative"
811
  - "<sugar> fermentation is positive/negative"
812
  - global non-fermenter phrases
813
+ - "asaccharolytic" → all sugars Negative (Unknown-only)
814
+ - "all other sugars negative" → remaining sugars Negative
815
  """
816
 
817
  # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
 
868
  _set_if_stronger(parsed, field, "Negative")
869
 
870
  # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
871
+ # Prevents glucose being accidentally marked negative in:
872
+ # "does not ferment lactose or sucrose, but glucose fermentation is positive"
873
  grouped_neg_pattern = re.compile(
874
  r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
875
  )
 
932
  continue
933
 
934
  # 6) Global non-fermenter phrases
935
+ # e.g. "non-fermenter", "does not ferment sugars"
936
+ # → set all sugars Negative *unless* already set by a more specific rule.
937
  if (
938
  re.search(
939
  r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
 
954
  if field not in parsed or parsed[field] == UNKNOWN:
955
  _set_if_stronger(parsed, field, "Negative")
956
 
957
+ # 8) NEW (11M): "all other sugars negative/positive"
958
+ # we treat "other sugars" as "any sugar not already explicitly set"
959
+ m_other = re.search(
960
+ r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
961
+ text_lc,
962
+ )
963
+ if m_other:
964
+ val = _value_from_pnv_token(m_other.group(1))
965
+ if val:
966
+ for field in SUGAR_FIELDS.values():
967
+ if field not in parsed or parsed[field] == UNKNOWN:
968
+ _set_if_stronger(parsed, field, val)
969
+
970
 
971
  # ------------------------------------------------------------
972
+ # Colony morphology (coarse, optional)
973
  # ------------------------------------------------------------
974
 
975
  def _normalise_colony_desc(desc: str) -> str: