Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Nov 24, 2025

Commit

8927925

verified ·

1 Parent(s): 1475dff

Update engine/parser_rules.py

Browse files

Files changed (1) hide show

engine/parser_rules.py +99 -26

engine/parser_rules.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # ------------------------------------------------------------
 # Rule-based core parser for microbiology descriptions.
 #
-# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11K + 11L:
 #
 # - Always store Growth Temperature as "low//high"
 #   • single: 37 → "37//37"
@@ -26,7 +26,9 @@
 #     • grouped "does not ferment lactose and sucrose"
 #       (without nuking glucose in "but glucose positive")
 #     • global "non-fermenter" → all sugars Negative (Unknown-only)
-#     • NEW: "asaccharolytic" → all sugars Negative (Unknown-only)
 #
 # - Core tests:
 #     • "<kw> positive/negative"
@@ -37,14 +39,19 @@
 #     • "<kw> test reaction is positive/negative"
 #     • "ONPG is negative" handled via core patterns
 #     • "H2S production is positive/negative"
-#     • NEW: "MR and VP negative/positive" → both set
 #
 # - Decarboxylases:
 #     • "all decarboxylases negative/positive"
 #       → Lysine / Ornithine / Arginine dihydrolase set accordingly
 #       (Unknown-only; explicit values can override later)
-#     • "lysine, ornithine and arginine dihydrolase negative"
-#       → list-based assignment for the three decarboxylases
 #
 # - Capsule / Motility:
 #     • "capsule present"/"capsule is present" → Capsule Positive
@@ -55,6 +62,10 @@
 # - Gelatin / Esculin:
 #     • "gelatin positive/negative" → Gelatin Hydrolysis
 #     • "esculin positive/negative" → Esculin Hydrolysis
 # ------------------------------------------------------------
 from __future__ import annotations
@@ -111,11 +122,12 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
     "Lipase Test": ["lipase"],
     "Nitrate Reduction": ["nitrate reduction", "nitrate"],
     "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
     "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
     "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
     "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
     "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
-    # Esculin Hydrolysis: also match plain "esculin"
     "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
 }
@@ -204,10 +216,14 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
         _set_if_stronger(parsed, "Gram Stain", "Variable")
     # Shape
-    # Prefer "short rods" over generic rods
     if "short rods" in text_lc:
         _set_if_stronger(parsed, "Shape", "Short Rods")
     # Cocci and variants (diplococci, tetracocci, etc.)
     if re.search(r"\bcocci\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
@@ -282,7 +298,10 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
       - gelatinase / gelatin → Gelatin Hydrolysis
       - esculin → Esculin Hydrolysis
       - grouped MR/VP: "MR and VP negative"
-      - decarboxylase global + list phrases
     """
     for field, keywords in CORE_BOOL_FIELDS.items():
         for kw in keywords:
@@ -470,9 +489,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
         _assign_mr_vp(name1)
         _assign_mr_vp(name2)
-    # --- Decarboxylases global + list logic ---
-    # 1) "all decarboxylases negative/positive"
     m_all_decarb = re.search(
         r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
         text_lc,
@@ -483,24 +500,62 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
             for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
                 _set_if_stronger(parsed, f, val)
-    # 2) List-style: "lysine, ornithine and arginine dihydrolase negative"
-    #    We look for any phrase ending in a P/N and then check for the names inside.
-    for m in re.finditer(
-        r"([a-z0-9 ,/]+?)\s+(?:decarboxylases?|dihydrolase[s]?)?\s*(?:are\s+)?"
-        r"(positive|negative|variable|pos|neg|\+|\-)",
-        text_lc,
-    ):
         seg = m.group(1)
         val = _value_from_pnv_token(m.group(2))
         if not val:
             continue
-        if "lysine" in seg:
-            _set_if_stronger(parsed, "Lysine Decarboxylase", val)
-        if "ornithine" in seg:
-            _set_if_stronger(parsed, "Ornitihine Decarboxylase", val)
-        if "arginine" in seg:
-            _set_if_stronger(parsed, "Arginine dihydrolase", val)
 # ------------------------------------------------------------
@@ -755,7 +810,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
       - "<sugar> is positive/negative"
       - "<sugar> fermentation is positive/negative"
       - global non-fermenter phrases
-      - NEW: "asaccharolytic" → all sugars Negative (Unknown-only)
     """
     # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
@@ -812,6 +868,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
                 _set_if_stronger(parsed, field, "Negative")
     # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
     grouped_neg_pattern = re.compile(
         r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
     )
@@ -874,6 +932,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
                 continue
     # 6) Global non-fermenter phrases
     if (
         re.search(
             r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
@@ -894,9 +954,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
             if field not in parsed or parsed[field] == UNKNOWN:
                 _set_if_stronger(parsed, field, "Negative")
 # ------------------------------------------------------------
-# Colony morphology
 # ------------------------------------------------------------
 def _normalise_colony_desc(desc: str) -> str:

 # ------------------------------------------------------------
 # Rule-based core parser for microbiology descriptions.
 #
+# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11L + 11M:
 #
 # - Always store Growth Temperature as "low//high"
 #   • single: 37 → "37//37"
 #     • grouped "does not ferment lactose and sucrose"
 #       (without nuking glucose in "but glucose positive")
 #     • global "non-fermenter" → all sugars Negative (Unknown-only)
+#     • "asaccharolytic" → all sugars Negative (Unknown-only)
+#     • NEW (11M): "all other sugars negative/positive" → every sugar that is
+#       still unset or Unknown takes that value (explicit results are kept)
 #
 # - Core tests:
 #     • "<kw> positive/negative"
 #     • "<kw> test reaction is positive/negative"
 #     • "ONPG is negative" handled via core patterns
 #     • "H2S production is positive/negative"
+#     • "MR and VP negative/positive" → both set
+#     • NEW (11M): grouped phrases like
+#         "gelatin and esculin hydrolysis negative"
+#         "lysine, ornithine and arginine negative"
+#       → all mentioned tests / sugars set to the given value
 #
 # - Decarboxylases:
 #     • "all decarboxylases negative/positive"
 #       → Lysine / Ornithine / Arginine dihydrolase set accordingly
 #       (Unknown-only; explicit values can override later)
+#     • list-style "lysine, ornithine and arginine negative" now handled
+#       by the universal grouped-phrase logic (old decarb-specific
+#       list logic removed).
 #
 # - Capsule / Motility:
 #     • "capsule present"/"capsule is present" → Capsule Positive
 # - Gelatin / Esculin:
 #     • "gelatin positive/negative" → Gelatin Hydrolysis
 #     • "esculin positive/negative" → Esculin Hydrolysis
+#     • grouped "gelatin and esculin hydrolysis negative" handled
+#
+# - Shape:
+#     • "coccobacilli / coccobacillus" → Shape = Short Rods
 # ------------------------------------------------------------
 from __future__ import annotations
     "Lipase Test": ["lipase"],
     "Nitrate Reduction": ["nitrate reduction", "nitrate"],
     "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
+    # Decarboxylases (also match plain amino acid words)
     "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
     "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
     "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
+    # Gelatin / Esculin
     "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
     "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
 }
         _set_if_stronger(parsed, "Gram Stain", "Variable")
     # Shape
+    # Prefer "short rods" / coccobacilli over generic rods
     if "short rods" in text_lc:
         _set_if_stronger(parsed, "Shape", "Short Rods")
+    # NEW: coccobacilli → Short Rods
+    if re.search(r"\bcoccobacill(?:us|i)\b", text_lc):
+        _set_if_stronger(parsed, "Shape", "Short Rods")
     # Cocci and variants (diplococci, tetracocci, etc.)
     if re.search(r"\bcocci\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
       - gelatinase / gelatin → Gelatin Hydrolysis
       - esculin → Esculin Hydrolysis
       - grouped MR/VP: "MR and VP negative"
+      - decarboxylase global phrases
+      - NEW: generic grouped phrases
+        "gelatin and esculin hydrolysis negative"
+        "lysine, ornithine and arginine negative"
     """
     for field, keywords in CORE_BOOL_FIELDS.items():
         for kw in keywords:
         _assign_mr_vp(name1)
         _assign_mr_vp(name2)
+    # --- Decarboxylases global "all decarboxylases negative/positive" ---
     m_all_decarb = re.search(
         r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
         text_lc,
             for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
                 _set_if_stronger(parsed, f, val)
+    # --- NEW (11M): Generic grouped list logic for tests & sugars ---
+    #
+    # Handles things like:
+    #   "gelatin and esculin hydrolysis negative"
+    #   "lysine, ornithine and arginine negative"
+    #   "indole, urease and citrate positive"
+    #   "raffinose and inositol negative"
+    #
+    # Strategy:
+    #   1) Capture a list segment and a trailing P/N token.
+    #   2) Only keep it if the segment contains at least one known
+    #      CORE_BOOL_FIELDS keyword or sugar name.
+    #   3) Apply the value to all matching tests + sugars in that segment.
+    #
+    grouped_tests_pattern = re.compile(
+        r"([a-z0-9 ,/&\-]+?)\s+"
+        r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
+        r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
+    )
+    for m in grouped_tests_pattern.finditer(text_lc):
         seg = m.group(1)
         val = _value_from_pnv_token(m.group(2))
         if not val:
             continue
+        seg_lc = seg.lower()
+        # Quick filter: does this segment contain any known test/sugar keyword?
+        has_any = False
+        for _, keywords in CORE_BOOL_FIELDS.items():
+            if any(re.search(rf"\b{re.escape(kw)}\b", seg_lc) for kw in keywords):
+                has_any = True
+                break
+        if not has_any:
+            for sugar_key in SUGAR_FIELDS.keys():
+                if re.search(rf"\b{sugar_key}\b", seg_lc):
+                    has_any = True
+                    break
+        if not has_any:
+            continue  # ignore segments unrelated to tests/sugars
+        # Apply to all matching core boolean tests
+        for field, keywords in CORE_BOOL_FIELDS.items():
+            for kw in keywords:
+                if re.search(rf"\b{re.escape(kw)}\b", seg_lc):
+                    _set_if_stronger(parsed, field, val)
+                    break
+        # Apply to all matching sugars
+        for sugar_key, field in SUGAR_FIELDS.items():
+            if re.search(rf"\b{sugar_key}\b", seg_lc):
+                _set_if_stronger(parsed, field, val)
 # ------------------------------------------------------------
       - "<sugar> is positive/negative"
       - "<sugar> fermentation is positive/negative"
       - global non-fermenter phrases
+      - "asaccharolytic" → all sugars Negative (Unknown-only)
+      - "all other sugars negative/positive" → remaining (Unknown) sugars set to that value
     """
     # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
                 _set_if_stronger(parsed, field, "Negative")
     # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
+    #    Prevents glucose being accidentally marked negative in:
+    #      "does not ferment lactose or sucrose, but glucose fermentation is positive"
     grouped_neg_pattern = re.compile(
         r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
     )
                 continue
     # 6) Global non-fermenter phrases
+    #     e.g. "non-fermenter", "does not ferment sugars"
+    #     → set all sugars Negative *unless* already set by a more specific rule.
     if (
         re.search(
             r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
             if field not in parsed or parsed[field] == UNKNOWN:
                 _set_if_stronger(parsed, field, "Negative")
+    # 8) NEW (11M): "all other sugars negative/positive"
+    #     "Other sugars" = any sugar field that is missing from `parsed` or still Unknown.
+    m_other = re.search(
+        r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
+        text_lc,
+    )
+    if m_other:
+        val = _value_from_pnv_token(m_other.group(1))
+        if val:
+            for field in SUGAR_FIELDS.values():
+                if field not in parsed or parsed[field] == UNKNOWN:
+                    _set_if_stronger(parsed, field, val)
 # ------------------------------------------------------------
+# Colony morphology (coarse, optional)
 # ------------------------------------------------------------
 def _normalise_colony_desc(desc: str) -> str: