Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Nov 17, 2025

Commit

33342cc

verified ·

1 Parent(s): eaf9a2b

Update engine/parser_rules.py

Browse files

Files changed (1) hide show

engine/parser_rules.py +717 -694

engine/parser_rules.py CHANGED Viewed

@@ -1,699 +1,722 @@
-# engine/parser_rules.py
-# ------------------------------------------------------------
-# Rule-based core parser for microbiology descriptions.
-#
-# Stage 11E: built on the 0.68-accuracy version, with
-# targeted fixes for:
-# - MR → Methyl Red ("mr" keyword)
-# - Motility (motile/non-motile/nonmotile/immotile +
-#   tumbling/swarming/corkscrew motility)
-# - H₂S (subscript 2 normalised) + "produces/doesn't produce H2S"
-# - "reduces nitrate" / "does not reduce nitrate"
-# - Oxygen parsing: avoid "aerobic" grabbing "anaerobic"
-# - Non-spore-forming → Negative (and avoid non- prefix mistake)
-# - Capsule "capsule positive/negative" phrasing
-# - NaCl tolerance in phrases like "in 6.5% NaCl"
-# - Growth Temperature always as "X//X" (including single temps)
-# - Sugar phrases "glucose positive" etc. → Fermentation fields
-# - Global non-fermenter patterns → all sugars Negative (if unset)
-# - Esculin negative → Esculin Hydrolysis Negative
-# - Colony morphology from "colonies dry, white and irregular..."
-#   and "forming green colonies", etc.
-# - Diplococci / tetracocci / streptococci / staphylococci → Cocci
-# ------------------------------------------------------------
-from __future__ import annotations
-import re
-from typing import Dict, Any, List
-UNKNOWN = "Unknown"
-# ------------------------------------------------------------
-# Core fields and sugar mapping
-# ------------------------------------------------------------
-# Sugar name → core DB column
-SUGAR_FIELDS: Dict[str, str] = {
-    "glucose": "Glucose Fermentation",
-    "lactose": "Lactose Fermentation",
-    "sucrose": "Sucrose Fermentation",
-    "maltose": "Maltose Fermentation",
-    "mannitol": "Mannitol Fermentation",
-    "sorbitol": "Sorbitol Fermentation",
-    "xylose": "Xylose Fermentation",
-    "rhamnose": "Rhamnose Fermentation",
-    "arabinose": "Arabinose Fermentation",
-    "raffinose": "Raffinose Fermentation",
-    "trehalose": "Trehalose Fermentation",
-    "inositol": "Inositol Fermentation",
-}
-CORE_BOOL_FIELDS: Dict[str, List[str]] = {
-    # field: [keywords to recognise the test name]
-    "Catalase": ["catalase"],
-    "Oxidase": ["oxidase"],
-    "Indole": ["indole"],
-    "Urease": ["urease"],
-    "Citrate": ["citrate"],
-    # MR: add "mr" as a keyword
-    "Methyl Red": ["methyl red", "mr test", "mr"],
-    "VP": ["voges-proskauer", "vp test", "vp"],
-    # H2S: allow H2S and hydrogen sulfide
-    "H2S": ["h2s", "hydrogen sulfide"],
-    # DNase: handle "dnase" and "dnase test"
-    "DNase": ["dnase", "dnase test"],
-    "ONPG": ["onpg"],
-    "Coagulase": ["coagulase"],
-    "Lipase Test": ["lipase"],
-    "Nitrate Reduction": ["nitrate reduction", "nitrate"],
-    "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
-    "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb"],
-    "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
-    "Arginine dihydrolase": ["arginine dihydrolase"],
-    "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
-    # Esculin Hydrolysis: also match plain "esculin"
-    "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
-}
-# ------------------------------------------------------------
-# Generic helpers
-# ------------------------------------------------------------
-def _clean_text(text: str) -> str:
-    """
-    Normalise a few unicode oddities and collapse whitespace.
-    Also:
-      - strip degree symbols
-      - normalise subscript ₂ → 2 for H₂S
-    """
-    if not text:
-        return ""
-    s = text.replace("°", "").replace("º", "")
-    # normalise subscript 2 (H₂S → H2S)
-    s = s.replace("₂", "2")
-    # keep dashes as-as; regexes handle - and – explicitly
-    return " ".join(s.split())
-def _norm(s: str) -> str:
-    return s.strip().lower()
-def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
-    """
-    Write value to parsed[field] if:
-      - field not present, or
-      - we are replacing Unknown with a concrete value
-    """
-    if not value:
-        return
-    if field not in parsed or parsed[field] == UNKNOWN:
-        parsed[field] = value
-def _value_from_pnv_context(segment: str) -> str | None:
-    """
-    Interpret a short phrase as Positive / Negative / Variable.
-    Examples:
-      "positive", "+", "pos" → Positive
-      "negative", "neg", "-" → Negative
-      "variable", "var", "v" → Variable
-    """
-    seg = _norm(segment)
-    if seg in ["positive", "pos", "+"]:
-        return "Positive"
-    if seg in ["negative", "neg", "-"]:
-        return "Negative"
-    if seg in ["variable", "var", "v"]:
-        return "Variable"
-    return None
-# ------------------------------------------------------------
-# Gram stain and shape
-# ------------------------------------------------------------
-def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
-    # Gram stain
-    if "gram-positive" in text_lc or "gram positive" in text_lc:
-        _set_if_stronger(parsed, "Gram Stain", "Positive")
-    elif "gram-negative" in text_lc or "gram negative" in text_lc:
-        _set_if_stronger(parsed, "Gram Stain", "Negative")
-    elif "gram variable" in text_lc:
-        _set_if_stronger(parsed, "Gram Stain", "Variable")
-    # Shape
-    # Prefer "short rods" over generic rods
-    if "short rods" in text_lc:
-        _set_if_stronger(parsed, "Shape", "Short Rods")
-    # Cocci and variants (diplococci, tetracocci, etc.)
-    if re.search(r"\bcocci\b", text_lc):
-        _set_if_stronger(parsed, "Shape", "Cocci")
-    if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
-        _set_if_stronger(parsed, "Shape", "Cocci")
-    # Rods / bacilli
-    if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
-        _set_if_stronger(parsed, "Shape", "Rods")
-    # Spiral
-    if "spiral" in text_lc or "spirochete" in text_lc:
-        _set_if_stronger(parsed, "Shape", "Spiral")
-# ------------------------------------------------------------
-# Haemolysis
-# ------------------------------------------------------------
-def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Handle haemolysis phrasing:
-      - beta-haemolysis / beta-haemolytic / beta hemolysis / beta hemolytic
-      - alpha- / gamma- / non-haemolytic
-    Always set Haemolysis to Positive when a clear type is mentioned,
-    except for "none"/gamma where it's Negative.
-    """
-    # Beta
-    if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
-        _set_if_stronger(parsed, "Haemolysis Type", "Beta")
-        _set_if_stronger(parsed, "Haemolysis", "Positive")
-    # Alpha
-    if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
-        _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
-        _set_if_stronger(parsed, "Haemolysis", "Positive")
-    # Gamma / non-haemolytic
-    if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
-        _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
-        _set_if_stronger(parsed, "Haemolysis", "Negative")
-    if (
-        "non-haemolytic" in text_lc
-        or "non hemolytic" in text_lc
-        or "non-hemolytic" in text_lc
-    ):
-        _set_if_stronger(parsed, "Haemolysis Type", "None")
-        _set_if_stronger(parsed, "Haemolysis", "Negative")
-    # Variable phrasing
-    if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
-        _set_if_stronger(parsed, "Haemolysis Type", "Variable")
-        _set_if_stronger(parsed, "Haemolysis", "Variable")
-# ------------------------------------------------------------
-# Boolean test parser (core enzyme tests etc.)
-# ------------------------------------------------------------
-def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    For each test in CORE_BOOL_FIELDS, look for patterns like:
-      "catalase positive", "positive for catalase", etc.
-    Also handles "negative" and "variable" and a few special
-    cases like nitrate reduction and H2S production.
-    """
-    for field, keywords in CORE_BOOL_FIELDS.items():
-        for kw in keywords:
-            # "... catalase positive"
-            m1 = re.search(
-                rf"{re.escape(kw)}[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
-                text_lc,
-            )
-            if m1:
-                val = _value_from_pnv_context(m1.group(1))
-                if val:
-                    _set_if_stronger(parsed, field, val)
-                    break
-            # "positive for catalase"
-            m2 = re.search(
-                rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{re.escape(kw)}",
-                text_lc,
-            )
-            if m2:
-                val = _value_from_pnv_context(m2.group(1))
-                if val:
-                    _set_if_stronger(parsed, field, val)
-                    break
-        # Special-case NaCl tolerance with explicit percentages
-        if field == "NaCl Tolerant (>=6%)":
-            # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
-            for m in re.finditer(
-                r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
-                text_lc,
-            ):
-                try:
-                    conc = float(m.group(3))
-                    if conc >= 6.0:
-                        _set_if_stronger(parsed, field, "Positive")
-                except Exception:
-                    pass
-            # e.g. "NaCl tolerant up to 10%"
-            for m in re.finditer(
-                r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
-                text_lc,
-            ):
-                try:
-                    conc = float(m.group(1))
-                    if conc >= 6.0:
-                        _set_if_stronger(parsed, field, "Positive")
-                except Exception:
-                    pass
-            # explicit negative phrasing: "does not grow in 7% NaCl"
             if re.search(
                 r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
             ):
-                _set_if_stronger(parsed, field, "Negative")
-            # more general: "in 6.5% NaCl" (assume positive tolerance)
-            for m in re.finditer(
-                r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
-                text_lc,
-            ):
-                try:
-                    conc = float(m.group(1))
-                    if conc >= 6.0 and "does not" not in text_lc and "no growth" not in text_lc:
-                        _set_if_stronger(parsed, field, "Positive")
-                except Exception:
-                    pass
-    # Nitrate: "reduces nitrate" / "does not reduce nitrate"
-    if re.search(r"reduces nitrate", text_lc):
-        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
-    if re.search(r"does (not|n't) reduce nitrate", text_lc):
-        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
-    # H2S: "produces H2S", "H2S production", "does not produce H2S"
-    if re.search(r"(produces|production of)\s+h2s", text_lc):
-        _set_if_stronger(parsed, "H2S", "Positive")
-    if re.search(r"does (not|n't) produce\s+h2s", text_lc) or re.search(
-        r"no h2s production", text_lc
-    ):
-        _set_if_stronger(parsed, "H2S", "Negative")
-# ------------------------------------------------------------
-# Motility / Capsule / Spores
-# ------------------------------------------------------------
-def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
-    # Motility
-    # Basic: "motile" vs "non-motile"
-    if (
-        re.search(r"\bmotile\b", text_lc)
-        and not re.search(r"\bnon[- ]?motile\b", text_lc)
-        and "nonmotile" not in text_lc
-        and "immotile" not in text_lc
-    ):
-        _set_if_stronger(parsed, "Motility", "Positive")
-    if (
-        "non-motile" in text_lc
-        or "non motile" in text_lc
-        or "nonmotile" in text_lc
-        or "immotile" in text_lc
-    ):
-        _set_if_stronger(parsed, "Motility", "Negative")
-    # Specific motility phrases: tumbling, swarming, corkscrew
-    if (
-        "tumbling motility" in text_lc
-        or "swarming motility" in text_lc
-        or "corkscrew motility" in text_lc
-        or ("swarming" in text_lc and "non-swarming" not in text_lc)
-    ):
-        _set_if_stronger(parsed, "Motility", "Positive")
-    # Capsule - include "capsule positive/negative"
-    if (
-        "capsulated" in text_lc
-        or "encapsulated" in text_lc
-        or "capsule present" in text_lc
-        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
-    ):
-        _set_if_stronger(parsed, "Capsule", "Positive")
-    if (
-        "non-capsulated" in text_lc
-        or "no capsule" in text_lc
-        or re.search(r"capsule[ \-]?(negative|neg|\-)", text_lc)
-    ):
-        _set_if_stronger(parsed, "Capsule", "Negative")
-    # Spore formation:
-    # Check negative phrases FIRST so they win over generic positive phrases.
-    if (
-        "non-spore-forming" in text_lc
-        or "non spore forming" in text_lc
-        or "nonspore-forming" in text_lc
-        or "no spores" in text_lc
-    ):
-        _set_if_stronger(parsed, "Spore Formation", "Negative")
-    if (
-        "spore-forming" in text_lc
-        or "spore forming" in text_lc
-        or "forms spores" in text_lc
-    ):
-        _set_if_stronger(parsed, "Spore Formation", "Positive")
-# ------------------------------------------------------------
-# Oxygen requirement
-# ------------------------------------------------------------
-def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Robust oxygen parsing:
-      - Handle facultative first
-      - Avoid "aerobic" accidentally matching inside "anaerobic"
-    """
-    # Facultative first
-    if re.search(r"facultative(ly)? anaerob", text_lc):
-        _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
-    # Strict anaerobic (before aerobic, and with word boundary)
-    if re.search(r"\bobligate anaerob", text_lc) or (
-        re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc
-    ):
-        _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
-    # Now handle purely aerobic, avoiding "anaerobic"
-    if re.search(r"\bobligate aerobe\b", text_lc) or (
-        re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc
-    ):
-        _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
-    if "microaerophilic" in text_lc or "microaerophile" in text_lc:
-        _set_if_stronger(parsed, "Oxygen Requirement", "Microaerophilic")
-    if "capnophilic" in text_lc or "co2" in text_lc:
-        _set_if_stronger(parsed, "Oxygen Requirement", "Capnophilic")
-# ------------------------------------------------------------
-# Growth temperature
-# ------------------------------------------------------------
-def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
-    We ALWAYS store as "low//high":
-      - For ranges: "4//45"
-      - For single values: "37//37"
-    """
-    # Ranges like "4-45 °C", "10–40 °C"
-    range_pattern = re.compile(
-        r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
-    )
-    m_range = range_pattern.search(text_lc)
-    if m_range:
-        low = m_range.group(1)
-        high = m_range.group(2)
-        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
-        return
-    # Single temps like "grows at 37 c"
-    single_pattern = re.compile(
-        r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
-    )
-    m_single = single_pattern.search(text_lc)
-    if m_single:
-        temp = m_single.group(2)
-        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
-        return
-    # Simplified: "grows at 37" (no explicit °C)
-    m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
-    if m_simple_num:
-        temp = m_simple_num.group(1)
-        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
-        return
-    # Fallback: plain "37c" somewhere in the text
-    m_plain = re.search(
-        r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
-        text_lc,
-    )
-    if m_plain:
-        temp = m_plain.group(1)
-        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
-# ------------------------------------------------------------
-# Media grown on (coarse mapping)
-# ------------------------------------------------------------
-MEDIA_KEYWORDS = {
-    "Blood Agar": [
-        "blood agar",
-        "blood-agar",
-    ],
-    "MacConkey Agar": [
-        "macconkey agar",
-        "mac conkey agar",
-        "macconkey",
-    ],
-    "Chocolate Agar": [
-        "chocolate agar",
-        "chocolate-agar",
-    ],
-    "Nutrient Agar": [
-        "nutrient agar",
-        "nutrient-agar",
-    ],
-    "XLD Agar": [
-        "xld agar",
-    ],
-    "TCBS Agar": [
-        "tcbs agar",
-        "tcbs",
-    ],
-    "ALOA": [
-        "aloa agar",
-        "aloa",
-    ],
-    "BCYE Agar": [
-        "bcye agar",
-        "bcye",
-    ],
-    "MRS Agar": [
-        "mrs agar",
-    ],
-}
-def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
-    found_media: List[str] = []
-    for media_name, patterns in MEDIA_KEYWORDS.items():
-        for p in patterns:
-            if p in text_lc:
-                if media_name not in found_media:
-                    found_media.append(media_name)
-    if found_media:
-        _set_if_stronger(parsed, "Media Grown On", "; ".join(found_media))
-# ------------------------------------------------------------
-# Sugar fermentation parsing
-# ------------------------------------------------------------
-def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Handles patterns like:
-      - "ferments glucose, mannitol and sucrose but not lactose"
-      - "does not ferment lactose"
-      - "non-lactose fermenter"
-      - "glucose positive, mannitol negative"
-      - global non-fermenter patterns
-    """
-    # 0) Simple "glucose positive / negative" style
-    for sugar_key, field in SUGAR_FIELDS.items():
-        m_simple = re.search(
-            rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
-            text_lc,
-        )
-        if m_simple:
-            val = _value_from_pnv_context(m_simple.group(1))
-            if val:
-                _set_if_stronger(parsed, field, val)
-    # 1) Pattern: "ferments X, Y and Z but not A, B"
-    ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
-    for m in ferments_pattern.finditer(text_lc):
-        seg = m.group(1)
-        # Split positive vs negative part on "but not"
-        neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
-        pos_part = neg_split[0]
-        neg_part = neg_split[1] if len(neg_split) > 1 else ""
-        # Positive sugars from pos_part
-        for sugar_key, field in SUGAR_FIELDS.items():
-            if re.search(rf"\b{sugar_key}\b", pos_part):
-                _set_if_stronger(parsed, field, "Positive")
-        # Negative sugars from neg_part
-        for sugar_key, field in SUGAR_FIELDS.items():
-            if re.search(rf"\b{sugar_key}\b", neg_part):
-                _set_if_stronger(parsed, field, "Negative")
-    # 2) "does not ferment X" / "doesn't ferment X"
-    for sugar_key, field in SUGAR_FIELDS.items():
-        if re.search(rf"does (not|n't) ferment {sugar_key}\b", text_lc):
-            _set_if_stronger(parsed, field, "Negative")
-    # 3) "non-lactose fermenter", "non lactose fermenter"
-    for sugar_key, field in SUGAR_FIELDS.items():
-        if re.search(rf"non[- ]{sugar_key} ferment(ing|er)?", text_lc):
-            _set_if_stronger(parsed, field, "Negative")
-    # 4) "X fermentation positive/negative"
-    for sugar_key, field in SUGAR_FIELDS.items():
-        # "glucose fermentation positive"
-        m1 = re.search(
-            rf"{sugar_key}\s+fermentation[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
-            text_lc,
-        )
-        if m1:
-            val = _value_from_pnv_context(m1.group(1))
-            if val:
-                _set_if_stronger(parsed, field, val)
-                continue
-        # "positive for glucose fermentation"
-        m2 = re.search(
-            rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{sugar_key}\s+fermentation",
-            text_lc,
-        )
-        if m2:
-            val = _value_from_pnv_context(m2.group(1))
-            if val:
-                _set_if_stronger(parsed, field, val)
-                continue
-    # 5) Global non-fermenter patterns
-    if (
-        re.search(r"does (not|n't) ferment (carbohydrates|sugars)", text_lc)
-        or re.search(r"non[- ]ferment(er|ing|ative)", text_lc)
-    ):
-        for field in SUGAR_FIELDS.values():
-            if field not in parsed or parsed[field] == UNKNOWN:
-                _set_if_stronger(parsed, field, "Negative")
-# ------------------------------------------------------------
-# Colony morphology (coarse, optional)
-# ------------------------------------------------------------
-def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Very coarse mapping for colony morphology. We try a couple of patterns:
-      - "colonies are yellow, mucoid"
-      - "colonies dry, white and irregular on nutrient agar"
-      - "forming green colonies", "forms mucoid colonie"
-    """
-    # Pattern 1: "colonies are ..."
-    m = re.search(r"colon(y|ies) (are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
-    if m:
-        desc = m.group(3).strip()
-        if desc:
-            pretty = "; ".join(
-                [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
-            )
-            if pretty:
-                _set_if_stronger(parsed, "Colony Morphology", pretty)
-                return
-    # Pattern 2: "colonies dry, white and irregular on nutrient agar"
-    m2 = re.search(
-        r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
-        text_lc,
-    )
-    if m2:
-        desc = m2.group(1).strip()
-        if desc:
-            pretty = "; ".join(
-                [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
-            )
-            if pretty:
-                _set_if_stronger(parsed, "Colony Morphology", pretty)
-                return
-    # Pattern 3: "forming green colonies", "forms mucoid colonies"
-    m3 = re.search(
-        r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
-        text_lc,
-    )
-    if m3:
-        desc = m3.group(2).strip()
-        if desc:
-            pretty = "; ".join(
-                [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
-            )
-            if pretty:
-                _set_if_stronger(parsed, "Colony Morphology", pretty)
-# ------------------------------------------------------------
-# PUBLIC API
-# ------------------------------------------------------------
-def parse_text_rules(text: str) -> Dict[str, Any]:
-    """
-    Main entry point.
-    Parameters
-    ----------
-    text : str
-        Free-text microbiology description.
-    Returns
-    -------
-    dict
-        {
-          "parsed_fields": { field: value, ... },
-          "source": "rule_parser",
-          "raw": original_text,
-          "error": optional_error_message
-        }
-    """
-    original = text or ""
-    text_clean = _clean_text(original)
-    text_lc = text_clean.lower()
-    parsed: Dict[str, str] = {}
-    try:
-        _parse_gram_and_shape(text_lc, parsed)
-        _parse_haemolysis(text_lc, parsed)
-        _parse_core_bool_tests(text_lc, parsed)
-        _parse_motility_capsule_spores(text_lc, parsed)
-        _parse_oxygen(text_lc, parsed)
-        _parse_growth_temperature(text_lc, parsed)
-        _parse_media(text_lc, parsed)
-        _parse_sugars(text_lc, parsed)
-        _parse_colony(text_lc, parsed)
-        return {
-            "parsed_fields": parsed,
-            "source": "rule_parser",
-            "raw": original,
-        }
-    except Exception as e:
-        # Fail-safe: never crash the app, just report an error
-        return {
-            "parsed_fields": parsed,
-            "source": "rule_parser",
-            "raw": original,
-            "error": f"{type(e).__name__}: {e}",
         }

+# engine/parser_rules.py
+# ------------------------------------------------------------
+# Rule-based core parser for microbiology descriptions.
+#
+# Stage 11F (Option A ranges + fixes):
+# - Always store Growth Temperature as "low//high"
+#   • single: 37 → "37//37"
+#   • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
+# - DNase robust parsing (DNase / DNase test, DNase activity, etc.)
+# - Non–spore-forming → Spore Formation = Negative (regex + early return)
+# - "non-H2S producing" → H2S = Negative
+# - "aerobically" / "anaerobically" → Aerobic / Anaerobic
+# - NaCl tolerance phrases improved
+# - Colony morphology from "colonies dry, white and irregular on nutrient agar"
+# ------------------------------------------------------------
+from __future__ import annotations
+import re
+from typing import Dict, Any, List
+UNKNOWN = "Unknown"
+# ------------------------------------------------------------
+# Core fields and sugar mapping
+# ------------------------------------------------------------
+# Sugar name → core DB column
+SUGAR_FIELDS: Dict[str, str] = {
+    "glucose": "Glucose Fermentation",
+    "lactose": "Lactose Fermentation",
+    "sucrose": "Sucrose Fermentation",
+    "maltose": "Maltose Fermentation",
+    "mannitol": "Mannitol Fermentation",
+    "sorbitol": "Sorbitol Fermentation",
+    "xylose": "Xylose Fermentation",
+    "rhamnose": "Rhamnose Fermentation",
+    "arabinose": "Arabinose Fermentation",
+    "raffinose": "Raffinose Fermentation",
+    "trehalose": "Trehalose Fermentation",
+    "inositol": "Inositol Fermentation",
+}
+CORE_BOOL_FIELDS: Dict[str, List[str]] = {
+    # field: [keywords to recognise the test name]
+    "Catalase": ["catalase"],
+    "Oxidase": ["oxidase"],
+    "Indole": ["indole"],
+    "Urease": ["urease"],
+    "Citrate": ["citrate"],
+    # MR: include "mr"
+    "Methyl Red": ["methyl red", "mr test", "mr"],
+    "VP": ["voges-proskauer", "vp test", "vp"],
+    # H2S (includes H₂S → normalised to H2S in _clean_text)
+    "H2S": ["h2s", "hydrogen sulfide"],
+    # DNase: broaden patterns
+    "DNase": [
+        "dnase",
+        "dnase test",
+        "dnase activity",
+        "dnase production",
+        "dnaase",
+        "dna hydrolysis",
+    ],
+    "ONPG": ["onpg"],
+    "Coagulase": ["coagulase"],
+    "Lipase Test": ["lipase"],
+    "Nitrate Reduction": ["nitrate reduction", "nitrate"],
+    "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
+    "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb"],
+    "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
+    "Arginine dihydrolase": ["arginine dihydrolase"],
+    "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
+    # Esculin Hydrolysis: also match plain "esculin"
+    "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
+}
+# ------------------------------------------------------------
+# Generic helpers
+# ------------------------------------------------------------
+def _clean_text(text: str) -> str:
+    """
+    Normalise a few unicode oddities and collapse whitespace.
+    Also:
+      - strip degree symbols
+      - normalise subscript ₂ → 2 for H₂S
+    """
+    if not text:
+        return ""
+    s = text.replace("°", "").replace("º", "")
+    # normalise subscript 2 (H₂S → H2S)
+    s = s.replace("₂", "2")
+    # keep dashes as-is; regexes handle - and – explicitly
+    return " ".join(s.split())
+def _norm(s: str) -> str:
+    return s.strip().lower()
+def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
+    """
+    Write value to parsed[field] if:
+      - field not present, or
+      - we are replacing Unknown with a concrete value
+    """
+    if not value:
+        return
+    if field not in parsed or parsed[field] == UNKNOWN:
+        parsed[field] = value
+def _value_from_pnv_context(segment: str) -> str | None:
+    """
+    Interpret a short phrase as Positive / Negative / Variable.
+    """
+    seg = _norm(segment)
+    if seg in ["positive", "pos", "+"]:
+        return "Positive"
+    if seg in ["negative", "neg", "-"]:
+        return "Negative"
+    if seg in ["variable", "var", "v"]:
+        return "Variable"
+    return None
+# ------------------------------------------------------------
+# Gram stain and shape
+# ------------------------------------------------------------
+def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
+    # Gram stain
+    if "gram-positive" in text_lc or "gram positive" in text_lc:
+        _set_if_stronger(parsed, "Gram Stain", "Positive")
+    elif "gram-negative" in text_lc or "gram negative" in text_lc:
+        _set_if_stronger(parsed, "Gram Stain", "Negative")
+    elif "gram variable" in text_lc:
+        _set_if_stronger(parsed, "Gram Stain", "Variable")
+    # Shape
+    # Prefer "short rods" over generic rods
+    if "short rods" in text_lc:
+        _set_if_stronger(parsed, "Shape", "Short Rods")
+    # Cocci and variants (diplococci, tetracocci, etc.)
+    if re.search(r"\bcocci\b", text_lc):
+        _set_if_stronger(parsed, "Shape", "Cocci")
+    if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
+        _set_if_stronger(parsed, "Shape", "Cocci")
+    # Rods / bacilli
+    if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
+        _set_if_stronger(parsed, "Shape", "Rods")
+    # Spiral
+    if "spiral" in text_lc or "spirochete" in text_lc:
+        _set_if_stronger(parsed, "Shape", "Spiral")
+# ------------------------------------------------------------
+# Haemolysis
+# ------------------------------------------------------------
+def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Record haemolysis findings found in a lower-cased description.
+
+    Recognised phrasing (British and American spelling):
+      - beta- / alpha- / gamma-haemolytic, "beta haemolysis", etc.
+      - non-haemolytic / non haemolytic / nonhaemolytic
+      - "variable haemolysis"
+
+    Args:
+        text_lc: Lower-cased description text.
+        parsed: Field dict; every write goes through _set_if_stronger.
+    """
+    stem = r"(haemolytic|hemolytic|haemolysis|hemolysis)"
+    # Beta -> Alpha -> Gamma order is kept from the original, because how
+    # _set_if_stronger resolves conflicting writes is not visible here.
+    graded = (
+        ("beta", "Beta", "Positive"),
+        ("alpha", "Alpha", "Positive"),
+        ("gamma", "Gamma", "Negative"),
+    )
+    for prefix, h_type, outcome in graded:
+        if re.search(rf"{prefix}[- ]?{stem}", text_lc):
+            _set_if_stronger(parsed, "Haemolysis Type", h_type)
+            _set_if_stronger(parsed, "Haemolysis", outcome)
+    # One pattern covers every separator/spelling combination; the previous
+    # literal list missed "non haemolytic" and the unhyphenated forms.
+    if re.search(r"non[- ]?(haemolytic|hemolytic)", text_lc):
+        _set_if_stronger(parsed, "Haemolysis Type", "None")
+        _set_if_stronger(parsed, "Haemolysis", "Negative")
+    # Variable phrasing
+    if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
+        _set_if_stronger(parsed, "Haemolysis Type", "Variable")
+        _set_if_stronger(parsed, "Haemolysis", "Variable")
+# ------------------------------------------------------------
+# Boolean test parser (core enzyme tests etc.)
+# ------------------------------------------------------------
+def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Extract Positive/Negative/Variable results for the core boolean tests.
+
+    For every keyword of every field in CORE_BOOL_FIELDS two phrasings are
+    tried, "<kw> positive" and "positive for <kw>"; the first keyword that
+    yields a usable value ends the search for that field.
+
+    Also handles:
+      - NaCl tolerance with explicit % values (>= 6% counts as tolerant)
+      - "reduces nitrate" / "does not reduce nitrate"
+      - H2S production / non-production
+      - DNase phrasing, including a trailing "+" / "-" / "+ve" / "-ve"
+
+    Args:
+        text_lc: Lower-cased description text.
+        parsed: Field dict; every write goes through _set_if_stronger.
+    """
+    pnv = r"(positive|negative|variable|pos|neg|\+|\-)"
+    nacl_field = "NaCl Tolerant (>=6%)"
+    for field, keywords in CORE_BOOL_FIELDS.items():
+        for kw in keywords:
+            kw_re = re.escape(kw)
+            # "... catalase positive"
+            m1 = re.search(rf"{kw_re}[ \-]?{pnv}", text_lc)
+            if m1:
+                val = _value_from_pnv_context(m1.group(1))
+                if val:
+                    _set_if_stronger(parsed, field, val)
+                    break
+            # "positive for catalase"
+            m2 = re.search(rf"{pnv}\s+(for\s+)?{kw_re}", text_lc)
+            if m2:
+                val = _value_from_pnv_context(m2.group(1))
+                if val:
+                    _set_if_stronger(parsed, field, val)
+                    break
+        # Special-case NaCl tolerance with explicit percentages.
+        # float() cannot fail on a numeral the regex already validated, so
+        # the former blanket try/except has been removed.
+        if field == nacl_field:
+            # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
+            for m in re.finditer(
+                r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
+                text_lc,
+            ):
+                if float(m.group(3)) >= 6.0:
+                    _set_if_stronger(parsed, nacl_field, "Positive")
+            # e.g. "NaCl tolerant up to 10%"
+            for m in re.finditer(
+                r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
+                text_lc,
+            ):
+                if float(m.group(1)) >= 6.0:
+                    _set_if_stronger(parsed, nacl_field, "Positive")
+            # explicit negative phrasing: "does not grow in 7% NaCl"
+            if re.search(
+                r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
+                text_lc,
+            ):
+                _set_if_stronger(parsed, nacl_field, "Negative")
+            # general "in 6.5% NaCl" -> assume tolerance unless the text
+            # contains a negation anywhere (coarse, whole-text check)
+            negated = "does not" in text_lc or "no growth" in text_lc
+            for m in re.finditer(r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl", text_lc):
+                if float(m.group(1)) >= 6.0 and not negated:
+                    _set_if_stronger(parsed, nacl_field, "Positive")
+    # Nitrate: "reduces nitrate" / "does not reduce nitrate"
+    if re.search(r"reduces nitrate", text_lc):
+        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
+    if re.search(r"does (not|n't) reduce nitrate", text_lc):
+        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
+    # H2S: "produces H2S", "production of H2S", "does not produce H2S",
+    #       "no H2S production", "non-H2S producing"
+    if re.search(r"(produces|production of)\s+h2s", text_lc):
+        _set_if_stronger(parsed, "H2S", "Positive")
+    if (
+        re.search(r"does (not|n't) produce\s+h2s", text_lc)
+        or re.search(r"no h2s production", text_lc)
+        or re.search(r"non[- ]h2s producing", text_lc)
+    ):
+        _set_if_stronger(parsed, "H2S", "Negative")
+    # --- DNase universal coverage ---
+    dnase = r"\bdnase(\s+test|\s+activity|\s+production)?"
+    # Suffix forms. Words may be joined by spaces or a hyphen
+    # ("dnase-positive"). A bare symbol must NOT be followed by a word
+    # character (except the "ve" of "+ve"/"-ve"): previously "\+\b" could not
+    # match "dnase +" at all, and "\-\b" read "dnase-positive" as Negative.
+    if re.search(
+        rf"{dnase}(?:[\s\-]*(?:positive|pos)\b|\s*\+(?:ve\b|(?!\w)))", text_lc
+    ):
+        _set_if_stronger(parsed, "DNase", "Positive")
+    # Prefix forms accept words only: a symbol in front of "dnase" normally
+    # belongs to the preceding test ("oxidase- dnase+"), so it is ignored.
+    if re.search(rf"\b(positive|pos)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
+        _set_if_stronger(parsed, "DNase", "Positive")
+    # Negative forms
+    if re.search(
+        rf"{dnase}(?:[\s\-]*(?:negative|neg)\b|\s*-(?:ve\b|(?!\w)))", text_lc
+    ):
+        _set_if_stronger(parsed, "DNase", "Negative")
+    if re.search(rf"\b(negative|neg)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
+        _set_if_stronger(parsed, "DNase", "Negative")
+    # non-DNase-producing
+    if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
+        _set_if_stronger(parsed, "DNase", "Negative")
+# ------------------------------------------------------------
+# Motility / Capsule / Spores
+# ------------------------------------------------------------
+def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Parse motility, capsule and spore-formation statements.
+
+    Positive rules are guarded against their negated spellings so that
+    "non-motile" or "non-capsulated" never count as a positive finding.
+
+    Args:
+        text_lc: Lower-cased description text.
+        parsed: Field dict; every write goes through _set_if_stronger.
+    """
+    # Motility
+    if (
+        re.search(r"\bmotile\b", text_lc)
+        and not re.search(r"\bnon[- ]?motile\b", text_lc)
+        and "nonmotile" not in text_lc
+        and "immotile" not in text_lc
+    ):
+        _set_if_stronger(parsed, "Motility", "Positive")
+    if (
+        "non-motile" in text_lc
+        or "non motile" in text_lc
+        or "nonmotile" in text_lc
+        or "immotile" in text_lc
+    ):
+        _set_if_stronger(parsed, "Motility", "Negative")
+    # Specific motility phrases: tumbling, swarming, corkscrew
+    if (
+        "tumbling motility" in text_lc
+        or "swarming motility" in text_lc
+        or "corkscrew motility" in text_lc
+        or ("swarming" in text_lc and "non-swarming" not in text_lc)
+    ):
+        _set_if_stronger(parsed, "Motility", "Positive")
+    # Capsule (including "capsule positive/negative")
+    negated_capsule = r"\b(?:non|un)[- ]?(?:en)?capsulated\b"
+    # Blank out negated spellings before the substring test; otherwise
+    # "non-capsulated" would satisfy "capsulated" and be read as Positive.
+    # ("encapsulated" contains "capsulated", so one test covers both.)
+    affirmative_text = re.sub(negated_capsule, " ", text_lc)
+    if (
+        "capsulated" in affirmative_text
+        or "capsule present" in text_lc
+        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
+    ):
+        _set_if_stronger(parsed, "Capsule", "Positive")
+    if (
+        re.search(negated_capsule, text_lc)
+        or "no capsule" in text_lc
+        # A bare "-" counts as a result only when no word follows it (apart
+        # from "-ve"); previously the hyphen of "capsule-positive" matched.
+        or re.search(r"capsule[ \-]?(?:negative|neg|-(?:ve\b|(?!\w)))", text_lc)
+    ):
+        _set_if_stronger(parsed, "Capsule", "Negative")
+    # Spore formation
+    # NEGATIVE FIRST with strict boundaries, then early-return so the
+    # positive rule below ("spore-forming") cannot also fire.
+    if (
+        re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
+        or "no spores" in text_lc
+    ):
+        _set_if_stronger(parsed, "Spore Formation", "Negative")
+        return
+    # POSITIVE (must not match the negative form)
+    if (
+        re.search(r"\bspore[-\s]?forming\b", text_lc)
+        or "forms spores" in text_lc
+    ):
+        _set_if_stronger(parsed, "Spore Formation", "Positive")
+# ------------------------------------------------------------
+# Oxygen requirement
+# ------------------------------------------------------------
+def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Derive the "Oxygen Requirement" field from a lower-cased description.
+
+    Every category whose phrasing is present is offered to
+    _set_if_stronger, in this fixed order: facultative anaerobe,
+    anaerobic, aerobic, microaerophilic, capnophilic. The aerobic tests
+    refuse to fire when the matching "an-" form occurs anywhere in the text,
+    so "aerobic" is never picked up from inside "anaerobic".
+    """
+
+    def seen(pattern: str) -> bool:
+        return re.search(pattern, text_lc) is not None
+
+    is_facultative = seen(r"facultative(ly)? anaerob")
+    is_anaerobic = (
+        seen(r"\bobligate anaerob")
+        or (seen(r"\banaerobic\b") and "facultative" not in text_lc)
+        or seen(r"\banaerobically\b")
+    )
+    is_aerobic = (
+        seen(r"\bobligate aerobe\b")
+        or (seen(r"\baerobic\b") and "anaerobic" not in text_lc)
+        or (seen(r"\baerobically\b") and "anaerobically" not in text_lc)
+    )
+    is_microaerophilic = "microaerophilic" in text_lc or "microaerophile" in text_lc
+    # Any mention of CO2 is treated as a capnophilic hint.
+    is_capnophilic = "capnophilic" in text_lc or "co2" in text_lc
+
+    candidates = (
+        (is_facultative, "Facultative Anaerobe"),
+        (is_anaerobic, "Anaerobic"),
+        (is_aerobic, "Aerobic"),
+        (is_microaerophilic, "Microaerophilic"),
+        (is_capnophilic, "Capnophilic"),
+    )
+    for hit, label in candidates:
+        if hit:
+            _set_if_stronger(parsed, "Oxygen Requirement", label)
+# ------------------------------------------------------------
+# Growth temperature
+# ------------------------------------------------------------
+def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
+    We ALWAYS store as "low//high":
+      - true ranges: "4-45 °C" → "4//45" (stored in the order written)
+      - two temps in text: min//max (Option A), e.g.:
+            "grows at 4 °C but not at 45 °C" → "4//45"
+            "grows at 42 °C but not at 25 °C" → "25//42"
+      - single temps: "37 °C" → "37//37"
+
+    The unit must end on a word boundary, so counts such as "2 cocci" or
+    "10 colonies" are not mistaken for "2 c" / "10 c", and a bare
+    "grows at N" is ignored when N is a percentage or a decimal
+    ("grows at 6.5% NaCl").
+
+    Args:
+        text_lc: Lower-cased description text.
+        parsed: Field dict; every write goes through _set_if_stronger.
+    """
+    unit = r"(?:c|°c|degrees c|degrees celsius)\b"
+    # Never start inside a longer number ("6.5 c" must not yield "5").
+    number = r"(?<![\d.])(\d+)"
+
+    # 1) Explicit ranges like "4-45 °C" or "10–40 °C"
+    m_range = re.search(rf"{number}\s*[-–/]\s*(\d+)\s*{unit}", text_lc)
+    if m_range:
+        low = m_range.group(1)
+        high = m_range.group(2)
+        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
+        return
+    # 2) Option A: any two explicit temps → min//max
+    temps = [int(t) for t in re.findall(rf"{number}\s*{unit}", text_lc)]
+    if len(temps) >= 2:
+        _set_if_stronger(
+            parsed, "Growth Temperature", f"{min(temps)}//{max(temps)}"
+        )
+        return
+    # 3) Single temps like "grows at 37 c"
+    m_single = re.search(
+        rf"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*{unit}",
+        text_lc,
+    )
+    if m_single:
+        temp = m_single.group(2)
+        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
+        return
+    # 4) Simplified: "grows at 37" (no explicit °C). The lookahead rejects
+    #    percentages and decimals, which describe concentrations instead.
+    m_simple_num = re.search(r"grows at (\d+)(?!\s*%|\.\d)\b", text_lc)
+    if m_simple_num:
+        temp = m_simple_num.group(1)
+        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
+        return
+    # 5) Fallback: plain "37c" somewhere in the text
+    m_plain = re.search(rf"\b{number}\s*{unit}", text_lc)
+    if m_plain:
+        temp = m_plain.group(1)
+        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
+# ------------------------------------------------------------
+# Media grown on (coarse mapping)
+# ------------------------------------------------------------
+# Canonical medium name -> lower-case substrings that identify it in text.
+# Entry order is preserved in the joined "Media Grown On" value.
+MEDIA_KEYWORDS: Dict[str, List[str]] = {
+    "Blood Agar": ["blood agar", "blood-agar"],
+    "MacConkey Agar": ["macconkey agar", "mac conkey agar", "macconkey"],
+    "Chocolate Agar": ["chocolate agar", "chocolate-agar"],
+    "Nutrient Agar": ["nutrient agar", "nutrient-agar"],
+    "XLD Agar": ["xld agar"],
+    "TCBS Agar": ["tcbs agar", "tcbs"],
+    "ALOA": ["aloa agar", "aloa"],
+    "BCYE Agar": ["bcye agar", "bcye"],
+    "MRS Agar": ["mrs agar"],
+}
+def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Collect every medium from MEDIA_KEYWORDS mentioned in the text.
+
+    A medium counts as mentioned when any of its keyword substrings occurs
+    in `text_lc`. Matches are joined with "; " in MEDIA_KEYWORDS order and
+    written to "Media Grown On"; nothing is written when no medium matches.
+    """
+    mentioned: List[str] = [
+        name
+        for name, needles in MEDIA_KEYWORDS.items()
+        if any(needle in text_lc for needle in needles)
+    ]
+    if not mentioned:
+        return
+    _set_if_stronger(parsed, "Media Grown On", "; ".join(mentioned))
+# ------------------------------------------------------------
+# Sugar fermentation parsing
+# ------------------------------------------------------------
+def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Map sugar statements onto the fermentation fields in SUGAR_FIELDS.
+
+    Handles patterns like:
+      - "glucose positive, mannitol negative"
+      - "ferments glucose, mannitol and sucrose but not lactose"
+      - "does not ferment lactose"
+      - "non-lactose fermenter" / "lactose non-fermenter"
+      - "glucose fermentation positive" / "positive for glucose fermentation"
+      - global non-fermenter phrases (all still-unset sugars → Negative)
+
+    Args:
+        text_lc: Lower-cased description text.
+        parsed: Field dict; every write goes through _set_if_stronger.
+    """
+    pnv = r"(positive|negative|variable|pos|neg|\+|\-)"
+    # Escape once: sugar names are data and must be matched literally.
+    sugars = [(re.escape(key), field) for key, field in SUGAR_FIELDS.items()]
+
+    # 0) Simple "glucose positive / negative" style
+    for key_re, field in sugars:
+        m_simple = re.search(rf"\b{key_re}\s+{pnv}", text_lc)
+        if m_simple:
+            val = _value_from_pnv_context(m_simple.group(1))
+            if val:
+                _set_if_stronger(parsed, field, val)
+    # 1) "ferments X, Y and Z but not A, B"
+    for m in re.finditer(r"ferments\s+([a-z0-9 ,;/&\-]+)", text_lc):
+        # Split positive vs negative part on "but not"
+        pos_part, _, neg_part = m.group(1).partition("but not")
+        if not re.search(r"\bbut not\b", m.group(1)):
+            # "but not" only counts as a separator on word boundaries.
+            pos_part, neg_part = m.group(1), ""
+        else:
+            halves = re.split(r"\bbut not\b", m.group(1), maxsplit=1)
+            pos_part, neg_part = halves[0], halves[1]
+        for key_re, field in sugars:
+            if re.search(rf"\b{key_re}\b", pos_part):
+                _set_if_stronger(parsed, field, "Positive")
+        for key_re, field in sugars:
+            if re.search(rf"\b{key_re}\b", neg_part):
+                _set_if_stronger(parsed, field, "Negative")
+    # 2) "does not ferment X"
+    for key_re, field in sugars:
+        if re.search(rf"does (not|n't) ferment {key_re}\b", text_lc):
+            _set_if_stronger(parsed, field, "Negative")
+    # 3) "non-lactose fermenter"
+    for key_re, field in sugars:
+        if re.search(rf"non[- ]{key_re} ferment(ing|er)?", text_lc):
+            _set_if_stronger(parsed, field, "Negative")
+    # 3b) "lactose non-fermenter": negative for that sugar only
+    specific_nonfermenter = r"[- ]non[- ]?ferment(?:er|ing|ative)\w*"
+    for key_re, field in sugars:
+        if re.search(rf"\b{key_re}{specific_nonfermenter}", text_lc):
+            _set_if_stronger(parsed, field, "Negative")
+    # 4) "X fermentation positive/negative"
+    for key_re, field in sugars:
+        # "glucose fermentation positive"
+        m1 = re.search(rf"{key_re}\s+fermentation[ \-]?{pnv}", text_lc)
+        if m1:
+            val = _value_from_pnv_context(m1.group(1))
+            if val:
+                _set_if_stronger(parsed, field, val)
+                continue
+        # "positive for glucose fermentation"
+        m2 = re.search(rf"{pnv}\s+(for\s+)?{key_re}\s+fermentation", text_lc)
+        if m2:
+            val = _value_from_pnv_context(m2.group(1))
+            if val:
+                _set_if_stronger(parsed, field, val)
+    # 5) Global non-fermenter patterns. Sugar-specific phrases such as
+    #    "lactose non-fermenter" are blanked out first: they describe one
+    #    sugar and must not mark every other sugar Negative.
+    generic_text = text_lc
+    if sugars:
+        any_sugar = "|".join(key_re for key_re, _ in sugars)
+        generic_text = re.sub(
+            rf"\b(?:{any_sugar}){specific_nonfermenter}", " ", text_lc
+        )
+    if (
+        re.search(r"does (not|n't) ferment (carbohydrates|sugars)", text_lc)
+        or re.search(r"non[- ]ferment(er|ing|ative)", generic_text)
+    ):
+        for field in SUGAR_FIELDS.values():
+            if field not in parsed or parsed[field] == UNKNOWN:
+                _set_if_stronger(parsed, field, "Negative")
+# ------------------------------------------------------------
+# Colony morphology (coarse, optional)
+# ------------------------------------------------------------
+def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
+    """
+    Very coarse mapping for colony morphology. Tried in order, first hit wins:
+      1. "colonies are yellow, mucoid"
+      2. "colonies dry, white and irregular on nutrient agar"
+      3. "forming green colonies", "forms mucoid colonies"
+
+    The captured description is split on "," / ";", each part is
+    capitalised, and the parts are joined with "; " before being written to
+    "Colony Morphology" through _set_if_stronger.
+    """
+
+    def _record(description: str) -> bool:
+        """Format and store a description; return False if it was empty."""
+        parts = [
+            piece.strip().capitalize()
+            for piece in re.split(r"[;,]", description)
+            if piece.strip()
+        ]
+        if not parts:
+            return False
+        _set_if_stronger(parsed, "Colony Morphology", "; ".join(parts))
+        return True
+
+    # Pattern 1: "colonies are ..."
+    m = re.search(r"colon(y|ies) (are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
+    if m and _record(m.group(3)):
+        return
+    # Pattern 2: "colonies dry, white and irregular on nutrient agar".
+    # A comma is part of the description, not its end (it used to cut the
+    # example above down to "dry"). "colonies on <medium>" is skipped so that
+    # pattern 3 can pick up the adjectives in front of "colonies".
+    m2 = re.search(
+        r"colonies\s+(?!on\b)([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|$)",
+        text_lc,
+    )
+    if m2 and _record(m2.group(1)):
+        return
+    # Pattern 3: "forming green colonies", "forms mucoid colonies"
+    m3 = re.search(
+        r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
+        text_lc,
+    )
+    if m3:
+        _record(m3.group(2))
+# ------------------------------------------------------------
+# PUBLIC API
+# ------------------------------------------------------------
+def parse_text_rules(text: str) -> Dict[str, Any]:
+    """
+    Run every rule-based sub-parser over a free-text description.
+
+    Args:
+        text: Raw description; None or empty is treated as "".
+
+    Returns:
+        A dict with "parsed_fields" (field -> value), "source" (always
+        "rule_parser") and "raw" (the input text). If a sub-parser raises,
+        the fields gathered so far are still returned and an "error" entry
+        of the form "<ExceptionType>: <message>" is added.
+    """
+    raw = text or ""
+    lowered = _clean_text(raw).lower()
+    fields: Dict[str, str] = {}
+    outcome: Dict[str, Any] = {
+        "parsed_fields": fields,
+        "source": "rule_parser",
+        "raw": raw,
+    }
+    try:
+        # Stage order is significant: later stages see earlier results
+        # through the shared `fields` dict.
+        stages = (
+            _parse_gram_and_shape,
+            _parse_haemolysis,
+            _parse_core_bool_tests,
+            _parse_motility_capsule_spores,
+            _parse_oxygen,
+            _parse_growth_temperature,
+            _parse_media,
+            _parse_sugars,
+            _parse_colony,
+        )
+        for stage in stages:
+            stage(lowered, fields)
+    except Exception as exc:
+        # Fail-safe: never crash the app, just report an error
+        outcome["error"] = f"{type(exc).__name__}: {exc}"
+    return outcome