Spaces:

EphAsad
/

BactKing

Sleeping

File size: 50,152 Bytes

# engine/parser_rules.py
# ------------------------------------------------------------
# Rule-based core parser for microbiology descriptions.
#
# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11L + 11M
# + NaCl + haemolysis symbol support + colony morphology tweaks.
#
# - Always store Growth Temperature as "low//high"
#   • single: 37 → "37//37"
#   • any two temps in text: min//max
#   • ranges like "30–37 °C", "grows between 30 and 37 °C" → "30//37"
#
# - DNase robust parsing (DNase test / activity / production)
# - Non–spore-forming → Spore Formation = Negative (with early return)
# - "non-H2S producing" → H2S = Negative
# - Aerobic / Anaerobic including “aerobically / anaerobically”
#
# - NaCl tolerance phrases improved (>= 6% rule)
#     • explicit positives require a growth/tolerance verb + % ≥ 6
#     • explicit negatives ("no growth in NaCl", "does not grow in 7% NaCl",
#       "NaCl sensitive", "not NaCl tolerant") override positives
#     • ambiguous "in 6.5% NaCl" alone no longer auto-Positive
#
# - Colony morphology extraction, including:
#     • "colonies are yellow, mucoid"
#     • "colonies dry, white and irregular on nutrient agar"
#     • "forming smooth, yellow-pigmented, opaque colonies"
#     • "grey colonies", "large grey colonies" etc.
#
# - Sugars:
#     • "<sugar> positive/negative"
#     • "<sugar> is positive/negative"
#     • "<sugar> fermenter" / "non-<sugar> fermenter"
#     • "ferments X, Y but not Z"
#     • grouped "does not ferment lactose and sucrose"
#       (without nuking glucose in "but glucose positive")
#     • global "non-fermenter" → all sugars Negative (Unknown-only)
#     • "asaccharolytic" → all sugars Negative (Unknown-only)
#     • "all other sugars negative" → all remaining sugars Negative
#       (Unknown-only; no hard rewrite)
#
# - Core tests:
#     • "<kw> positive/negative"
#     • "positive for <kw>"
#     • "<kw> is positive/negative"
#     • "<kw> reaction is positive/negative"
#     • "<kw> reaction positive/negative"
#     • "<kw> test reaction is positive/negative"
#     • "ONPG is negative" handled via core patterns
#     • "H2S production is positive/negative"
#     • "MR and VP negative/positive" → both set
#     • grouped phrases like
#         "gelatin and esculin hydrolysis negative"
#         "lysine, ornithine and arginine negative"
#       → all mentioned tests / sugars set to the given value
#
# - Decarboxylases:
#     • "all decarboxylases negative/positive"
#       → Lysine / Ornithine / Arginine dihydrolase set accordingly
#       (Unknown-only; explicit values can override later)
#
# - Capsule / Motility:
#     • "capsule present"/"capsule is present" → Capsule Positive
#     • "capsule absent"/"capsule is absent"/"no capsule" → Capsule Negative
#     • "encapsulated" / "capsulated" → Capsule Positive
#     • "gliding/spreading/swarming motility" → Motility Positive
#
# - Gelatin / Esculin:
#     • "gelatin positive/negative" → Gelatin Hydrolysis
#     • "esculin positive/negative" → Esculin Hydrolysis
#
# - Shape:
#     • "coccobacilli / coccobacillus" → Shape = Short Rods
#     • (no 4F shape descriptor explosion; we keep existing logic)
#
# - Haemolysis:
#     • alpha/beta/gamma haemolysis & haemolytic
#     • now also supports α / β / γ symbols via normalisation
# ------------------------------------------------------------

from __future__ import annotations

import re
from typing import Dict, Any, List


UNKNOWN = "Unknown"

# ------------------------------------------------------------
# Core fields and sugar mapping
# ------------------------------------------------------------

# Sugar name → core DB column
SUGAR_FIELDS: Dict[str, str] = {
    "glucose": "Glucose Fermentation",
    "lactose": "Lactose Fermentation",
    "sucrose": "Sucrose Fermentation",
    "maltose": "Maltose Fermentation",
    "mannitol": "Mannitol Fermentation",
    "sorbitol": "Sorbitol Fermentation",
    "xylose": "Xylose Fermentation",
    "rhamnose": "Rhamnose Fermentation",
    "arabinose": "Arabinose Fermentation",
    "raffinose": "Raffinose Fermentation",
    "trehalose": "Trehalose Fermentation",
    "inositol": "Inositol Fermentation",
}

CORE_BOOL_FIELDS: Dict[str, List[str]] = {
    # field: [keywords to recognise the test name]
    "Catalase": ["catalase"],
    "Oxidase": ["oxidase"],
    "Indole": ["indole"],
    "Urease": ["urease"],
    "Citrate": ["citrate"],
    # MR: include "mr"
    "Methyl Red": ["methyl red", "mr test", "mr"],
    "VP": ["voges-proskauer", "vp test", "vp"],
    # H2S (includes H₂S → normalised to H2S in _clean_text)
    "H2S": ["h2s", "hydrogen sulfide"],
    # DNase: broaden patterns
    "DNase": [
        "dnase",
        "dnase test",
        "dnase activity",
        "dnase production",
        "dnaase",
        "dna hydrolysis",
    ],
    "ONPG": ["onpg"],
    "Coagulase": ["coagulase"],
    "Lipase Test": ["lipase"],
    "Nitrate Reduction": ["nitrate reduction", "nitrate"],
    "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
    # Decarboxylases (also match plain amino acid words)
    "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
    "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
    "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
    # Gelatin / Esculin
    "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
    "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
}

# ------------------------------------------------------------
# Generic helpers
# ------------------------------------------------------------

def _clean_text(text: str) -> str:
    """
    Normalise unicode oddities and collapse whitespace.
    Also:
      - strip degree symbols
      - normalise subscript ₂ → 2 for H₂S
      - normalise α/β/γ to alpha/beta/gamma for haemolysis patterns
    """
    if not text:
        return ""
    s = text.replace("°", "").replace("º", "")
    # normalise subscript 2 (H₂S → H2S)
    s = s.replace("₂", "2")

    # Greek letters for haemolysis and related descriptors
    s = (
        s.replace("α", "alpha")
         .replace("β", "beta")
         .replace("γ", "gamma")
    )

    # collapse whitespace
    return " ".join(s.split())


def _norm(s: str) -> str:
    return s.strip().lower()


def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
    """
    Write value to parsed[field] if:
      - field not present, or
      - we are replacing Unknown with a concrete value
    """
    if not value:
        return
    if field not in parsed or parsed[field] == UNKNOWN:
        parsed[field] = value


def _value_from_pnv_token(token: str) -> str | None:
    """
    Map a simple token to Positive / Negative / Variable.
    """
    seg = _norm(token)
    if seg in ["positive", "pos", "+"]:
        return "Positive"
    if seg in ["negative", "neg", "-"]:
        return "Negative"
    if seg in ["variable", "var", "v"]:
        return "Variable"
    return None


def _value_from_pnv_context(segment: str) -> str | None:
    """
    Interpret a phrase as Positive / Negative / Variable.

    Handles:
      - "positive"
      - "is positive"
      - "+", "neg", etc.
    """
    seg = _norm(segment)
    # direct token first
    val = _value_from_pnv_token(seg)
    if val:
        return val
    # "... is positive"
    m = re.search(r"\bis\s+(positive|negative|variable|pos|neg|\+|\-)\b", seg)
    if m:
        return _value_from_pnv_token(m.group(1))
    return None


# ------------------------------------------------------------
# Gram stain and shape
# ------------------------------------------------------------

def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
    # Gram stain
    if "gram-positive" in text_lc or "gram positive" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Positive")
    elif "gram-negative" in text_lc or "gram negative" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Negative")
    elif "gram variable" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Variable")

    # Shape
    # Prefer "short rods" / coccobacilli over generic rods
    if "short rods" in text_lc:
        _set_if_stronger(parsed, "Shape", "Short Rods")

    # NEW: coccobacilli → Short Rods
    if re.search(r"\bcoccobacill(?:us|i)\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Short Rods")

    # Cocci and variants (diplococci, tetracocci, etc.)
    if re.search(r"\bcocci\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Cocci")
    if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Cocci")

    # Rods / bacilli
    if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
        _set_if_stronger(parsed, "Shape", "Rods")

    # Spiral
    if "spiral" in text_lc or "spirochete" in text_lc:
        _set_if_stronger(parsed, "Shape", "Spiral")


# ------------------------------------------------------------
# Haemolysis
# ------------------------------------------------------------

def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Handle haemolysis phrasing:
      - beta-haemolytic / beta hemolytic / beta-haemolysis / etc.
      - alpha- / gamma- / non-haemolytic
      - α / β / γ symbols are normalised to alpha/beta/gamma in _clean_text
    """
    # Beta
    if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
        _set_if_stronger(parsed, "Haemolysis Type", "Beta")
        _set_if_stronger(parsed, "Haemolysis", "Positive")

    # Alpha
    if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
        _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
        _set_if_stronger(parsed, "Haemolysis", "Positive")

    # Gamma / non-haemolytic
    if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
        _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
        _set_if_stronger(parsed, "Haemolysis", "Negative")
    if (
        "non-haemolytic" in text_lc
        or "non hemolytic" in text_lc
        or "non-hemolytic" in text_lc
    ):
        _set_if_stronger(parsed, "Haemolysis Type", "None")
        _set_if_stronger(parsed, "Haemolysis", "Negative")

    # Variable phrasing
    if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
        _set_if_stronger(parsed, "Haemolysis Type", "Variable")
        _set_if_stronger(parsed, "Haemolysis", "Variable")


# ------------------------------------------------------------
# Core enzyme / boolean tests
# ------------------------------------------------------------

def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    For each test in CORE_BOOL_FIELDS, look for patterns like:
      - "catalase positive"
      - "positive for catalase"
      - "catalase is positive"
      - "indole reaction is negative"
      - "indole reaction negative"
      - "indole test reaction is positive"
    Plus:
      - NaCl tolerance with % values
      - Nitrate reduction text
      - H2S production / non-production
      - DNase coverage
      - gelatinase / gelatin → Gelatin Hydrolysis
      - esculin → Esculin Hydrolysis
      - grouped MR/VP: "MR and VP negative"
      - decarboxylase global phrases
      - generic grouped phrases
        "gelatin and esculin hydrolysis negative"
        "lysine, ornithine and arginine negative"
    """
    for field, keywords in CORE_BOOL_FIELDS.items():
        for kw in keywords:
            # 1) "... catalase positive"
            m1 = re.search(
                rf"{re.escape(kw)}[ \-]?"
                r"(positive|negative|variable|pos|neg|\+|\-)",
                text_lc,
            )
            if m1:
                val = _value_from_pnv_context(m1.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

            # 2) "positive for catalase"
            m2 = re.search(
                rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
                rf"(for\s+)?{re.escape(kw)}",
                text_lc,
            )
            if m2:
                val = _value_from_pnv_context(m2.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

            # 3) "<kw> is positive"
            m3 = re.search(
                rf"{re.escape(kw)}\s+is\s+"
                r"(positive|negative|variable|pos|neg|\+|\-)",
                text_lc,
            )
            if m3:
                val = _value_from_pnv_token(m3.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

            # 4) "<kw> reaction is positive/negative"
            m4 = re.search(
                rf"{re.escape(kw)}\s+reaction\s+is\s+"
                r"(positive|negative|variable|pos|neg|\+|\-)",
                text_lc,
            )
            if m4:
                val = _value_from_pnv_token(m4.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

            # 5) "<kw> reaction positive/negative"
            m5 = re.search(
                rf"{re.escape(kw)}\s+reaction\s+"
                r"(positive|negative|variable|pos|neg|\+|\-)",
                text_lc,
            )
            if m5:
                val = _value_from_pnv_token(m5.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

            # 6) "<kw> test reaction is positive"
            m6 = re.search(
                rf"{re.escape(kw)}\s+test\s+reaction\s+is\s+"
                r"(positive|negative|variable|pos|neg|\+|\-)",
                text_lc,
            )
            if m6:
                val = _value_from_pnv_token(m6.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                    break

        # Special-case NaCl tolerance with explicit percentages
        if field == "NaCl Tolerant (>=6%)":
            # We scan the whole text for positive/negative NaCl evidence,
            # then decide once per description. Negative has highest priority.
            has_positive = False
            has_negative = False

            # --- Negative phrasing (highest priority) ---
            # "does not grow in 7% NaCl", "doesn't grow at 10% NaCl"
            if re.search(
                r"does\s+(?:not|n't)\s+grow\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
                text_lc,
            ):
                has_negative = True

            # "no growth in 6.5% NaCl", "no growth at 8% NaCl"
            if re.search(
                r"no\s+growth\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
                text_lc,
            ):
                has_negative = True

            # "no growth in NaCl" (no explicit %)
            if re.search(
                r"no\s+growth\s+in\s+nacl",
                text_lc,
            ):
                has_negative = True

            # "unable to grow in 7% NaCl", "unable to grow in NaCl"
            if re.search(
                r"unable\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl",
                text_lc,
            ):
                has_negative = True

            # semantic negatives without explicit %
            if re.search(r"cannot\s+tolerate\s+nacl", text_lc):
                has_negative = True
            if re.search(r"not\s+nacl\s+tolerant", text_lc):
                has_negative = True
            if re.search(r"nacl\s+sensitive", text_lc):
                has_negative = True
            if re.search(r"fails\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl", text_lc):
                has_negative = True
            if re.search(r"intolerant\s+to\s+nacl", text_lc):
                has_negative = True
            if re.search(r"no\s+tolerance\s+to\s+nacl", text_lc):
                has_negative = True
            if re.search(r"nacl\s+intolerance", text_lc):
                has_negative = True
            if re.search(r"no\s+growth\s+at\s+high\s+nacl", text_lc):
                has_negative = True

            # --- Positive phrasing (requires growth/tolerance verb + % ≥ 6) ---
            # e.g. "grows in 6.5% NaCl", "growth occurs at 10% NaCl"
            for m in re.finditer(
                r"(grows|growth occurs|growth observed|able to grow|tolerates|tolerant)\s+"
                r"(?:in|at|up to|to)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                text_lc,
            ):
                try:
                    conc = float(m.group(2))
                    if conc >= 6.0:
                        has_positive = True
                except Exception:
                    pass

            # e.g. "NaCl tolerant up to 10%", "NaCl tolerant to 8%"
            for m in re.finditer(
                r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
                text_lc,
            ):
                try:
                    conc = float(m.group(1))
                    if conc >= 6.0:
                        has_positive = True
                except Exception:
                    pass

            # Decide final value:
            #   Negative > Positive > Unknown
            if has_negative:
                # Negative explicitly overrides any previous value
                parsed["NaCl Tolerant (>=6%)"] = "Negative"
            elif has_positive:
                _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Positive")

    # Nitrate: "reduces nitrate" / "does not reduce nitrate"
    if re.search(r"reduces nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
    if re.search(r"does (not|n't) reduce nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")

    # H2S: "produces H2S", "H2S production", "H2S production is positive"
    if re.search(r"(produces|production of)\s+h2s", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(negative|neg|\-)", text_lc):
        _set_if_stronger(parsed, "H2S", "Negative")
    if (
        re.search(r"does (not|n't) produce\s+h2s", text_lc)
        or re.search(r"no h2s production", text_lc)
        or re.search(r"non[- ]h2s producing", text_lc)
    ):
        _set_if_stronger(parsed, "H2S", "Negative")

    # --- DNase universal coverage ---
    # Positive forms
    if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")

    if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")

    # Negative forms
    if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")

    if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")

    # non-DNase-producing
    if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")

    # --- MR and VP grouped: "MR and VP negative" ---
    mr_vp_pattern = re.compile(
        r"\b("
        r"mr(?: test)?|methyl red|"
        r"vp(?: test)?|voges-proskauer"
        r")\s*(?:test)?\s*(?:and|&)\s*( "
        r"mr(?: test)?|methyl red|"
        r"vp(?: test)?|voges-proskauer"
        r")\s+"
        r"(positive|negative|variable|pos|neg|\+|\-)"
    )
    for m in mr_vp_pattern.finditer(text_lc):
        name1 = m.group(1)
        name2 = m.group(2)
        val = _value_from_pnv_token(m.group(3))
        if not val:
            continue

        def _assign_mr_vp(name: str) -> None:
            n = name.lower()
            if "mr" in n or "methyl red" in n:
                _set_if_stronger(parsed, "Methyl Red", val)
            if "vp" in n or "voges" in n:
                _set_if_stronger(parsed, "VP", val)

        _assign_mr_vp(name1)
        _assign_mr_vp(name2)

    # --- Decarboxylases global "all decarboxylases negative/positive" ---
    m_all_decarb = re.search(
        r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_all_decarb:
        val = _value_from_pnv_token(m_all_decarb.group(1))
        if val:
            for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
                _set_if_stronger(parsed, f, val)

    # --- Generic grouped list logic for tests & sugars ---
    #
    # Handles things like:
    #   "gelatin and esculin hydrolysis negative"
    #   "lysine, ornithine and arginine negative"
    #   "indole, urease and citrate positive"
    #   "raffinose and inositol negative"
    #
    grouped_tests_pattern = re.compile(
        r"([a-z0-9 ,/&\-]+?)\s+"
        r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
        r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
    )

    for m in grouped_tests_pattern.finditer(text_lc):
        seg = m.group(1)
        val = _value_from_pnv_token(m.group(2))
        if not val:
            continue

        seg_lc = seg.lower()

        # Quick filter: does this segment contain any known test/sugar keyword?
        has_any = False

        for _, keywords in CORE_BOOL_FIELDS.items():
            if any(re.search(rf"\b{re.escape(kw)}\b", seg_lc) for kw in keywords):
                has_any = True
                break

        if not has_any:
            for sugar_key in SUGAR_FIELDS.keys():
                if re.search(rf"\b{sugar_key}\b", seg_lc):
                    has_any = True
                    break

        if not has_any:
            continue  # ignore segments unrelated to tests/sugars

        # Apply to all matching core boolean tests
        for field, keywords in CORE_BOOL_FIELDS.items():
            for kw in keywords:
                if re.search(rf"\b{re.escape(kw)}\b", seg_lc):
                    _set_if_stronger(parsed, field, val)
                    break

        # Apply to all matching sugars
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", seg_lc):
                _set_if_stronger(parsed, field, val)


# ------------------------------------------------------------
# Motility / Capsule / Spores
# ------------------------------------------------------------

def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
    # Motility
    if (
        re.search(r"\bmotile\b", text_lc)
        and not re.search(r"\bnon[- ]?motile\b", text_lc)
        and "nonmotile" not in text_lc
        and "immotile" not in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    if (
        "non-motile" in text_lc
        or "non motile" in text_lc
        or "nonmotile" in text_lc
        or "immotile" in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Negative")

    # Specific motility phrases: tumbling, swarming, corkscrew, gliding, spreading
    if (
        "tumbling motility" in text_lc
        or "swarming motility" in text_lc
        or "corkscrew motility" in text_lc
        or re.search(r"\b(gliding|spreading)\s+motility\b", text_lc)
        or ("swarming" in text_lc and "non-swarming" not in text_lc)
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    # Capsule (including "capsule positive/negative", present/absent)
    if (
        "capsulated" in text_lc
        or "encapsulated" in text_lc
        or "capsule present" in text_lc
        or re.search(r"capsule\s+is\s+present", text_lc)
        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
    ):
        _set_if_stronger(parsed, "Capsule", "Positive")

    if (
        "non-capsulated" in text_lc
        or "no capsule" in text_lc
        or "capsule absent" in text_lc
        or re.search(r"capsule\s+is\s+absent", text_lc)
        or re.search(r"capsule[ \-]?(negative|neg|\-)", text_lc)
    ):
        _set_if_stronger(parsed, "Capsule", "Negative")

    # Spore formation
    # NEGATIVE FIRST with strict boundaries, then early-return
    if (
        re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
        or "no spores" in text_lc
    ):
        _set_if_stronger(parsed, "Spore Formation", "Negative")
        return  # prevent any positive overwrite

    # POSITIVE (must not match the negative form)
    if (
        re.search(r"\bspore[-\s]?forming\b", text_lc)
        or "forms spores" in text_lc
    ):
        _set_if_stronger(parsed, "Spore Formation", "Positive")


# ------------------------------------------------------------
# Oxygen requirement
# ------------------------------------------------------------

def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Robust oxygen parsing:
      - Handle facultative first
      - Avoid "aerobic" accidentally matching inside "anaerobic"
      - Include "aerobically" / "anaerobically"
    """
    # Facultative first
    if re.search(r"facultative(ly)? anaerob", text_lc):
        _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")

    # Strict anaerobic (before aerobic)
    if (
        re.search(r"\bobligate anaerob", text_lc)
        or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
        or re.search(r"\banaerobically\b", text_lc)
    ):
        _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")

    # Now handle purely aerobic, avoiding "anaerobic"
    if (
        re.search(r"\bobligate aerobe\b", text_lc)
        or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
        or (
            re.search(r"\baerobically\b", text_lc)
            and "anaerobically" not in text_lc
        )
    ):
        _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")

    if "microaerophilic" in text_lc or "microaerophile" in text_lc:
        _set_if_stronger(parsed, "Oxygen Requirement", "Microaerophilic")

    if "capnophilic" in text_lc or "co2" in text_lc:
        _set_if_stronger(parsed, "Oxygen Requirement", "Capnophilic")


# ------------------------------------------------------------
# Growth temperature
# ------------------------------------------------------------

def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
    We ALWAYS store as "low//high":
      - true ranges: "4-45 °C" → "4//45"
      - "grows between 30 and 37 °C" → "30//37"
      - "grows at 30–37 °C" → "30//37"
      - two temps in text: min//max (Option A)
      - single temps: "37 °C" → "37//37"
    """
    # 0) Explicit "between X and Y" ranges
    between_pattern = re.compile(
        r"between\s+(\d+)\s*(?:c|°c|degrees c|degrees celsius)?"
        r"\s*(?:and|to|-)\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)?"
    )
    m_between = between_pattern.search(text_lc)
    if m_between:
        low = m_between.group(1)
        high = m_between.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
        return

    # 1) Explicit ranges like "4-45 °C" or "10–40 °C"
    range_pattern = re.compile(
        r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
    )
    m_range = range_pattern.search(text_lc)
    if m_range:
        low = m_range.group(1)
        high = m_range.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
        return

    # 2) Any two explicit temps → min//max
    temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
    if len(temps) >= 2:
        nums = [int(t) for t in temps]
        low = min(nums)
        high = max(nums)
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
        return

    # 3) Single temps like "grows at 37 c"
    single_pattern = re.compile(
        r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*"
        r"(?:c|°c|degrees c|degrees celsius)"
    )
    m_single = single_pattern.search(text_lc)
    if m_single:
        temp = m_single.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
        return

    # 4) Simplified: "grows at 37" (no explicit °C)
    m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
    if m_simple_num:
        temp = m_simple_num.group(1)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
        return

    # 5) Fallback: plain "37c" somewhere in the text
    m_plain = re.search(
        r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
        text_lc,
    )
    if m_plain:
        temp = m_plain.group(1)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")


# ------------------------------------------------------------
# Media grown on (coarse mapping)
# ------------------------------------------------------------

MEDIA_KEYWORDS = {
    "Blood Agar": [
        "blood agar",
        "blood-agar",
    ],
    "MacConkey Agar": [
        "macconkey agar",
        "mac conkey agar",
        "macconkey",
    ],
    "Chocolate Agar": [
        "chocolate agar",
        "chocolate-agar",
    ],
    "Nutrient Agar": [
        "nutrient agar",
        "nutrient-agar",
        "nut agar",
    ],
    "XLD Agar": [
        "xld agar",
        "xld",
    ],
    "TCBS Agar": [
        "tcbs agar",
        "tcbs",
    ],
    "ALOA": [
        "aloa agar",
        "aloa",
    ],
    "BCYE Agar": [
        "bcye agar",
        "bcye",
        "Buffered Charcoal Yeast Extract Agar",
        "buffered charcoal yeast extract agar"
    ],
    "MRS Agar": [
        "mrs agar",
    ],
    "Mannitol Salt Agar": [
        "msa agar",
        "ms agar",
    ],
    "Cycloserine Cefoxitin Fructose Agar": [
        "ccfa agar",
        "cycloserine cefoxitin fructose agar",
        "ccf agar",
    ],
    "Thayer Martin Agar": [
        "thayer martin agar",
        "tma agar",
        "tma",
    ],
    "Bordet-Gengou Agar": [
        "bordet gengou agar",
    ],
    "Cetrimide Agar": [
        "cetrimide agar",
    ],
    "Anaerobic Agar": [
        "anaerobic agar",
    ],
    "Anaerobic Blood Agar": [
        "anaerobic blood agar",
    ],
    "Hektoen Enteric Agar": [
        "hektoen enteric agar",
        "HK Agar",
        "hk",
    ],
    "Tryptic Soy Agar": [
        "tryptic soy agar",
        "t-soy agar",
        "tsoy",
    ],
    "Brucella Agar": [
        "brucella agar",
    ],
    "Charcoal Agar": [
        "charcoal agar",
    ],
    "Yeast Extract Mannitol Agar": [
        "yeast extract mannitol agar",
    ],
    "Sabouraud Agar": [
        "sabouraud agar",
        "sabouraud dextrose agar",
    ],
    "BHI": [
        "bhi",
        "brain heart infusion agar",
        "brain heart infusion",
    ],
    "Columbia Blood Agar": [
        "columbia blood agar",
        "columbia agar",
        "columbia",
    ],
    "Lowenstein-Jensen Agar": [
        "lowenstein-jensen agar",
        "lowenstein jensen agar",
    ],
    "BSK Medium": [
        "bsk medium",
        "bsk",
        "bsk-ii medium",
        "bsk-h medium",
    ],
    "Ashby Agar": [
        "ashby agar",
        "ashby medium",
    ]
}


def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
    found_media: List[str] = []
    for media_name, patterns in MEDIA_KEYWORDS.items():
        for p in patterns:
            if p in text_lc and media_name not in found_media:
                found_media.append(media_name)

    if found_media:
        _set_if_stronger(parsed, "Media Grown On", "; ".join(found_media))


# ------------------------------------------------------------
# Sugar fermentation parsing
# ------------------------------------------------------------

def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Handles patterns like:
      - "glucose positive, mannitol negative"
      - "ferments glucose, mannitol and sucrose but not lactose"
      - "does not ferment lactose or sucrose"
      - "non-lactose fermenter"
      - "<sugar> fermenter" (positive unless "non-<sugar> fermenter")
      - "<sugar> is positive/negative"
      - "<sugar> fermentation is positive/negative"
      - global non-fermenter phrases
      - "asaccharolytic" → all sugars Negative (Unknown-only)
      - "all other sugars negative" → remaining sugars Negative
    """

    # 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
    for sugar_key, field in SUGAR_FIELDS.items():
        # "glucose positive"
        m_simple = re.search(
            rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
            text_lc,
        )
        if m_simple:
            val = _value_from_pnv_context(m_simple.group(1))
            if val:
                _set_if_stronger(parsed, field, val)

        # "<sugar> is positive"
        m_is = re.search(
            rf"{sugar_key}\s+is\s+(positive|negative|variable|pos|neg|\+|\-)",
            text_lc,
        )
        if m_is:
            val = _value_from_pnv_token(m_is.group(1))
            if val:
                _set_if_stronger(parsed, field, val)

    # 0b) "<sugar> fermenter" vs "non-<sugar> fermenter"
    for sugar_key, field in SUGAR_FIELDS.items():
        # positive: "lactose fermenter"
        if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
            rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
        ):
            _set_if_stronger(parsed, field, "Positive")

        # negative: "non-lactose fermenter"
        if re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc):
            _set_if_stronger(parsed, field, "Negative")

    # 1) "ferments X, Y and Z but not A, B"
    ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
    for m in ferments_pattern.finditer(text_lc):
        seg = m.group(1)
        # Split positive vs negative part on "but not"
        neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
        pos_part = neg_split[0]
        neg_part = neg_split[1] if len(neg_split) > 1 else ""

        # Positive sugars from pos_part
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", pos_part):
                _set_if_stronger(parsed, field, "Positive")

        # Negative sugars from neg_part
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", neg_part):
                _set_if_stronger(parsed, field, "Negative")

    # 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
    #    Prevents glucose being accidentally marked negative in:
    #      "does not ferment lactose or sucrose, but glucose fermentation is positive"
    grouped_neg_pattern = re.compile(
        r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
    )
    for m in grouped_neg_pattern.finditer(text_lc):
        seg = m.group(1)
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", seg):
                _set_if_stronger(parsed, field, "Negative")

    # 3) Single "does not ferment X"
    for sugar_key, field in SUGAR_FIELDS.items():
        if re.search(
            rf"does\s+(?:not|n't)\s+ferment\s+{sugar_key}\b", text_lc
        ):
            _set_if_stronger(parsed, field, "Negative")

    # 4) "non-lactose fermenter" and similar
    for sugar_key, field in SUGAR_FIELDS.items():
        if re.search(
            rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc
        ):
            _set_if_stronger(parsed, field, "Negative")

    # 5) "<sugar> fermentation positive/negative" + "is positive"
    for sugar_key, field in SUGAR_FIELDS.items():
        # "glucose fermentation positive"
        m1 = re.search(
            rf"{sugar_key}\s+fermentation[ \-]?"
            r"(positive|negative|variable|pos|neg|\+|\-)",
            text_lc,
        )
        if m1:
            val = _value_from_pnv_context(m1.group(1))
            if val:
                _set_if_stronger(parsed, field, val)
                continue

        # "positive for glucose fermentation"
        m2 = re.search(
            rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
            rf"(for\s+)?{sugar_key}\s+fermentation",
            text_lc,
        )
        if m2:
            val = _value_from_pnv_context(m2.group(1))
            if val:
                _set_if_stronger(parsed, field, val)
                continue

        # "<sugar> fermentation is positive/negative"
        m3 = re.search(
            rf"{sugar_key}\s+fermentation\s+is\s+"
            r"(positive|negative|variable|pos|neg|\+|\-)",
            text_lc,
        )
        if m3:
            val = _value_from_pnv_token(m3.group(1))
            if val:
                _set_if_stronger(parsed, field, val)
                continue

    # 6) Global non-fermenter phrases
    #     e.g. "non-fermenter", "does not ferment sugars"
    #     → set all sugars Negative *unless* already set by a more specific rule.
    if (
        re.search(
            r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
        )
        or re.search(r"\bnon[- ]ferment(er|ing|ative)\b", text_lc)
    ):
        for field in SUGAR_FIELDS.values():
            if field not in parsed or parsed[field] == UNKNOWN:
                _set_if_stronger(parsed, field, "Negative")

    # 7) Asaccharolytic → all sugars Negative (Unknown-only)
    if (
        "asaccharolytic" in text_lc
        or "non-saccharolytic" in text_lc
        or "non saccharolytic" in text_lc
    ):
        for field in SUGAR_FIELDS.values():
            if field not in parsed or parsed[field] == UNKNOWN:
                _set_if_stronger(parsed, field, "Negative")

    # 8) "all other sugars negative/positive"
    m_other = re.search(
        r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_other:
        val = _value_from_pnv_token(m_other.group(1))
        if val:
            for field in SUGAR_FIELDS.values():
                if field not in parsed or parsed[field] == UNKNOWN:
                    _set_if_stronger(parsed, field, val)


# ------------------------------------------------------------
# Colony morphology (coarse, optional)
# ------------------------------------------------------------

def _normalise_colony_desc(desc: str) -> str:
    """
    Take a raw colony descriptor and normalise into:
      "Smooth; Yellow; Opaque" etc.

    Tweaks:
      - Remove "-pigmented" → "yellow-pigmented" → "yellow"
      - Treat "and" like a separator for parts
    """
    # Remove "-pigmented" so "yellow-pigmented" → "yellow"
    tmp = desc.replace("-pigmented", "")

    # Normalise "and" to a comma so it acts like a separator
    tmp = tmp.replace(" and ", ", ")

    parts = [s.strip() for s in re.split(r"[;,]", tmp) if s.strip()]
    pretty = "; ".join(p.capitalize() for p in parts)
    return pretty


def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Very coarse mapping for colony morphology. We try:
      - "colonies are yellow, mucoid"
      - "colonies dry, white and irregular on nutrient agar"
      - "forming smooth, yellow-pigmented, opaque colonies"
      - "grey colonies", "large grey colonies" (no verb)
    """

    # Pattern 1: "colonies are ..."
    m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
    if m:
        desc = m.group(3).strip()
        if desc:
            pretty = _normalise_colony_desc(desc)
            if pretty:
                _set_if_stronger(parsed, "Colony Morphology", pretty)
                return

    # Pattern 2: "colonies dry, white and irregular on nutrient agar"
    m2 = re.search(
        r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
        text_lc,
    )
    if m2:
        desc = m2.group(1).strip()
        if desc:
            pretty = _normalise_colony_desc(desc)
            if pretty:
                _set_if_stronger(parsed, "Colony Morphology", pretty)
                return

    # Pattern 3: "forming green colonies", "forms mucoid colonies",
    #            "forming smooth, yellow-pigmented, opaque colonies"
    m3 = re.search(
        r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
        text_lc,
    )
    if m3:
        desc = m3.group(2).strip()
        if desc:
            pretty = _normalise_colony_desc(desc)
            if pretty:
                _set_if_stronger(parsed, "Colony Morphology", pretty)
                return

    # Pattern 4: plain descriptor before "colonies" (e.g. "grey colonies",
    # "large grey colonies") when none of the above match.
    m4 = re.search(
        r"\b([a-z0-9 ,;\-]+?)\s+colonies\b",
        text_lc,
    )
    if m4:
        desc = m4.group(1).strip()
        if desc:
            pretty = _normalise_colony_desc(desc)
            if pretty:
                _set_if_stronger(parsed, "Colony Morphology", pretty)
                return

def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
    # ----------------------------------------------
    # helper for P/N/V
    # ----------------------------------------------
    def _pnv(x: str) -> Optional[str]:
        x = x.strip().lower()
        if x in {"positive", "pos", "+", "strongly positive", "weakly positive"}:
            return "Positive"
        if x in {"negative", "neg", "-", "no"}:
            return "Negative"
        if x in {"variable", "var", "mixed"}:
            return "Variable"
        return None

    # ============================================================
    #  NEW LOGIC: Haemolysis Type detection (alpha/beta/none)
    # ============================================================

    # alpha
    m_alpha = re.search(r"(alpha|α)[-\s]*haemolysis", text_lc) or \
              re.search(r"haemolysis type[: ]*(alpha|α)", text_lc)
    if m_alpha:
        if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis"] = "Positive"
        if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis Type"] = "Alpha"

    # beta
    m_beta = re.search(r"(beta|β)[-\s]*haemolysis", text_lc) or \
             re.search(r"haemolysis type[: ]*(beta|β)", text_lc)
    if m_beta:
        if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis"] = "Positive"
        if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis Type"] = "Beta"

    # gamma / none
    m_gamma = re.search(r"(gamma|γ)[-\s]*haemolysis", text_lc)
    m_none = re.search(r"(no haemolysis|non[- ]haemolytic|no hemolysis|non[- ]hemolytic)", text_lc)
    if m_gamma or m_none:
        if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis"] = "Negative"
        if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
            parsed["Haemolysis Type"] = "None"

    # ============================================================
    # ORIGINAL PATCH v1 LOGIC (fully preserved)
    # ============================================================

    # 1. Haemolysis: generic ± without type
    m_h = re.search(r"haemolysis\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
    if m_h and "Haemolysis" not in parsed:
        val = _pnv(m_h.group(1))
        if val:
            parsed["Haemolysis"] = val
            if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN and val == "Positive":
                parsed["Haemolysis Type"] = "Unknown"

    # 2. Motility: generic ±
    m_mot = re.search(r"motility\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
    if m_mot and "Motility" not in parsed:
        val = _pnv(m_mot.group(1))
        if val:
            parsed["Motility"] = val

    # 3. Spore formation ±
    m_sp = re.search(r"spore formation\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
    if m_sp and parsed.get("Spore Formation", UNKNOWN) == UNKNOWN:
        val = _pnv(m_sp.group(1))
        if val:
            parsed["Spore Formation"] = val

    # ============================================================
    #  FIXED NaCl tolerant logic (patch upgrade)
    # ============================================================
    if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:

        # direct p/n/v
        m_nacl = re.search(
            r"(?:nacl\s*(?:tolerant|tolerance)?|growth\s+in\s+6\%[\s]*nacl)"
            r"\s*(positive|negative|variable|pos|neg|\+|\-)",
            text_lc
        )
        if m_nacl:
            val = _pnv(m_nacl.group(1))
            if val:
                parsed["NaCl Tolerant (>=6%)"] = val

        # "no growth in 6% nacl"
        if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
            if re.search(r"no\s+growth\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
                parsed["NaCl Tolerant (>=6%)"] = "Negative"

        # "grows in 6% nacl"
        if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
            if re.search(r"grows?\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
                parsed["NaCl Tolerant (>=6%)"] = "Positive"

    # ============================================================
    #  Growth Temperature patterns (20/40, 20//40, 20 / 40)
    # ============================================================
    m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
    if m_temp and parsed.get("Growth Temperature", UNKNOWN) == UNKNOWN:
        parsed["Growth Temperature"] = f"{m_temp.group(1)}//{m_temp.group(2)}"

    # ============================================================
    #  Colony Morphology STRICT LIST extraction
    # ============================================================
    COLONY_TRIGGERS = [
        "colony morphology",
        "colonies are",
        "colonies appear",
        "colonies look",
        "colony appearance",
        "colony characteristics",
    ]
    if any(t in text_lc for t in COLONY_TRIGGERS):
        m_col = re.search(
            r"(?:colony morphology|colonies are|colonies appear|colonies look|colony appearance|colony characteristics)"
            r"[: ]+([a-z0-9 ,;/\-]+)",
            text_lc
        )
        if m_col:
            segment = m_col.group(1)
            parts = [x.strip() for x in re.split(r"[;,/]", segment) if x.strip()]

            clean_desc = [p.capitalize() for p in parts if len(p) > 1]

            if clean_desc:
                existing = parsed.get("Colony Morphology", "")
                existing_list = [x.strip() for x in existing.split(";")] if existing else []

                merged = []
                for x in existing_list:
                    if x not in merged:
                        merged.append(x)
                for x in clean_desc:
                    if x not in merged:
                        merged.append(x)

                parsed["Colony Morphology"] = "; ".join(merged)

    # ============================================================
    #  ORIGINAL MULTI-MEDIA PATCH (unchanged)
    # ============================================================
    if "media grown on" in text_lc or "grown on" in text_lc:
        mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
        if mm:
            segment = mm.group(1)
            raw_items = re.split(r"[;,]", segment)
            raw_items = [x.strip() for x in raw_items if x.strip()]

            detected_media = []
            for item in raw_items:
                for media_name, patterns in MEDIA_KEYWORDS.items():
                    for p in patterns:
                        if p in item and media_name not in detected_media:
                            detected_media.append(media_name)

            if detected_media:
                existing = parsed.get("Media Grown On", "")
                existing_list = [x.strip() for x in existing.split(";")] if existing else []

                merged = []
                for m in existing_list:
                    if m not in merged:
                        merged.append(m)
                for m in detected_media:
                    if m not in merged:
                        merged.append(m)

                parsed["Media Grown On"] = "; ".join(merged)

    return parsed

# ------------------------------------------------------------
# PUBLIC API
# ------------------------------------------------------------

def parse_text_rules(text: str) -> Dict[str, Any]:
    """
    Main entry point for the rule-based core parser.
    """
    original = text or ""
    text_clean = _clean_text(original)
    text_lc = text_clean.lower()

    parsed: Dict[str, str] = {}

    try:
        _parse_gram_and_shape(text_lc, parsed)
        _parse_haemolysis(text_lc, parsed)
        _parse_core_bool_tests(text_lc, parsed)
        _parse_motility_capsule_spores(text_lc, parsed)
        _parse_oxygen(text_lc, parsed)
        _parse_growth_temperature(text_lc, parsed)
        _parse_media(text_lc, parsed)
        _parse_sugars(text_lc, parsed)
        _parse_colony(text_lc, parsed)
        parsed = _apply_patches(original, text_lc, parsed)
        
        return {
            "parsed_fields": parsed,
            "source": "rule_parser",
            "raw": original,
        }

    except Exception as e:
        # Fail-safe: never crash the app, just report an error
        return {
            "parsed_fields": parsed,
            "source": "rule_parser",
            "raw": original,
            "error": f"{type(e).__name__}: {e}",
        }