Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Nov 17, 2025

Commit

82eb007

verified ·

1 Parent(s): 596952c

Update engine/parser_rules.py

Browse files

Files changed (1) hide show

engine/parser_rules.py +116 -210

engine/parser_rules.py CHANGED Viewed

@@ -2,7 +2,8 @@
 # ------------------------------------------------------------
 # Rule-based core parser for microbiology descriptions.
 #
-# Stage 11F (Option A ranges + fixes) + Stage 11H additions:
 # - Always store Growth Temperature as "low//high"
 #   • single: 37 → "37//37"
 #   • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
@@ -12,9 +13,19 @@
 # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
 # - NaCl tolerance phrases improved
 # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
-# - NEW (11H):
 #   • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
 #   • "<sugar> fermenter" → <Sugar> Fermentation = Positive
 # ------------------------------------------------------------
 from __future__ import annotations
@@ -29,7 +40,6 @@ UNKNOWN = "Unknown"
 # Core fields and sugar mapping
 # ------------------------------------------------------------
-# Sugar name → core DB column
 SUGAR_FIELDS: Dict[str, str] = {
     "glucose": "Glucose Fermentation",
     "lactose": "Lactose Fermentation",
@@ -46,25 +56,17 @@ SUGAR_FIELDS: Dict[str, str] = {
 }
 CORE_BOOL_FIELDS: Dict[str, List[str]] = {
-    # field: [keywords to recognise the test name]
     "Catalase": ["catalase"],
     "Oxidase": ["oxidase"],
     "Indole": ["indole"],
     "Urease": ["urease"],
     "Citrate": ["citrate"],
-    # MR: include "mr"
     "Methyl Red": ["methyl red", "mr test", "mr"],
     "VP": ["voges-proskauer", "vp test", "vp"],
-    # H2S (includes H₂S → normalised to H2S in _clean_text)
     "H2S": ["h2s", "hydrogen sulfide"],
-    # DNase: broaden patterns
     "DNase": [
-        "dnase",
-        "dnase test",
-        "dnase activity",
-        "dnase production",
-        "dnaase",
-        "dna hydrolysis",
     ],
     "ONPG": ["onpg"],
     "Coagulase": ["coagulase"],
@@ -75,7 +77,6 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
     "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
     "Arginine dihydrolase": ["arginine dihydrolase"],
     "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
-    # Esculin Hydrolysis: also match plain "esculin"
     "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
 }
@@ -84,42 +85,23 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
 # ------------------------------------------------------------
 def _clean_text(text: str) -> str:
-    """
-    Normalise a few unicode oddities and collapse whitespace.
-    Also:
-      - strip degree symbols
-      - normalise subscript ₂ → 2 for H₂S
-    """
     if not text:
         return ""
     s = text.replace("°", "").replace("º", "")
-    # normalise subscript 2 (H₂S → H2S)
     s = s.replace("₂", "2")
-    # keep dashes as-is; regexes handle - and – explicitly
     return " ".join(s.split())
 def _norm(s: str) -> str:
     return s.strip().lower()
 def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
-    """
-    Write value to parsed[field] if:
-      - field not present, or
-      - we are replacing Unknown with a concrete value
-    """
     if not value:
         return
     if field not in parsed or parsed[field] == UNKNOWN:
         parsed[field] = value
-def _value_from_pnv_context(segment: str) -> str | None:
-    """
-    Interpret a short phrase as Positive / Negative / Variable.
-    """
-    seg = _norm(segment)
     if seg in ["positive", "pos", "+"]:
         return "Positive"
     if seg in ["negative", "neg", "-"]:
@@ -128,13 +110,21 @@ def _value_from_pnv_context(segment: str) -> str | None:
         return "Variable"
     return None
 # ------------------------------------------------------------
 # Gram stain and shape
 # ------------------------------------------------------------
 def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
-    # Gram stain
     if "gram-positive" in text_lc or "gram positive" in text_lc:
         _set_if_stronger(parsed, "Gram Stain", "Positive")
     elif "gram-negative" in text_lc or "gram negative" in text_lc:
@@ -142,50 +132,37 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
     elif "gram variable" in text_lc:
         _set_if_stronger(parsed, "Gram Stain", "Variable")
-    # Shape
-    # Prefer "short rods" over generic rods
     if "short rods" in text_lc:
         _set_if_stronger(parsed, "Shape", "Short Rods")
-    # Cocci and variants (diplococci, tetracocci, etc.)
     if re.search(r"\bcocci\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
     if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
-    # Rods / bacilli
     if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
         _set_if_stronger(parsed, "Shape", "Rods")
-    # Spiral
     if "spiral" in text_lc or "spirochete" in text_lc:
         _set_if_stronger(parsed, "Shape", "Spiral")
 # ------------------------------------------------------------
 # Haemolysis
 # ------------------------------------------------------------
 def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Handle haemolysis phrasing:
-      - beta-haemolytic / beta hemolytic / beta-haemolysis / etc.
-      - alpha- / gamma- / non-haemolytic
-    """
-    # Beta
     if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Beta")
         _set_if_stronger(parsed, "Haemolysis", "Positive")
-    # Alpha
     if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
         _set_if_stronger(parsed, "Haemolysis", "Positive")
-    # Gamma / non-haemolytic
     if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
         _set_if_stronger(parsed, "Haemolysis", "Negative")
     if (
         "non-haemolytic" in text_lc
         or "non hemolytic" in text_lc
@@ -194,32 +171,20 @@ def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
         _set_if_stronger(parsed, "Haemolysis Type", "None")
         _set_if_stronger(parsed, "Haemolysis", "Negative")
-    # Variable phrasing
     if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
         _set_if_stronger(parsed, "Haemolysis Type", "Variable")
         _set_if_stronger(parsed, "Haemolysis", "Variable")
 # ------------------------------------------------------------
-# Boolean test parser (core enzyme tests etc.)
 # ------------------------------------------------------------
 def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    For each test in CORE_BOOL_FIELDS, look for patterns like:
-      "catalase positive", "positive for catalase", etc.
-    Also handles:
-      - NaCl tolerance with % values
-      - Nitrate reduction text
-      - H2S production / non-production
-      - DNase universal coverage
-      - NEW (11H): explicit gelatinase → Gelatin Hydrolysis mapping
-    """
     for field, keywords in CORE_BOOL_FIELDS.items():
         for kw in keywords:
-            # "... catalase positive"
             m1 = re.search(
-                rf"{re.escape(kw)}[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
                 text_lc,
             )
             if m1:
@@ -228,9 +193,9 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
                     _set_if_stronger(parsed, field, val)
                     break
-            # "positive for catalase"
             m2 = re.search(
-                rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{re.escape(kw)}",
                 text_lc,
             )
             if m2:
@@ -239,9 +204,18 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
                     _set_if_stronger(parsed, field, val)
                     break
-        # Special-case NaCl tolerance with explicit percentages
         if field == "NaCl Tolerant (>=6%)":
-            # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
             for m in re.finditer(
                 r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
@@ -253,7 +227,6 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
                 except Exception:
                     pass
-            # e.g. "NaCl tolerant up to 10%"
             for m in re.finditer(
                 r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
                 text_lc,
@@ -265,14 +238,12 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
                 except Exception:
                     pass
-            # explicit negative phrasing: "does not grow in 7% NaCl"
             if re.search(
                 r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
             ):
                 _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
-            # general "in 6.5% NaCl" → assume tolerance if no explicit "no growth"
             for m in re.finditer(
                 r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
@@ -284,16 +255,19 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
                 except Exception:
                     pass
-    # Nitrate: "reduces nitrate" / "does not reduce nitrate"
     if re.search(r"reduces nitrate", text_lc):
         _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
     if re.search(r"does (not|n't) reduce nitrate", text_lc):
         _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
-    # H2S: "produces H2S", "H2S production", "does not produce H2S",
-    #       "non-H2S producing"
     if re.search(r"(produces|production of)\s+h2s", text_lc):
         _set_if_stronger(parsed, "H2S", "Positive")
     if (
         re.search(r"does (not|n't) produce\s+h2s", text_lc)
         or re.search(r"no h2s production", text_lc)
@@ -301,30 +275,21 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
     ):
         _set_if_stronger(parsed, "H2S", "Negative")
-    # --- DNase universal coverage ---
-    # Positive forms
     if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Positive")
     if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Positive")
-    # Negative forms
     if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
     if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
-    # non-DNase-producing
     if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
-    # --- NEW: Gelatinase → Gelatin Hydrolysis ---
-    # Explicit mapping just in case generic patterns miss it
     if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
         _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
     if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
         _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
@@ -334,7 +299,6 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
 def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
-    # Motility
     if (
         re.search(r"\bmotile\b", text_lc)
         and not re.search(r"\bnon[- ]?motile\b", text_lc)
@@ -351,7 +315,6 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
     ):
         _set_if_stronger(parsed, "Motility", "Negative")
-    # Specific motility phrases: tumbling, swarming, corkscrew
     if (
         "tumbling motility" in text_lc
         or "swarming motility" in text_lc
@@ -360,7 +323,6 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
     ):
         _set_if_stronger(parsed, "Motility", "Positive")
-    # Capsule (including "capsule positive/negative")
     if (
         "capsulated" in text_lc
         or "encapsulated" in text_lc
@@ -376,16 +338,13 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
     ):
         _set_if_stronger(parsed, "Capsule", "Negative")
-    # Spore formation
-    # NEGATIVE FIRST with strict boundaries, then early-return
     if (
         re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
         or "no spores" in text_lc
     ):
         _set_if_stronger(parsed, "Spore Formation", "Negative")
-        return  # prevent any positive overwrite
-    # POSITIVE (must not match the negative form)
     if (
         re.search(r"\bspore[-\s]?forming\b", text_lc)
         or "forms spores" in text_lc
@@ -398,17 +357,9 @@ def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None
 # ------------------------------------------------------------
 def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Robust oxygen parsing:
-      - Handle facultative first
-      - Avoid "aerobic" accidentally matching inside "anaerobic"
-      - Include "aerobically" / "anaerobically"
-    """
-    # Facultative first
     if re.search(r"facultative(ly)? anaerob", text_lc):
         _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
-    # Strict anaerobic (before aerobic)
     if (
         re.search(r"\bobligate anaerob", text_lc)
         or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
@@ -416,17 +367,10 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
     ):
         _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
-    # Now handle purely aerobic, avoiding "anaerobic"
     if (
         re.search(r"\bobligate aerobe\b", text_lc)
-        or (
-            re.search(r"\baerobic\b", text_lc)
-            and "anaerobic" not in text_lc
-        )
-        or (
-            re.search(r"\baerobically\b", text_lc)
-            and "anaerobically" not in text_lc
-        )
     ):
         _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
@@ -438,20 +382,10 @@ def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
-# Growth temperature
 # ------------------------------------------------------------
 def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
-    We ALWAYS store as "low//high":
-      - true ranges: "4-45 °C" → "4//45"
-      - two temps in text: min//max (Option A), e.g.:
-            "grows at 4 °C but not at 45 °C" → "4//45"
-            "grows at 42 °C but not at 25 °C" → "25//42"
-      - single temps: "37 °C" → "37//37"
-    """
-    # 1) Explicit ranges like "4-45 °C" or "10–40 °C"
     range_pattern = re.compile(
         r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
     )
@@ -462,7 +396,6 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
         _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
         return
-    # 2) Option A: any two explicit temps → min//max
     temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
     if len(temps) >= 2:
         nums = [int(t) for t in temps]
@@ -471,9 +404,9 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
         _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
         return
-    # 3) Single temps like "grows at 37 c"
     single_pattern = re.compile(
-        r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
     )
     m_single = single_pattern.search(text_lc)
     if m_single:
@@ -481,14 +414,12 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
         _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
         return
-    # 4) Simplified: "grows at 37" (no explicit °C)
     m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
     if m_simple_num:
         temp = m_simple_num.group(1)
         _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
         return
-    # 5) Fallback: plain "37c" somewhere in the text
     m_plain = re.search(
         r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
         text_lc,
@@ -499,48 +430,21 @@ def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
-# Media grown on (coarse mapping)
 # ------------------------------------------------------------
 MEDIA_KEYWORDS = {
-    "Blood Agar": [
-        "blood agar",
-        "blood-agar",
-    ],
-    "MacConkey Agar": [
-        "macconkey agar",
-        "mac conkey agar",
-        "macconkey",
-    ],
-    "Chocolate Agar": [
-        "chocolate agar",
-        "chocolate-agar",
-    ],
-    "Nutrient Agar": [
-        "nutrient agar",
-        "nutrient-agar",
-    ],
-    "XLD Agar": [
-        "xld agar",
-    ],
-    "TCBS Agar": [
-        "tcbs agar",
-        "tcbs",
-    ],
-    "ALOA": [
-        "aloa agar",
-        "aloa",
-    ],
-    "BCYE Agar": [
-        "bcye agar",
-        "bcye",
-    ],
-    "MRS Agar": [
-        "mrs agar",
-    ],
 }
 def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
     found_media: List[str] = []
     for media_name, patterns in MEDIA_KEYWORDS.items():
@@ -553,73 +457,76 @@ def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
-# Sugar fermentation parsing
 # ------------------------------------------------------------
 def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Handles patterns like:
-      - "glucose positive, mannitol negative"
-      - "ferments glucose, mannitol and sucrose but not lactose"
-      - "does not ferment lactose"
-      - "non-lactose fermenter"
-      - "<sugar> fermenter" (positive)
-      - global non-fermenter phrases
-    """
-    # 0) Simple "glucose positive / negative" style
     for sugar_key, field in SUGAR_FIELDS.items():
         m_simple = re.search(
-            rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
-            text_lc,
         )
         if m_simple:
             val = _value_from_pnv_context(m_simple.group(1))
             if val:
                 _set_if_stronger(parsed, field, val)
-    # 0b) NEW: "<sugar> fermenter" → Positive (unless "non-<sugar> fermenter")
     for sugar_key, field in SUGAR_FIELDS.items():
-        # positive: "lactose fermenter"
         if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
             rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
         ):
             _set_if_stronger(parsed, field, "Positive")
-    # 1) "ferments X, Y and Z but not A, B"
     ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
     for m in ferments_pattern.finditer(text_lc):
         seg = m.group(1)
-        # Split positive vs negative part on "but not"
         neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
         pos_part = neg_split[0]
         neg_part = neg_split[1] if len(neg_split) > 1 else ""
-        # Positive sugars from pos_part
         for sugar_key, field in SUGAR_FIELDS.items():
             if re.search(rf"\b{sugar_key}\b", pos_part):
                 _set_if_stronger(parsed, field, "Positive")
-        # Negative sugars from neg_part
         for sugar_key, field in SUGAR_FIELDS.items():
             if re.search(rf"\b{sugar_key}\b", neg_part):
                 _set_if_stronger(parsed, field, "Negative")
-    # 2) "does not ferment X"
     for sugar_key, field in SUGAR_FIELDS.items():
-        if re.search(rf"does (not|n't) ferment {sugar_key}\b", text_lc):
             _set_if_stronger(parsed, field, "Negative")
-    # 3) "non-lactose fermenter"
     for sugar_key, field in SUGAR_FIELDS.items():
-        if re.search(rf"non[- ]{sugar_key} ferment(ing|er)?", text_lc):
             _set_if_stronger(parsed, field, "Negative")
-    # 4) "X fermentation positive/negative"
     for sugar_key, field in SUGAR_FIELDS.items():
-        # "glucose fermentation positive"
         m1 = re.search(
-            rf"{sugar_key}\s+fermentation[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
             text_lc,
         )
         if m1:
@@ -628,9 +535,9 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
                 _set_if_stronger(parsed, field, val)
                 continue
-        # "positive for glucose fermentation"
         m2 = re.search(
-            rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{sugar_key}\s+fermentation",
             text_lc,
         )
         if m2:
@@ -639,10 +546,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
                 _set_if_stronger(parsed, field, val)
                 continue
-    # 5) Global non-fermenter patterns
     if (
-        re.search(r"does (not|n't) ferment (carbohydrates|sugars)", text_lc)
-        or re.search(r"non[- ]ferment(er|ing|ative)", text_lc)
     ):
         for field in SUGAR_FIELDS.values():
             if field not in parsed or parsed[field] == UNKNOWN:
@@ -650,18 +569,11 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
-# Colony morphology (coarse, optional)
 # ------------------------------------------------------------
 def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
-    """
-    Very coarse mapping for colony morphology. We try:
-      - "colonies are yellow, mucoid"
-      - "colonies dry, white and irregular on nutrient agar"
-      - "forming green colonies", "forms mucoid colonies"
-    """
-    # Pattern 1: "colonies are ..."
-    m = re.search(r"colon(y|ies) (are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
     if m:
         desc = m.group(3).strip()
         if desc:
@@ -672,7 +584,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
                 _set_if_stronger(parsed, "Colony Morphology", pretty)
                 return
-    # Pattern 2: "colonies dry, white and irregular on nutrient agar"
     m2 = re.search(
         r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
         text_lc,
@@ -687,7 +598,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
                 _set_if_stronger(parsed, "Colony Morphology", pretty)
                 return
-    # Pattern 3: "forming green colonies", "forms mucoid colonies"
     m3 = re.search(
         r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
         text_lc,
@@ -707,9 +617,6 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
 # ------------------------------------------------------------
 def parse_text_rules(text: str) -> Dict[str, Any]:
-    """
-    Main entry point.
-    """
     original = text or ""
     text_clean = _clean_text(original)
     text_lc = text_clean.lower()
@@ -734,7 +641,6 @@ def parse_text_rules(text: str) -> Dict[str, Any]:
         }
     except Exception as e:
-        # Fail-safe: never crash the app, just report an error
         return {
             "parsed_fields": parsed,
             "source": "rule_parser",

 # ------------------------------------------------------------
 # Rule-based core parser for microbiology descriptions.
 #
+# Stage 11F (Option A ranges + fixes) + 11H + 11I:
+#
 # - Always store Growth Temperature as "low//high"
 #   • single: 37 → "37//37"
 #   • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
 # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
 # - NaCl tolerance phrases improved
 # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
+#
+# New in this version:
 #   • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
 #   • "<sugar> fermenter" → <Sugar> Fermentation = Positive
+#   • "<sugar> is positive/negative" handled
+#   • "<sugar> fermentation is positive/negative" handled
+#   • Grouped "does not ferment lactose and sucrose" handled
+#   • Global non-fermenter + explicit positive sugar:
+#       "Non-fermenter, ferments glucose weakly"
+#       → all sugars Negative *except* Glucose = Positive
+#   • Core tests accept "is positive/is negative/is variable"
+#   • "H2S production is positive/negative" handled
+#   • ONPG phrases like "ONPG is negative" now parsed
 # ------------------------------------------------------------
 from __future__ import annotations
 # Core fields and sugar mapping
 # ------------------------------------------------------------
 SUGAR_FIELDS: Dict[str, str] = {
     "glucose": "Glucose Fermentation",
     "lactose": "Lactose Fermentation",
 }
 CORE_BOOL_FIELDS: Dict[str, List[str]] = {
     "Catalase": ["catalase"],
     "Oxidase": ["oxidase"],
     "Indole": ["indole"],
     "Urease": ["urease"],
     "Citrate": ["citrate"],
     "Methyl Red": ["methyl red", "mr test", "mr"],
     "VP": ["voges-proskauer", "vp test", "vp"],
     "H2S": ["h2s", "hydrogen sulfide"],
     "DNase": [
+        "dnase", "dnase test", "dnase activity",
+        "dnase production", "dnaase", "dna hydrolysis",
     ],
     "ONPG": ["onpg"],
     "Coagulase": ["coagulase"],
     "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
     "Arginine dihydrolase": ["arginine dihydrolase"],
     "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
     "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
 }
 # ------------------------------------------------------------
 def _clean_text(text: str) -> str:
     if not text:
         return ""
     s = text.replace("°", "").replace("º", "")
     s = s.replace("₂", "2")
     return " ".join(s.split())
 def _norm(s: str) -> str:
     return s.strip().lower()
 def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
     """Store *value* under *field* unless a concrete value is already there.

     The write happens only when *value* is truthy and *field* is either
     absent from *parsed* or currently holds the module-level ``UNKNOWN``
     placeholder. An existing concrete value is never overwritten, so the
     first rule to set a field wins. *parsed* is mutated in place.
     """
     if not value:
         return
     # A missing key is treated the same as the UNKNOWN placeholder.
     current = parsed.get(field, UNKNOWN)
     if current == UNKNOWN:
         parsed[field] = value
+def _value_from_pnv_token(token: str) -> str | None:
+    seg = _norm(token)
     if seg in ["positive", "pos", "+"]:
         return "Positive"
     if seg in ["negative", "neg", "-"]:
         return "Variable"
     return None
+def _value_from_pnv_context(segment: str) -> str | None:
+    seg = _norm(segment)
+    val = _value_from_pnv_token(seg)
+    if val:
+        return val
+    m = re.search(r"\bis\s+(positive|negative|variable|pos|neg|\+|\-)\b", seg)
+    if m:
+        return _value_from_pnv_token(m.group(1))
+    return None
 # ------------------------------------------------------------
 # Gram stain and shape
 # ------------------------------------------------------------
 def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
     if "gram-positive" in text_lc or "gram positive" in text_lc:
         _set_if_stronger(parsed, "Gram Stain", "Positive")
     elif "gram-negative" in text_lc or "gram negative" in text_lc:
     elif "gram variable" in text_lc:
         _set_if_stronger(parsed, "Gram Stain", "Variable")
     if "short rods" in text_lc:
         _set_if_stronger(parsed, "Shape", "Short Rods")
     if re.search(r"\bcocci\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
     if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
         _set_if_stronger(parsed, "Shape", "Cocci")
     if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
         _set_if_stronger(parsed, "Shape", "Rods")
     if "spiral" in text_lc or "spirochete" in text_lc:
         _set_if_stronger(parsed, "Shape", "Spiral")
 # ------------------------------------------------------------
 # Haemolysis
 # ------------------------------------------------------------
 def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
     if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Beta")
         _set_if_stronger(parsed, "Haemolysis", "Positive")
     if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
         _set_if_stronger(parsed, "Haemolysis", "Positive")
     if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
         _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
         _set_if_stronger(parsed, "Haemolysis", "Negative")
     if (
         "non-haemolytic" in text_lc
         or "non hemolytic" in text_lc
         _set_if_stronger(parsed, "Haemolysis Type", "None")
         _set_if_stronger(parsed, "Haemolysis", "Negative")
     if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
         _set_if_stronger(parsed, "Haemolysis Type", "Variable")
         _set_if_stronger(parsed, "Haemolysis", "Variable")
 # ------------------------------------------------------------
+# Core enzyme test parsing
 # ------------------------------------------------------------
 def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
     for field, keywords in CORE_BOOL_FIELDS.items():
         for kw in keywords:
             m1 = re.search(
+                rf"{re.escape(kw)}[ \-]?"
+                r"(positive|negative|variable|pos|neg|\+|\-)",
                 text_lc,
             )
             if m1:
                     _set_if_stronger(parsed, field, val)
                     break
             m2 = re.search(
+                rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
+                rf"(for\s+)?{re.escape(kw)}",
                 text_lc,
             )
             if m2:
                     _set_if_stronger(parsed, field, val)
                     break
+            m3 = re.search(
+                rf"{re.escape(kw)}\s+is\s+"
+                r"(positive|negative|variable|pos|neg|\+|\-)",
+                text_lc,
+            )
+            if m3:
+                val = _value_from_pnv_token(m3.group(1))
+                if val:
+                    _set_if_stronger(parsed, field, val)
+                    break
+    # --- NaCl tolerance explicit patterns ---
         if field == "NaCl Tolerant (>=6%)":
             for m in re.finditer(
                 r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
                 except Exception:
                     pass
             for m in re.finditer(
                 r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
                 text_lc,
                 except Exception:
                     pass
             if re.search(
                 r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
             ):
                 _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Negative")
             for m in re.finditer(
                 r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
                 text_lc,
                 except Exception:
                     pass
+    # Nitrate
     if re.search(r"reduces nitrate", text_lc):
         _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
     if re.search(r"does (not|n't) reduce nitrate", text_lc):
         _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
+    # H2S
     if re.search(r"(produces|production of)\s+h2s", text_lc):
         _set_if_stronger(parsed, "H2S", "Positive")
+    if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
+        _set_if_stronger(parsed, "H2S", "Positive")
+    if re.search(r"h2s production\s+is\s+(negative|neg|\-)", text_lc):
+        _set_if_stronger(parsed, "H2S", "Negative")
     if (
         re.search(r"does (not|n't) produce\s+h2s", text_lc)
         or re.search(r"no h2s production", text_lc)
     ):
         _set_if_stronger(parsed, "H2S", "Negative")
+    # DNase
     if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Positive")
     if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Positive")
     if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
     if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
     if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
         _set_if_stronger(parsed, "DNase", "Negative")
+    # NEW: Gelatinase mapping
     if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
         _set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
     if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
         _set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
 # ------------------------------------------------------------
 def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
     if (
         re.search(r"\bmotile\b", text_lc)
         and not re.search(r"\bnon[- ]?motile\b", text_lc)
     ):
         _set_if_stronger(parsed, "Motility", "Negative")
     if (
         "tumbling motility" in text_lc
         or "swarming motility" in text_lc
     ):
         _set_if_stronger(parsed, "Motility", "Positive")
     if (
         "capsulated" in text_lc
         or "encapsulated" in text_lc
     ):
         _set_if_stronger(parsed, "Capsule", "Negative")
     if (
         re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
         or "no spores" in text_lc
     ):
         _set_if_stronger(parsed, "Spore Formation", "Negative")
+        return
     if (
         re.search(r"\bspore[-\s]?forming\b", text_lc)
         or "forms spores" in text_lc
 # ------------------------------------------------------------
 def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
     """Record the oxygen requirement implied by a description.

     Three independent checks run in order (facultative anaerobe, anaerobic,
     aerobic); every check that fires hands its value to
     ``_set_if_stronger`` under the "Oxygen Requirement" key.

     Args:
         text_lc: Description text. All patterns here are written in
             lowercase, so the text is expected to be lowercased already.
         parsed: Field -> value mapping passed through to
             ``_set_if_stronger``.
     """
     mentions_facultative = "facultative" in text_lc
     has_aerobically = re.search(r"\baerobically\b", text_lc) is not None
     has_anaerobically = re.search(r"\banaerobically\b", text_lc) is not None

     if re.search(r"facultative(ly)? anaerob", text_lc):
         _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")

     if (
         re.search(r"\bobligate anaerob", text_lc)
         or (re.search(r"\banaerobic\b", text_lc) and not mentions_facultative)
         # r"\banaerobic\b" cannot match "anaerobically" (no word boundary
         # after the "c"), so the adverb needs its own test.  It is skipped
         # when "aerobically" is also present, mirroring the guard on the
         # aerobic branch below: text naming both stays undecided.
         or (
             has_anaerobically
             and not has_aerobically
             and not mentions_facultative
         )
     ):
         _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")

     if (
         re.search(r"\bobligate aerobe\b", text_lc)
         # The substring guards keep a description that also mentions an
         # anaerobic form from being labelled Aerobic.
         or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
         or (has_aerobically and "anaerobically" not in text_lc)
     ):
         _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
 # ------------------------------------------------------------
+# Growth Temperature
 # ------------------------------------------------------------
 def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
     range_pattern = re.compile(
         r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
     )
         _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
         return
     temps = re.findall(r"(\d+)\s*(?:c|°c|degrees c|degrees celsius)", text_lc)
     if len(temps) >= 2:
         nums = [int(t) for t in temps]
         _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
         return
     single_pattern = re.compile(
+        r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*"
+        r"(?:c|°c|degrees c|degrees celsius)"
     )
     m_single = single_pattern.search(text_lc)
     if m_single:
         _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
         return
     m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
     if m_simple_num:
         temp = m_simple_num.group(1)
         _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
         return
     m_plain = re.search(
         r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
         text_lc,
 # ------------------------------------------------------------
+# Media Grown On
 # ------------------------------------------------------------
 # Culture-medium lookup table: each key is the display name of a medium and
 # each value lists lowercase spellings/abbreviations associated with it.
 # _parse_media iterates this mapping as (media_name, patterns) pairs.
 MEDIA_KEYWORDS = {
+    "Blood Agar": ["blood agar", "blood-agar"],
+    "MacConkey Agar": ["macconkey agar", "mac conkey agar", "macconkey"],
+    "Chocolate Agar": ["chocolate agar", "chocolate-agar"],
+    "Nutrient Agar": ["nutrient agar", "nutrient-agar"],
+    "XLD Agar": ["xld agar"],
+    "TCBS Agar": ["tcbs agar", "tcbs"],
+    "ALOA": ["aloa agar", "aloa"],
+    "BCYE Agar": ["bcye agar", "bcye"],
+    "MRS Agar": ["mrs agar"],
 }
 def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
     found_media: List[str] = []
     for media_name, patterns in MEDIA_KEYWORDS.items():
 # ------------------------------------------------------------
+# Sugar fermentation
 # ------------------------------------------------------------
 def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
     for sugar_key, field in SUGAR_FIELDS.items():
         m_simple = re.search(
+            rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc
         )
         if m_simple:
             val = _value_from_pnv_context(m_simple.group(1))
             if val:
                 _set_if_stronger(parsed, field, val)
+        m_is = re.search(
+            rf"{sugar_key}\s+is\s+(positive|negative|variable|pos|neg|\+|\-)",
+            text_lc,
+        )
+        if m_is:
+            val = _value_from_pnv_token(m_is.group(1))
+            if val:
+                _set_if_stronger(parsed, field, val)
     for sugar_key, field in SUGAR_FIELDS.items():
         if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
             rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
         ):
             _set_if_stronger(parsed, field, "Positive")
+        if re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc):
+            _set_if_stronger(parsed, field, "Negative")
     ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
     for m in ferments_pattern.finditer(text_lc):
         seg = m.group(1)
         neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
         pos_part = neg_split[0]
         neg_part = neg_split[1] if len(neg_split) > 1 else ""
         for sugar_key, field in SUGAR_FIELDS.items():
             if re.search(rf"\b{sugar_key}\b", pos_part):
                 _set_if_stronger(parsed, field, "Positive")
         for sugar_key, field in SUGAR_FIELDS.items():
             if re.search(rf"\b{sugar_key}\b", neg_part):
                 _set_if_stronger(parsed, field, "Negative")
+    grouped_neg_pattern = re.compile(
+        r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+)"
+    )
+    for m in grouped_neg_pattern.finditer(text_lc):
+        seg = m.group(1)
+        for sugar_key, field in SUGAR_FIELDS.items():
+            if re.search(rf"\b{sugar_key}\b", seg):
+                _set_if_stronger(parsed, field, "Negative")
     for sugar_key, field in SUGAR_FIELDS.items():
+        if re.search(
+            rf"does\s+(?:not|n't)\s+ferment\s+{sugar_key}\b", text_lc
+        ):
             _set_if_stronger(parsed, field, "Negative")
     for sugar_key, field in SUGAR_FIELDS.items():
+        if re.search(
+            rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc
+        ):
             _set_if_stronger(parsed, field, "Negative")
     for sugar_key, field in SUGAR_FIELDS.items():
         m1 = re.search(
+            rf"{sugar_key}\s+fermentation[ \-]?"
+            r"(positive|negative|variable|pos|neg|\+|\-)",
             text_lc,
         )
         if m1:
                 _set_if_stronger(parsed, field, val)
                 continue
         m2 = re.search(
+            rf"(positive|negative|variable|pos|neg|\+|\-)\s+"
+            rf"(for\s+)?{sugar_key}\s+fermentation",
             text_lc,
         )
         if m2:
                 _set_if_stronger(parsed, field, val)
                 continue
+        m3 = re.search(
+            rf"{sugar_key}\s+fermentation\s+is\s+"
+            r"(positive|negative|variable|pos|neg|\+|\-)",
+            text_lc,
+        )
+        if m3:
+            val = _value_from_pnv_token(m3.group(1))
+            if val:
+                _set_if_stronger(parsed, field, val)
+                continue
     if (
+        re.search(
+            r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
+        )
+        or re.search(r"\bnon[- ]ferment(er|ing|ative)\b", text_lc)
     ):
         for field in SUGAR_FIELDS.values():
             if field not in parsed or parsed[field] == UNKNOWN:
 # ------------------------------------------------------------
+# Colony morphology
 # ------------------------------------------------------------
 def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
+    m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
     if m:
         desc = m.group(3).strip()
         if desc:
                 _set_if_stronger(parsed, "Colony Morphology", pretty)
                 return
     m2 = re.search(
         r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
         text_lc,
                 _set_if_stronger(parsed, "Colony Morphology", pretty)
                 return
     m3 = re.search(
         r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
         text_lc,
 # ------------------------------------------------------------
 def parse_text_rules(text: str) -> Dict[str, Any]:
     original = text or ""
     text_clean = _clean_text(original)
     text_lc = text_clean.lower()
         }
     except Exception as e:
         return {
             "parsed_fields": parsed,
             "source": "rule_parser",