# Source: BactKing / engine/parser_rules.py
# Hosting-page residue kept for provenance: "EphAsad's picture",
# "Update engine/parser_rules.py", commit 0f6879b (verified).
# engine/parser_rules.py
# ------------------------------------------------------------
# Rule-based core parser for microbiology descriptions.
#
# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11L + 11M
# + NaCl + haemolysis symbol support + colony morphology tweaks.
#
# - Always store Growth Temperature as "low//high"
# • single: 37 → "37//37"
# • any two temps in text: min//max
# • ranges like "30–37 °C", "grows between 30 and 37 °C" → "30//37"
#
# - DNase robust parsing (DNase test / activity / production)
# - Non–spore-forming → Spore Formation = Negative (with early return)
# - "non-H2S producing" → H2S = Negative
# - Aerobic / Anaerobic including “aerobically / anaerobically”
#
# - NaCl tolerance phrases improved (>= 6% rule)
# • explicit positives require a growth/tolerance verb + % ≥ 6
# • explicit negatives ("no growth in NaCl", "does not grow in 7% NaCl",
# "NaCl sensitive", "not NaCl tolerant") override positives
# • ambiguous "in 6.5% NaCl" alone no longer auto-Positive
#
# - Colony morphology extraction, including:
# • "colonies are yellow, mucoid"
# • "colonies dry, white and irregular on nutrient agar"
# • "forming smooth, yellow-pigmented, opaque colonies"
# • "grey colonies", "large grey colonies" etc.
#
# - Sugars:
# • "<sugar> positive/negative"
# • "<sugar> is positive/negative"
# • "<sugar> fermenter" / "non-<sugar> fermenter"
# • "ferments X, Y but not Z"
# • grouped "does not ferment lactose and sucrose"
# (without nuking glucose in "but glucose positive")
# • global "non-fermenter" → all sugars Negative (Unknown-only)
# • "asaccharolytic" → all sugars Negative (Unknown-only)
# • "all other sugars negative" → all remaining sugars Negative
# (Unknown-only; no hard rewrite)
#
# - Core tests:
# • "<kw> positive/negative"
# • "positive for <kw>"
# • "<kw> is positive/negative"
# • "<kw> reaction is positive/negative"
# • "<kw> reaction positive/negative"
# • "<kw> test reaction is positive/negative"
# • "ONPG is negative" handled via core patterns
# • "H2S production is positive/negative"
# • "MR and VP negative/positive" → both set
# • grouped phrases like
# "gelatin and esculin hydrolysis negative"
# "lysine, ornithine and arginine negative"
# → all mentioned tests / sugars set to the given value
#
# - Decarboxylases:
# • "all decarboxylases negative/positive"
# → Lysine / Ornithine / Arginine dihydrolase set accordingly
# (Unknown-only; explicit values can override later)
#
# - Capsule / Motility:
# • "capsule present"/"capsule is present" → Capsule Positive
# • "capsule absent"/"capsule is absent"/"no capsule" → Capsule Negative
# • "encapsulated" / "capsulated" → Capsule Positive
# • "gliding/spreading/swarming motility" → Motility Positive
#
# - Gelatin / Esculin:
# • "gelatin positive/negative" → Gelatin Hydrolysis
# • "esculin positive/negative" → Esculin Hydrolysis
#
# - Shape:
# • "coccobacilli / coccobacillus" → Shape = Short Rods
# • (no 4F shape descriptor explosion; we keep existing logic)
#
# - Haemolysis:
# • alpha/beta/gamma haemolysis & haemolytic
# • now also supports α / β / γ symbols via normalisation
# ------------------------------------------------------------
from __future__ import annotations
import re
from typing import Dict, Any, List
# Placeholder value for a field without a concrete result; _set_if_stronger
# treats it as replaceable.
UNKNOWN = "Unknown"
# ------------------------------------------------------------
# Core fields and sugar mapping
# ------------------------------------------------------------
# Sugar name (lower-case, as it is searched for in the description text)
# → core DB column that receives the Positive/Negative/Variable result.
SUGAR_FIELDS: Dict[str, str] = {
    "glucose": "Glucose Fermentation",
    "lactose": "Lactose Fermentation",
    "sucrose": "Sucrose Fermentation",
    "maltose": "Maltose Fermentation",
    "mannitol": "Mannitol Fermentation",
    "sorbitol": "Sorbitol Fermentation",
    "xylose": "Xylose Fermentation",
    "rhamnose": "Rhamnose Fermentation",
    "arabinose": "Arabinose Fermentation",
    "raffinose": "Raffinose Fermentation",
    "trehalose": "Trehalose Fermentation",
    "inositol": "Inositol Fermentation",
}
# Result field → keywords that identify the test in the description text.
# _parse_core_bool_tests walks each keyword list in order and stops at the
# first keyword that yields a value, so list order is significant.
CORE_BOOL_FIELDS: Dict[str, List[str]] = {
    # field: [keywords to recognise the test name]
    "Catalase": ["catalase"],
    "Oxidase": ["oxidase"],
    "Indole": ["indole"],
    "Urease": ["urease"],
    "Citrate": ["citrate"],
    # MR: include "mr"
    "Methyl Red": ["methyl red", "mr test", "mr"],
    "VP": ["voges-proskauer", "vp test", "vp"],
    # H2S (includes H₂S → normalised to H2S in _clean_text)
    "H2S": ["h2s", "hydrogen sulfide"],
    # DNase: broaden patterns
    "DNase": [
        "dnase",
        "dnase test",
        "dnase activity",
        "dnase production",
        "dnaase",
        "dna hydrolysis",
    ],
    "ONPG": ["onpg"],
    "Coagulase": ["coagulase"],
    "Lipase Test": ["lipase"],
    "Nitrate Reduction": ["nitrate reduction", "nitrate"],
    "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
    # Decarboxylases (also match plain amino acid words)
    "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
    # NOTE(review): the key is spelled "Ornitihine"; the same spelling is used
    # when this field is written in _parse_core_bool_tests. Presumably it
    # mirrors an external column name — confirm before renaming.
    "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
    "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
    # Gelatin / Esculin
    "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
    "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
}
# ------------------------------------------------------------
# Generic helpers
# ------------------------------------------------------------
def _clean_text(text: str) -> str:
"""
Normalise unicode oddities and collapse whitespace.
Also:
- strip degree symbols
- normalise subscript ₂ → 2 for H₂S
- normalise α/β/γ to alpha/beta/gamma for haemolysis patterns
"""
if not text:
return ""
s = text.replace("°", "").replace("º", "")
# normalise subscript 2 (H₂S → H2S)
s = s.replace("₂", "2")
# Greek letters for haemolysis and related descriptors
s = (
s.replace("α", "alpha")
.replace("β", "beta")
.replace("γ", "gamma")
)
# collapse whitespace
return " ".join(s.split())
def _norm(s: str) -> str:
return s.strip().lower()
def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
"""
Write value to parsed[field] if:
- field not present, or
- we are replacing Unknown with a concrete value
"""
if not value:
return
if field not in parsed or parsed[field] == UNKNOWN:
parsed[field] = value
def _value_from_pnv_token(token: str) -> str | None:
"""
Map a simple token to Positive / Negative / Variable.
"""
seg = _norm(token)
if seg in ["positive", "pos", "+"]:
return "Positive"
if seg in ["negative", "neg", "-"]:
return "Negative"
if seg in ["variable", "var", "v"]:
return "Variable"
return None
def _value_from_pnv_context(segment: str) -> str | None:
    """
    Interpret a short phrase as Positive / Negative / Variable.

    Handles:
    - a bare token: "positive", "neg", "+", ...
    - "... is <token>", e.g. "catalase is positive", "oxidase is -"
    - the "+ve" / "-ve" shorthand after "is"

    Returns None when no value can be read from the phrase.
    """
    seg = _norm(segment)

    # The whole phrase may already be a token.
    val = _value_from_pnv_token(seg)
    if val:
        return val

    # "... is positive". The token must not run into further word characters.
    # A trailing \b cannot be used here: after "+" or "-" it only matches when
    # a word character follows, which rejects a plain "is +" / "is -".
    m = re.search(
        r"\bis\s+(positive|negative|variable|pos|neg|\+|\-)(?:ve)?(?!\w)",
        seg,
    )
    if m:
        return _value_from_pnv_token(m.group(1))
    return None
# ------------------------------------------------------------
# Gram stain and shape
# ------------------------------------------------------------
def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Gram Stain" and "Shape" from a lower-cased description.

    Both fields are written through _set_if_stronger, so the first rule that
    fires wins and an existing concrete value is never replaced.
    """
    # Gram stain: hyphenated and spaced spellings are accepted for all three
    # outcomes; only the first matching outcome is recorded.
    if "gram-positive" in text_lc or "gram positive" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Positive")
    elif "gram-negative" in text_lc or "gram negative" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Negative")
    elif "gram-variable" in text_lc or "gram variable" in text_lc:
        _set_if_stronger(parsed, "Gram Stain", "Variable")

    # Shape: the most specific descriptions are tested first so that
    # "short rods" / coccobacilli are not downgraded to generic "Rods".
    if "short rods" in text_lc:
        _set_if_stronger(parsed, "Shape", "Short Rods")
    if re.search(r"\bcoccobacill(?:us|i)\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Short Rods")

    # Cocci and variants (diplococci, tetracocci, etc.)
    if re.search(r"\bcocci\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Cocci")
    if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
        _set_if_stronger(parsed, "Shape", "Cocci")

    # Rods / bacilli ("bacilli" also occurs inside "coccobacilli", which has
    # already been recorded above and therefore is not overwritten).
    if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
        _set_if_stronger(parsed, "Shape", "Rods")

    # Spiral
    if "spiral" in text_lc or "spirochete" in text_lc:
        _set_if_stronger(parsed, "Shape", "Spiral")
# ------------------------------------------------------------
# Haemolysis
# ------------------------------------------------------------
def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Haemolysis Type" and "Haemolysis" from a lower-cased description.

    Recognised phrasing:
    - beta- / alpha- / gamma- followed by haemolytic, hemolytic, haemolysis
      or hemolysis (hyphen, space or nothing in between)
    - non-haemolytic in British or American spelling, written with a hyphen,
      a space, or as one word
    - "variable haemolysis" / "variable hemolysis"

    Rules are applied in the order Beta, Alpha, Gamma, None, Variable and
    written through _set_if_stronger, so the first one that fires wins.
    """
    # (prefix in text, Haemolysis Type, Haemolysis)
    typed_rules = (
        ("beta", "Beta", "Positive"),
        ("alpha", "Alpha", "Positive"),
        ("gamma", "Gamma", "Negative"),
    )
    for prefix, kind, outcome in typed_rules:
        if re.search(
            rf"{prefix}[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc
        ):
            _set_if_stronger(parsed, "Haemolysis Type", kind)
            _set_if_stronger(parsed, "Haemolysis", outcome)

    # Non-haemolytic: one regex covers "non-haemolytic", "non haemolytic",
    # "nonhaemolytic" and the same three forms with the "hemolytic" spelling.
    if re.search(r"non[- ]?(?:haemolytic|hemolytic)", text_lc):
        _set_if_stronger(parsed, "Haemolysis Type", "None")
        _set_if_stronger(parsed, "Haemolysis", "Negative")

    # Variable phrasing
    if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
        _set_if_stronger(parsed, "Haemolysis Type", "Variable")
        _set_if_stronger(parsed, "Haemolysis", "Variable")
# ------------------------------------------------------------
# Core enzyme / boolean tests
# ------------------------------------------------------------
def _generic_pnv_for_keyword(kw: str, text_lc: str) -> str | None:
    """Return the P/N/V value stated for test keyword *kw* in *text_lc*, or None."""
    k = re.escape(kw)
    # A bare "-" only counts as Negative when it is not gluing the keyword to
    # another word: "catalase -", "catalase -ve" and "catalase-" qualify,
    # "urease-producing" or "nitrate-reducing" do not.
    pnv = r"(positive|negative|variable|pos|neg|\+|\-(?=ve\b|[^a-z]|$))"
    # Tried in this order; the first template that matches decides the value.
    templates = (
        rf"{k}[ \-]?{pnv}",                        # "catalase positive"
        rf"{pnv}\s+(for\s+)?{k}",                  # "positive for catalase"
        rf"{k}\s+is\s+{pnv}",                      # "catalase is positive"
        rf"{k}\s+reaction\s+is\s+{pnv}",           # "indole reaction is negative"
        rf"{k}\s+reaction\s+{pnv}",                # "indole reaction negative"
        rf"{k}\s+test\s+reaction\s+is\s+{pnv}",    # "indole test reaction is positive"
    )
    for template in templates:
        m = re.search(template, text_lc)
        if m:
            return _value_from_pnv_token(m.group(1))
    return None


def _parse_nacl_tolerance(text_lc: str, parsed: Dict[str, str]) -> None:
    """Decide "NaCl Tolerant (>=6%)" once per description; Negative beats Positive."""
    # Any of these phrasings counts as explicit evidence of intolerance.
    negative_patterns = (
        # "does not grow in 7% NaCl", "doesn't grow at 10% NaCl"
        r"does\s*(?:not|n['’]t)\s+grow\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
        # "no growth in 6.5% NaCl", "no growth at 8% NaCl"
        r"no\s+growth\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
        # "no growth in NaCl" (no explicit %)
        r"no\s+growth\s+in\s+nacl",
        # "unable to grow in 7% NaCl", "unable to grow in NaCl"
        r"unable\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl",
        # semantic negatives without explicit %
        r"cannot\s+tolerate\s+nacl",
        r"not\s+nacl\s+tolerant",
        r"nacl\s+sensitive",
        r"fails\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl",
        r"intolerant\s+to\s+nacl",
        r"no\s+tolerance\s+to\s+nacl",
        r"nacl\s+intolerance",
        r"no\s+growth\s+at\s+high\s+nacl",
    )
    has_negative = any(re.search(p, text_lc) for p in negative_patterns)

    # Positive evidence needs a growth/tolerance verb plus a concentration of
    # at least 6 %. The captured text is always a plain decimal number, so
    # float() cannot fail here.
    has_positive = any(
        float(m.group(2)) >= 6.0
        for m in re.finditer(
            r"(grows|growth occurs|growth observed|able to grow|tolerates|tolerant)\s+"
            r"(?:in|at|up to|to)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
            text_lc,
        )
    ) or any(
        # e.g. "NaCl tolerant up to 10%", "NaCl tolerant to 8%"
        float(m.group(1)) >= 6.0
        for m in re.finditer(
            r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
            text_lc,
        )
    )

    if has_negative:
        # Explicit negatives overwrite whatever was recorded before.
        parsed["NaCl Tolerant (>=6%)"] = "Negative"
    elif has_positive:
        _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Positive")


def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill the Positive/Negative/Variable test fields from a lower-cased text.

    For each test in CORE_BOOL_FIELDS, look for patterns like:
    - "catalase positive"
    - "positive for catalase"
    - "catalase is positive"
    - "indole reaction is negative"
    - "indole reaction negative"
    - "indole test reaction is positive"
    Plus:
    - NaCl tolerance with % values (explicit negatives override)
    - Nitrate reduction text
    - H2S production / non-production
    - DNase coverage
    - grouped MR/VP: "MR and VP negative"
    - decarboxylase global phrases
    - generic grouped phrases
      "gelatin and esculin hydrolysis negative"
      "lysine, ornithine and arginine negative"

    A hyphen that joins a test name to another word ("urease-producing",
    "nitrate-reducing") is not read as a Negative result.

    All writes except the NaCl negative override go through
    _set_if_stronger, so earlier rules take precedence over later ones.
    """
    # --- Generic "<kw> <value>" phrasing ---
    # Keywords are tried in list order; the first one that yields a value
    # settles the field.
    for field, keywords in CORE_BOOL_FIELDS.items():
        for kw in keywords:
            val = _generic_pnv_for_keyword(kw, text_lc)
            if val:
                _set_if_stronger(parsed, field, val)
                break

    # --- NaCl tolerance with explicit percentages ---
    _parse_nacl_tolerance(text_lc, parsed)

    # --- Nitrate: "reduces nitrate" / "does not reduce nitrate" ---
    if re.search(r"reduces nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
    if re.search(r"does\s*(?:not|n['’]t) reduce nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")

    # --- H2S: "produces H2S", "H2S production is positive" ---
    if re.search(r"(produces|production of)\s+h2s", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(negative|neg|\-)", text_lc):
        _set_if_stronger(parsed, "H2S", "Negative")
    if (
        re.search(r"does\s*(?:not|n['’]t) produce\s+h2s", text_lc)
        or re.search(r"no h2s production", text_lc)
        or re.search(r"non[- ]h2s producing", text_lc)
    ):
        _set_if_stronger(parsed, "H2S", "Negative")

    # --- DNase universal coverage ---
    # Positive forms
    if re.search(r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")
    if re.search(r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")
    # Negative forms; the "-" guard keeps "dnase-producing" from counting.
    if re.search(
        r"\bdnase(\s+test|\s+activity|\s+production)?\s*"
        r"(?:(?:negative|neg)\b|\-(?=ve\b|[^a-z]|$))",
        text_lc,
    ):
        _set_if_stronger(parsed, "DNase", "Negative")
    if re.search(r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")
    # non-DNase-producing
    if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")

    # --- MR and VP grouped: "MR and VP negative" ---
    mr_vp_pattern = re.compile(
        r"\b("
        r"mr(?: test)?|methyl red|"
        r"vp(?: test)?|voges-proskauer"
        r")\s*(?:test)?\s*(?:and|&)\s*("
        r"mr(?: test)?|methyl red|"
        r"vp(?: test)?|voges-proskauer"
        r")\s+"
        r"(positive|negative|variable|pos|neg|\+|\-)"
    )

    def _assign_mr_vp(name: str, value: str) -> None:
        # Map either spelling of the test name onto its field.
        n = name.lower()
        if "mr" in n or "methyl red" in n:
            _set_if_stronger(parsed, "Methyl Red", value)
        if "vp" in n or "voges" in n:
            _set_if_stronger(parsed, "VP", value)

    for m in mr_vp_pattern.finditer(text_lc):
        val = _value_from_pnv_token(m.group(3))
        if not val:
            continue
        _assign_mr_vp(m.group(1), val)
        _assign_mr_vp(m.group(2), val)

    # --- Decarboxylases global "all decarboxylases negative/positive" ---
    m_all_decarb = re.search(
        r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_all_decarb:
        val = _value_from_pnv_token(m_all_decarb.group(1))
        if val:
            for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
                _set_if_stronger(parsed, f, val)

    # --- Generic grouped list logic for tests & sugars ---
    #
    # Handles things like:
    #   "gelatin and esculin hydrolysis negative"
    #   "lysine, ornithine and arginine negative"
    #   "indole, urease and citrate positive"
    #   "raffinose and inositol negative"
    #
    grouped_tests_pattern = re.compile(
        r"([a-z0-9 ,/&\-]+?)\s+"
        r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
        r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
    )
    for m in grouped_tests_pattern.finditer(text_lc):
        val = _value_from_pnv_token(m.group(2))
        if not val:
            continue
        seg_lc = m.group(1).lower()

        # Every core test named in the segment receives the value ...
        for field, keywords in CORE_BOOL_FIELDS.items():
            if any(re.search(rf"\b{re.escape(kw)}\b", seg_lc) for kw in keywords):
                _set_if_stronger(parsed, field, val)

        # ... and so does every sugar named in it. Segments that mention
        # neither simply fall through without writing anything.
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", seg_lc):
                _set_if_stronger(parsed, field, val)
# ------------------------------------------------------------
# Motility / Capsule / Spores
# ------------------------------------------------------------
def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Motility", "Capsule" and "Spore Formation" from a lower-cased text.

    Values are written through _set_if_stronger. For capsule and spores the
    negative phrasing is evaluated before the positive one, because the
    positive words ("capsulated", "spore-forming") are contained in their
    negated forms ("non-capsulated", "non-spore-forming").
    """
    # --- Motility ---
    if (
        re.search(r"\bmotile\b", text_lc)
        and not re.search(r"\bnon[- ]?motile\b", text_lc)
        and "nonmotile" not in text_lc
        and "immotile" not in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Positive")
    if (
        "non-motile" in text_lc
        or "non motile" in text_lc
        or "nonmotile" in text_lc
        or "immotile" in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Negative")

    # Specific motility phrases: tumbling, swarming, corkscrew, gliding, spreading
    if (
        "tumbling motility" in text_lc
        or "swarming motility" in text_lc
        or "corkscrew motility" in text_lc
        or re.search(r"\b(gliding|spreading)\s+motility\b", text_lc)
        or ("swarming" in text_lc and "non-swarming" not in text_lc)
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    # --- Capsule (negative first) ---
    capsule_negative = (
        # non-capsulated / non capsulated / noncapsulated / non-encapsulated /
        # unencapsulated / uncapsulated
        re.search(r"\b(?:non[- ]?|un)(?:en)?capsulated\b", text_lc)
        or "no capsule" in text_lc
        or "capsule absent" in text_lc
        or re.search(r"capsule\s+is\s+absent", text_lc)
        # a bare "-" must not be the joiner of a compound ("capsule-forming")
        or re.search(r"capsule[ \-]?(negative|neg|\-(?=ve\b|[^a-z]|$))", text_lc)
    )
    capsule_positive = (
        "capsulated" in text_lc
        or "encapsulated" in text_lc
        or "capsule present" in text_lc
        or re.search(r"capsule\s+is\s+present", text_lc)
        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
    )
    if capsule_negative:
        _set_if_stronger(parsed, "Capsule", "Negative")
    elif capsule_positive:
        _set_if_stronger(parsed, "Capsule", "Positive")

    # --- Spore formation (negative first, positive only otherwise) ---
    if (
        re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
        or "no spores" in text_lc
    ):
        _set_if_stronger(parsed, "Spore Formation", "Negative")
    elif (
        re.search(r"\bspore[-\s]?forming\b", text_lc)
        or "forms spores" in text_lc
    ):
        _set_if_stronger(parsed, "Spore Formation", "Positive")
# ------------------------------------------------------------
# Oxygen requirement
# ------------------------------------------------------------
def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Oxygen Requirement" from a lower-cased description.

    Candidate labels are evaluated in a fixed order (facultative, anaerobic,
    aerobic, microaerophilic, capnophilic) and written through
    _set_if_stronger, so the earliest applicable label is the one kept.
    "aerobic" / "aerobically" only count when the text contains no
    "anaerobic" / "anaerobically".
    """
    is_facultative = re.search(r"facultative(ly)? anaerob", text_lc)

    # Strict anaerobe; plain "anaerobic" is ignored when "facultative" occurs.
    is_anaerobic = (
        re.search(r"\bobligate anaerob", text_lc)
        or (re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc)
        or re.search(r"\banaerobically\b", text_lc)
    )

    # Purely aerobic, guarding against the word sitting inside "anaerobic".
    is_aerobic = (
        re.search(r"\bobligate aerobe\b", text_lc)
        or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
        or (
            re.search(r"\baerobically\b", text_lc)
            and "anaerobically" not in text_lc
        )
    )

    is_microaerophilic = "microaerophilic" in text_lc or "microaerophile" in text_lc
    is_capnophilic = "capnophilic" in text_lc or "co2" in text_lc

    ordered_rules = (
        (is_facultative, "Facultative Anaerobe"),
        (is_anaerobic, "Anaerobic"),
        (is_aerobic, "Aerobic"),
        (is_microaerophilic, "Microaerophilic"),
        (is_capnophilic, "Capnophilic"),
    )
    for applies, label in ordered_rules:
        if applies:
            _set_if_stronger(parsed, "Oxygen Requirement", label)
# ------------------------------------------------------------
# Growth temperature
# ------------------------------------------------------------
def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Growth Temperature" from a lower-cased description.

    The value is ALWAYS stored as "low//high":
    - "grows between 30 and 37 c"      → "30//37"
    - "4-45 c", "10–40 c", "30 to 37 c" → "4//45", "10//40", "30//37"
    - two or more temperatures in text → "min//max"
    - a single temperature "37 c"      → "37//37"

    Rules are tried in that order and the first one that matches is used.
    Except for the "between" and bare "grows at N" forms, a number only
    counts as a temperature when it is followed by a complete unit word
    (c, °c, celsius, degrees c, degrees celsius), so "3 chains" or "1-2 cm"
    are not read as temperatures.
    """
    # The trailing \b rejects words that merely start with "c".
    unit = r"(?:°?c|celsius|degrees c|degrees celsius)\b"

    # 0) Explicit "between X and Y" ranges (unit optional on both numbers)
    m_between = re.search(
        r"between\s+(\d+)\s*(?:c|°c|degrees c|degrees celsius)?"
        r"\s*(?:and|to|-)\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)?",
        text_lc,
    )
    if m_between:
        low, high = m_between.group(1), m_between.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
        return

    # 1) Explicit ranges like "4-45 c", "10–40 c" or "30 to 37 c"
    m_range = re.search(rf"(\d+)\s*(?:[-–/]|to)\s*(\d+)\s*{unit}", text_lc)
    if m_range:
        low, high = m_range.group(1), m_range.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
        return

    # 2) Any two explicit temps → min//max
    temps = [int(t) for t in re.findall(rf"(\d+)\s*{unit}", text_lc)]
    if len(temps) >= 2:
        _set_if_stronger(parsed, "Growth Temperature", f"{min(temps)}//{max(temps)}")
        return

    # 3) Single temps like "grows at 37 c"
    m_single = re.search(
        rf"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*{unit}",
        text_lc,
    )
    if m_single:
        temp = m_single.group(2)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
        return

    # 4) Simplified: "grows at 37" (no explicit unit)
    m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
    if m_simple_num:
        temp = m_simple_num.group(1)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
        return

    # 5) Fallback: plain "37c" somewhere in the text
    m_plain = re.search(rf"\b(\d+)\s*{unit}", text_lc)
    if m_plain:
        temp = m_plain.group(1)
        _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
# ------------------------------------------------------------
# Media grown on (coarse mapping)
# ------------------------------------------------------------
# Canonical medium name → substrings that identify it in a description.
# _parse_media tests each substring with a plain `in` check against the
# text it is given, so entries containing upper-case letters can only match
# text that has not been lower-cased.
MEDIA_KEYWORDS: Dict[str, List[str]] = {
    "Blood Agar": [
        "blood agar",
        "blood-agar",
    ],
    "MacConkey Agar": [
        "macconkey agar",
        "mac conkey agar",
        "macconkey",
    ],
    "Chocolate Agar": [
        "chocolate agar",
        "chocolate-agar",
    ],
    "Nutrient Agar": [
        "nutrient agar",
        "nutrient-agar",
        "nut agar",
    ],
    "XLD Agar": [
        "xld agar",
        "xld",
    ],
    "TCBS Agar": [
        "tcbs agar",
        "tcbs",
    ],
    "ALOA": [
        "aloa agar",
        "aloa",
    ],
    "BCYE Agar": [
        "bcye agar",
        "bcye",
        "Buffered Charcoal Yeast Extract Agar",
        "buffered charcoal yeast extract agar",
    ],
    "MRS Agar": [
        "mrs agar",
    ],
    "Mannitol Salt Agar": [
        # the full name was previously missing, so only abbreviations matched
        "mannitol salt agar",
        "msa agar",
        "ms agar",
    ],
    "Cycloserine Cefoxitin Fructose Agar": [
        "ccfa agar",
        "cycloserine cefoxitin fructose agar",
        "ccf agar",
    ],
    "Thayer Martin Agar": [
        "thayer martin agar",
        "thayer-martin agar",
        "tma agar",
        "tma",
    ],
    "Bordet-Gengou Agar": [
        "bordet gengou agar",
        # hyphenated spelling, as used in the canonical name itself
        "bordet-gengou agar",
    ],
    "Cetrimide Agar": [
        "cetrimide agar",
    ],
    "Anaerobic Agar": [
        "anaerobic agar",
    ],
    "Anaerobic Blood Agar": [
        "anaerobic blood agar",
    ],
    "Hektoen Enteric Agar": [
        "hektoen enteric agar",
        "HK Agar",
        "hk",
    ],
    "Tryptic Soy Agar": [
        "tryptic soy agar",
        "t-soy agar",
        "tsoy",
    ],
    "Brucella Agar": [
        "brucella agar",
    ],
    "Charcoal Agar": [
        "charcoal agar",
    ],
    "Yeast Extract Mannitol Agar": [
        "yeast extract mannitol agar",
    ],
    "Sabouraud Agar": [
        "sabouraud agar",
        "sabouraud dextrose agar",
    ],
    "BHI": [
        "bhi",
        "brain heart infusion agar",
        "brain heart infusion",
    ],
    "Columbia Blood Agar": [
        "columbia blood agar",
        "columbia agar",
        "columbia",
    ],
    "Lowenstein-Jensen Agar": [
        "lowenstein-jensen agar",
        "lowenstein jensen agar",
    ],
    "BSK Medium": [
        "bsk medium",
        "bsk",
        "bsk-ii medium",
        "bsk-h medium",
    ],
    "Ashby Agar": [
        "ashby agar",
        "ashby medium",
    ],
}
def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Media Grown On" with every medium from MEDIA_KEYWORDS whose
    patterns occur in *text_lc* (plain substring test).

    Matching media are listed once each, in MEDIA_KEYWORDS order, joined by
    "; ". Nothing is written when no medium is found.
    """
    matched_names: List[str] = [
        name
        for name, needles in MEDIA_KEYWORDS.items()
        if any(needle in text_lc for needle in needles)
    ]
    if not matched_names:
        return
    _set_if_stronger(parsed, "Media Grown On", "; ".join(matched_names))
# ------------------------------------------------------------
# Sugar fermentation parsing
# ------------------------------------------------------------
def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill the sugar fermentation fields of SUGAR_FIELDS from a lower-cased text.

    Handles patterns like:
    - "glucose positive, mannitol negative"
    - "<sugar> is positive/negative"
    - "<sugar> fermenter" (positive unless "non-<sugar> fermenter")
    - "ferments glucose, mannitol and sucrose but not lactose"
    - "does not ferment lactose or sucrose" (also "doesn't ferment ...")
    - "non-lactose fermenter" / "non-lactose fermenting"
    - "<sugar> fermentation positive", "positive for <sugar> fermentation",
      "<sugar> fermentation is positive/negative"
    - global non-fermenter / asaccharolytic phrases → remaining sugars Negative
    - "all other sugars negative" → remaining sugars get that value

    Every write goes through _set_if_stronger, so for a given sugar the
    first rule that fires (in the order above) decides its value and the
    global rules only touch sugars that are still missing or Unknown.
    """
    pnv = r"(positive|negative|variable|pos|neg|\+|\-)"
    # "does not" / "doesn't" / "does n't" (straight or typographic apostrophe)
    does_not = r"does\s*(?:not|n['’]t)"

    def _mark_named(segment: str, value: str) -> None:
        # Give *value* to every sugar mentioned as a whole word in *segment*.
        for name, column in SUGAR_FIELDS.items():
            if re.search(rf"\b{name}\b", segment):
                _set_if_stronger(parsed, column, value)

    def _mark_all(value: str) -> None:
        # Give *value* to every sugar that has no concrete result yet.
        for column in SUGAR_FIELDS.values():
            _set_if_stronger(parsed, column, value)

    # 0) "<sugar> positive", "<sugar> is positive", "<sugar> fermenter"
    for name, column in SUGAR_FIELDS.items():
        m_simple = re.search(rf"{name}\s+{pnv}", text_lc)
        if m_simple:
            val = _value_from_pnv_context(m_simple.group(1))
            if val:
                _set_if_stronger(parsed, column, val)

        m_is = re.search(rf"{name}\s+is\s+{pnv}", text_lc)
        if m_is:
            val = _value_from_pnv_token(m_is.group(1))
            if val:
                _set_if_stronger(parsed, column, val)

        # "non-lactose fermenter" is Negative; a plain "lactose fermenter"
        # is Positive only when the negated form does not occur.
        if re.search(rf"\bnon[- ]{name}\s+fermenter\b", text_lc):
            _set_if_stronger(parsed, column, "Negative")
        elif re.search(rf"\b{name}\s+fermenter\b", text_lc):
            _set_if_stronger(parsed, column, "Positive")

    # 1) "ferments X, Y and Z but not A, B"
    for m in re.finditer(r"ferments\s+([a-z0-9 ,;/&\-]+)", text_lc):
        # Everything before "but not" is fermented, everything after is not.
        halves = re.split(r"\bbut not\b", m.group(1), maxsplit=1)
        _mark_named(halves[0], "Positive")
        _mark_named(halves[1] if len(halves) > 1 else "", "Negative")

    # 2) Grouped "does not ferment X or Y" (stops at but / punctuation).
    #    Prevents glucose being marked negative in:
    #    "does not ferment lactose or sucrose, but glucose fermentation is positive"
    grouped_neg_pattern = re.compile(
        does_not + r"\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
    )
    for m in grouped_neg_pattern.finditer(text_lc):
        _mark_named(m.group(1), "Negative")

    # 3-5) Per-sugar negations and "<sugar> fermentation ..." phrasing
    for name, column in SUGAR_FIELDS.items():
        # 3) single "does not ferment X"; 4) "non-X fermenter / fermenting"
        if re.search(rf"{does_not}\s+ferment\s+{name}\b", text_lc) or re.search(
            rf"non[- ]{name}\s+ferment(ing|er)?", text_lc
        ):
            _set_if_stronger(parsed, column, "Negative")

        # 5) first matching phrasing decides; the token is always group 1
        fermentation_templates = (
            rf"{name}\s+fermentation[ \-]?{pnv}",          # "glucose fermentation positive"
            rf"{pnv}\s+(for\s+)?{name}\s+fermentation",    # "positive for glucose fermentation"
            rf"{name}\s+fermentation\s+is\s+{pnv}",        # "glucose fermentation is positive"
        )
        for template in fermentation_templates:
            m = re.search(template, text_lc)
            if m:
                val = _value_from_pnv_token(m.group(1))
                if val:
                    _set_if_stronger(parsed, column, val)
                break

    # 6) Global non-fermenter phrases, 7) asaccharolytic
    #    → all sugars Negative unless already set by a more specific rule.
    if (
        re.search(rf"{does_not}\s+ferment\s+(carbohydrates|sugars)", text_lc)
        or re.search(r"\bnon[- ]ferment(er|ing|ative)\b", text_lc)
        or "asaccharolytic" in text_lc
        or "non-saccharolytic" in text_lc
        or "non saccharolytic" in text_lc
    ):
        _mark_all("Negative")

    # 8) "all other sugars negative/positive"
    m_other = re.search(
        r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_other:
        val = _value_from_pnv_token(m_other.group(1))
        if val:
            _mark_all(val)
# ------------------------------------------------------------
# Colony morphology (coarse, optional)
# ------------------------------------------------------------
def _normalise_colony_desc(desc: str) -> str:
"""
Take a raw colony descriptor and normalise into:
"Smooth; Yellow; Opaque" etc.
Tweaks:
- Remove "-pigmented" → "yellow-pigmented" → "yellow"
- Treat "and" like a separator for parts
"""
# Remove "-pigmented" so "yellow-pigmented" → "yellow"
tmp = desc.replace("-pigmented", "")
# Normalise "and" to a comma so it acts like a separator
tmp = tmp.replace(" and ", ", ")
parts = [s.strip() for s in re.split(r"[;,]", tmp) if s.strip()]
pretty = "; ".join(p.capitalize() for p in parts)
return pretty
def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Very coarse extraction of "Colony Morphology" from free text.

    Patterns are tried in order and the first one that yields a non-empty
    normalised descriptor wins:
      1. "colonies are yellow, mucoid"
      2. "colonies dry, white and irregular on nutrient agar"
      3. "forming smooth, yellow-pigmented, opaque colonies"
      4. "grey colonies", "large grey colonies" (no verb)

    The result is written through _set_if_stronger; nothing is returned.
    """

    def _store(raw_desc: str) -> bool:
        # Normalise and store one candidate; report whether anything was stored.
        desc = raw_desc.strip()
        if not desc:
            return False
        pretty = _normalise_colony_desc(desc)
        if not pretty:
            return False
        _set_if_stronger(parsed, "Colony Morphology", pretty)
        return True

    # Pattern 1: "colonies are ..."
    m = re.search(r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
    if m and _store(m.group(3)):
        return

    # Pattern 2: "colonies dry, white and irregular on nutrient agar"
    # Capture the whole descriptor list (commas included, as pattern 1 does)
    # and cut it at the first " on" that introduces the medium. The leading
    # whitespace is kept inside the group so that "colonies on blood agar"
    # splits to an empty descriptor and falls through to patterns 3/4
    # instead of being stored as "On blood agar".
    m2 = re.search(r"colonies(\s+[a-z0-9 ,;\-]+)", text_lc)
    if m2:
        before_medium = re.split(r"\s+on\b", m2.group(1), maxsplit=1)[0]
        if _store(before_medium):
            return

    # Pattern 3: "forming green colonies", "forms mucoid colonies",
    # "forming smooth, yellow-pigmented, opaque colonies"
    m3 = re.search(
        r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
        text_lc,
    )
    if m3 and _store(m3.group(2)):
        return

    # Pattern 4: plain descriptor before "colonies" (e.g. "grey colonies",
    # "large grey colonies") when none of the above match.
    m4 = re.search(
        r"\b([a-z0-9 ,;\-]+?)\s+colonies\b",
        text_lc,
    )
    if m4:
        _store(m4.group(1))
def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
    """
    Late "patch" pass that fills gaps left by the earlier rule parsers.

    Scalar fields (Haemolysis, Haemolysis Type, Motility, Spore Formation,
    NaCl Tolerant (>=6%), Growth Temperature) are only written when they are
    missing from `parsed` or still equal to UNKNOWN. The two list fields
    ("Colony Morphology", "Media Grown On") are merged with whatever is
    already stored, without duplicates.

    Args:
        original_text: Raw input text. Not read by this function; kept so the
            call signature stays stable.
        text_lc: Text the regexes are matched against (patterns are lower-case).
        parsed: Field dict; mutated in place.

    Returns:
        The same `parsed` dict, for call-site convenience.
    """
    nacl_field = "NaCl Tolerant (>=6%)"
    pnv_tokens = r"(positive|negative|variable|pos|neg|\+|\-)"

    # ----------------------------------------------
    # helpers
    # ----------------------------------------------
    def _pnv(x: str) -> Optional[str]:
        # Map a positive/negative/variable token to its canonical value.
        x = x.strip().lower()
        if x in {"positive", "pos", "+", "strongly positive", "weakly positive"}:
            return "Positive"
        if x in {"negative", "neg", "-", "no"}:
            return "Negative"
        if x in {"variable", "var", "mixed"}:
            return "Variable"
        return None

    def _is_unset(field: str) -> bool:
        # A field counts as unset when absent OR explicitly UNKNOWN.
        return parsed.get(field, UNKNOWN) == UNKNOWN

    def _fill(field: str, value: str) -> None:
        # Write only when nothing meaningful is stored yet.
        if _is_unset(field):
            parsed[field] = value

    def _merge_list_field(field: str, new_items) -> None:
        # Append new_items to the "; "-separated list in `field`, skipping
        # duplicates, empty entries and an UNKNOWN placeholder.
        existing = parsed.get(field, "")
        merged = []
        for item in (x.strip() for x in existing.split(";")):
            if item and item != UNKNOWN and item not in merged:
                merged.append(item)
        for item in new_items:
            if item not in merged:
                merged.append(item)
        parsed[field] = "; ".join(merged)

    # ============================================================
    # Haemolysis Type detection (alpha/beta/none)
    # "ha?emolysis" accepts both British and US spellings, matching the
    # "none" rule which already accepted "hemolysis"/"hemolytic".
    # ============================================================
    for type_name, token in (("Alpha", r"(?:alpha|α)"), ("Beta", r"(?:beta|β)")):
        if re.search(rf"{token}[-\s]*ha?emolysis", text_lc) or re.search(
            rf"ha?emolysis type[: ]*{token}", text_lc
        ):
            _fill("Haemolysis", "Positive")
            _fill("Haemolysis Type", type_name)

    # gamma / none
    gamma_hit = re.search(r"(?:gamma|γ)[-\s]*ha?emolysis", text_lc)
    none_hit = re.search(r"no ha?emolysis|non[- ]ha?emolytic", text_lc)
    if gamma_hit or none_hit:
        _fill("Haemolysis", "Negative")
        _fill("Haemolysis Type", "None")

    # ============================================================
    # Generic +/- statements
    # ============================================================
    # 1. Haemolysis: generic ± without type
    m_h = re.search(rf"ha?emolysis\s+{pnv_tokens}", text_lc)
    if m_h and _is_unset("Haemolysis"):
        val = _pnv(m_h.group(1))
        if val:
            parsed["Haemolysis"] = val
            if val == "Positive":
                _fill("Haemolysis Type", "Unknown")

    # 2. Motility: generic ±
    m_mot = re.search(rf"motility\s+{pnv_tokens}", text_lc)
    if m_mot and _is_unset("Motility"):
        val = _pnv(m_mot.group(1))
        if val:
            parsed["Motility"] = val

    # 3. Spore formation ±
    m_sp = re.search(rf"spore formation\s+{pnv_tokens}", text_lc)
    if m_sp and _is_unset("Spore Formation"):
        val = _pnv(m_sp.group(1))
        if val:
            parsed["Spore Formation"] = val

    # ============================================================
    # NaCl tolerance
    # ============================================================
    if _is_unset(nacl_field):
        # direct p/n/v
        m_nacl = re.search(
            r"(?:nacl\s*(?:tolerant|tolerance)?|growth\s+in\s+6\%[\s]*nacl)"
            r"\s*(positive|negative|variable|pos|neg|\+|\-)",
            text_lc,
        )
        if m_nacl:
            val = _pnv(m_nacl.group(1))
            if val:
                parsed[nacl_field] = val

    # "no growth in 6% nacl" / "does not grow in 6% nacl".
    # The negated verb form must be tested before the positive rule below,
    # otherwise "grow in 6% nacl" inside "does not grow in 6% nacl" would be
    # read as a Positive.
    if _is_unset(nacl_field):
        if re.search(
            r"(?:no\s+growth|(?:not|n't)\s+grow)\s+in\s+(?:>=)?\s*6\%?\s*nacl",
            text_lc,
        ):
            parsed[nacl_field] = "Negative"

    # "grows in 6% nacl"
    if _is_unset(nacl_field):
        if re.search(r"grows?\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
            parsed[nacl_field] = "Positive"

    # ============================================================
    # Growth Temperature patterns (20/40, 20//40, 20 / 40)
    # ============================================================
    m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
    if m_temp and _is_unset("Growth Temperature"):
        parsed["Growth Temperature"] = f"{m_temp.group(1)}//{m_temp.group(2)}"

    # ============================================================
    # Colony Morphology STRICT LIST extraction
    # ============================================================
    m_col = re.search(
        r"(?:colony morphology|colonies are|colonies appear|colonies look|colony appearance|colony characteristics)"
        r"[: ]+([a-z0-9 ,;/\-]+)",
        text_lc,
    )
    if m_col:
        # "/" is an extra separator here; after that, reuse the shared
        # normaliser so "white and irregular" / "yellow-pigmented" produce the
        # same items as the primary colony parser and merge without
        # near-duplicates.
        pretty = _normalise_colony_desc(m_col.group(1).replace("/", ","))
        clean_desc = [p for p in pretty.split("; ") if len(p) > 1]
        if clean_desc:
            _merge_list_field("Colony Morphology", clean_desc)

    # ============================================================
    # Multi-media patch
    # ============================================================
    if "grown on" in text_lc:
        mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
        if mm:
            raw_items = [x.strip() for x in re.split(r"[;,]", mm.group(1)) if x.strip()]
            detected_media = []
            for item in raw_items:
                for media_name, patterns in MEDIA_KEYWORDS.items():
                    if media_name not in detected_media and any(p in item for p in patterns):
                        detected_media.append(media_name)
            if detected_media:
                _merge_list_field("Media Grown On", detected_media)

    return parsed
# ------------------------------------------------------------
# PUBLIC API
# ------------------------------------------------------------
def parse_text_rules(text: str) -> Dict[str, Any]:
    """
    Main entry point for the rule-based core parser.

    Cleans and lower-cases the text, runs every rule parser in a fixed order,
    then applies the patch pass.

    Returns a dict with:
      - "parsed_fields": the extracted field/value mapping
      - "source": always "rule_parser"
      - "raw": the input text ("" when `text` is falsy)
      - "error": "<ExceptionType>: <message>", present only if a step raised;
        "parsed_fields" then holds whatever was extracted before the failure.
    """
    original = text or ""
    parsed: Dict[str, str] = {}
    result: Dict[str, Any] = {
        "parsed_fields": parsed,
        "source": "rule_parser",
        "raw": original,
    }
    try:
        # Cleaning runs inside the fail-safe too, so a failure while
        # normalising the input is reported instead of propagating.
        text_lc = _clean_text(original).lower()
        _parse_gram_and_shape(text_lc, parsed)
        _parse_haemolysis(text_lc, parsed)
        _parse_core_bool_tests(text_lc, parsed)
        _parse_motility_capsule_spores(text_lc, parsed)
        _parse_oxygen(text_lc, parsed)
        _parse_growth_temperature(text_lc, parsed)
        _parse_media(text_lc, parsed)
        _parse_sugars(text_lc, parsed)
        _parse_colony(text_lc, parsed)
        result["parsed_fields"] = _apply_patches(original, text_lc, parsed)
    except Exception as e:
        # Fail-safe: never crash the app, just report an error.
        # Deliberately broad: this is the top-level boundary of the parser.
        result["error"] = f"{type(e).__name__}: {e}"
    return result