|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import re |
|
|
from typing import Dict, Any, List |
|
|
|
|
|
|
|
|
# Placeholder value for a field that has not been determined.
# _set_if_stronger() allows a concrete value to overwrite this placeholder.
UNKNOWN = "Unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps the lower-case sugar name searched for in the text to the output field
# it fills. Every field name is the capitalised sugar plus " Fermentation".
SUGAR_FIELDS: Dict[str, str] = {
    sugar: f"{sugar.capitalize()} Fermentation"
    for sugar in (
        "glucose",
        "lactose",
        "sucrose",
        "maltose",
        "mannitol",
        "sorbitol",
        "xylose",
        "rhamnose",
        "arabinose",
        "raffinose",
        "trehalose",
        "inositol",
    )
}
|
|
|
|
|
# Output field name -> lower-case keywords that identify the test in free text.
# _parse_core_bool_tests() tries the keywords of each field in list order and
# stops at the first keyword for which a result is found, so the more specific
# spellings are listed before the short ones.
CORE_BOOL_FIELDS: Dict[str, List[str]] = {
    # Simple enzyme / biochemical tests
    "Catalase": ["catalase"],
    "Oxidase": ["oxidase"],
    "Indole": ["indole"],
    "Urease": ["urease"],
    "Citrate": ["citrate"],
    # Methyl red / Voges-Proskauer
    "Methyl Red": ["methyl red", "mr test", "mr"],
    "VP": ["voges-proskauer", "vp test", "vp"],
    # Hydrogen sulfide
    "H2S": ["h2s", "hydrogen sulfide"],
    # DNase, including common spelling variants
    "DNase": [
        "dnase",
        "dnase test",
        "dnase activity",
        "dnase production",
        "dnaase",
        "dna hydrolysis",
    ],
    "ONPG": ["onpg"],
    "Coagulase": ["coagulase"],
    "Lipase Test": ["lipase"],
    "Nitrate Reduction": ["nitrate reduction", "nitrate"],
    "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
    # Decarboxylases / dihydrolase
    "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
    # NOTE(review): "Ornitihine" looks like a misspelling of "Ornithine". The
    # same spelling is used as a field name in _parse_core_bool_tests(), so it
    # is presumably what downstream consumers expect - confirm before renaming.
    "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
    "Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
    # Hydrolysis tests
    "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
    "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _clean_text(text: str) -> str: |
|
|
""" |
|
|
Normalise unicode oddities and collapse whitespace. |
|
|
Also: |
|
|
- strip degree symbols |
|
|
- normalise subscript ₂ → 2 for H₂S |
|
|
- normalise α/β/γ to alpha/beta/gamma for haemolysis patterns |
|
|
""" |
|
|
if not text: |
|
|
return "" |
|
|
s = text.replace("°", "").replace("º", "") |
|
|
|
|
|
s = s.replace("₂", "2") |
|
|
|
|
|
|
|
|
s = ( |
|
|
s.replace("α", "alpha") |
|
|
.replace("β", "beta") |
|
|
.replace("γ", "gamma") |
|
|
) |
|
|
|
|
|
|
|
|
return " ".join(s.split()) |
|
|
|
|
|
|
|
|
def _norm(s: str) -> str: |
|
|
return s.strip().lower() |
|
|
|
|
|
|
|
|
def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
    """
    Store *value* under parsed[field] without overwriting a concrete value.

    The write happens only when *value* is non-empty and the field is either
    missing from *parsed* or currently holds UNKNOWN. In every other case
    *parsed* is left untouched, so the first concrete value written wins.
    """
    # A missing field is treated exactly like one holding UNKNOWN.
    if value and parsed.get(field, UNKNOWN) == UNKNOWN:
        parsed[field] = value
|
|
|
|
|
|
|
|
def _value_from_pnv_token(token: str) -> str | None: |
|
|
""" |
|
|
Map a simple token to Positive / Negative / Variable. |
|
|
""" |
|
|
seg = _norm(token) |
|
|
if seg in ["positive", "pos", "+"]: |
|
|
return "Positive" |
|
|
if seg in ["negative", "neg", "-"]: |
|
|
return "Negative" |
|
|
if seg in ["variable", "var", "v"]: |
|
|
return "Variable" |
|
|
return None |
|
|
|
|
|
|
|
|
def _value_from_pnv_context(segment: str) -> str | None:
    """
    Interpret a short phrase as "Positive", "Negative" or "Variable".

    Handles:
    - a bare token: "positive", "neg", "+", ...
    - an "is <token>" phrase anywhere in the segment: "is positive", "is -"

    Returns None when no result can be read from the segment.
    """
    seg = _norm(segment)

    # The whole segment may already be a single result token.
    direct = _value_from_pnv_token(seg)
    if direct:
        return direct

    # Word tokens must end on a word boundary ("is positively" is rejected).
    # The symbols "+" and "-" are matched without one: a "\b" after a symbol
    # only holds when a word character follows, so "is +" at the end of the
    # segment or before a space would otherwise never match.
    m = re.search(
        r"\bis\s+(?:(positive|negative|variable|pos|neg)\b|([+\-]))",
        seg,
    )
    if m:
        return _value_from_pnv_token(m.group(1) or m.group(2))
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Gram Stain" and "Shape" in *parsed* from the lower-cased text.

    Gram stain: the first of positive / negative / variable that is mentioned
    (as "gram-<x>" or "gram <x>") is recorded; the others are not examined.

    Shape: the rules run from most to least specific (short rods and
    coccobacilli, then cocci, then rods/bacilli, then spiral forms). Because
    writes go through _set_if_stronger, the first matching rule decides.
    """
    # All three results accept both the hyphenated and the spaced spelling.
    for word, result in (
        ("positive", "Positive"),
        ("negative", "Negative"),
        ("variable", "Variable"),
    ):
        if f"gram-{word}" in text_lc or f"gram {word}" in text_lc:
            _set_if_stronger(parsed, "Gram Stain", result)
            break

    # (shape, matched) pairs in priority order.
    shape_rules = (
        ("Short Rods", "short rods" in text_lc),
        ("Short Rods", re.search(r"\bcoccobacill(?:us|i)\b", text_lc) is not None),
        ("Cocci", re.search(r"\bcocci\b", text_lc) is not None),
        (
            "Cocci",
            re.search(
                r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc
            )
            is not None,
        ),
        (
            "Rods",
            re.search(r"\brods?\b", text_lc) is not None or "bacilli" in text_lc,
        ),
        ("Spiral", "spiral" in text_lc or "spirochete" in text_lc),
    )
    for shape, matched in shape_rules:
        if matched:
            _set_if_stronger(parsed, "Shape", shape)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Haemolysis Type" and "Haemolysis" from haemolysis phrasing.

    Recognised, in this order (the first concrete value written wins):
    - beta-/alpha-/gamma- followed by haemolytic, hemolytic, haemolysis or
      hemolysis (hyphen, space or nothing in between)
    - non-haemolytic / non haemolytic / non-hemolytic / non hemolytic
    - "variable haemolysis" / "variable hemolysis"

    Beta and alpha count as Haemolysis "Positive"; gamma and non-haemolytic
    count as "Negative". Greek letters are expected to have been spelled out
    already (see _clean_text).
    """
    greek_types = (
        ("beta", "Beta", "Positive"),
        ("alpha", "Alpha", "Positive"),
        ("gamma", "Gamma", "Negative"),
    )
    for prefix, type_name, outcome in greek_types:
        if re.search(
            rf"{prefix}[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc
        ):
            _set_if_stronger(parsed, "Haemolysis Type", type_name)
            _set_if_stronger(parsed, "Haemolysis", outcome)

    # Both spellings are accepted with either a hyphen or a space after "non".
    if re.search(r"non[- ](?:haemolytic|hemolytic)", text_lc):
        _set_if_stronger(parsed, "Haemolysis Type", "None")
        _set_if_stronger(parsed, "Haemolysis", "Negative")

    if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
        _set_if_stronger(parsed, "Haemolysis Type", "Variable")
        _set_if_stronger(parsed, "Haemolysis", "Variable")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Result tokens that may follow or precede a test name (one capturing group).
_CORE_PNV = r"(positive|negative|variable|pos|neg|\+|\-)"

# "does not" in its written forms: "does not", "doesn't", "does n't", "doesnt".
_CORE_DOES_NOT = r"does\s*(?:not|n['’]?t)"

# Any of these phrases means the organism does not tolerate NaCl.
_NACL_NEGATIVE_PATTERNS = (
    rf"{_CORE_DOES_NOT}\s+grow\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
    r"no\s+growth\s+(in|at)\s*\d+(?:\.\d+)?\s*%?\s*nacl",
    r"no\s+growth\s+in\s+nacl",
    r"unable\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl",
    r"cannot\s+tolerate\s+nacl",
    r"not\s+nacl\s+tolerant",
    r"nacl\s+sensitive",
    r"fails\s+to\s+grow\s+(in|at)\s*(\d+(?:\.\d+)?\s*%?\s*)?nacl",
    r"intolerant\s+to\s+nacl",
    r"no\s+tolerance\s+to\s+nacl",
    r"nacl\s+intolerance",
    r"no\s+growth\s+at\s+high\s+nacl",
)

# Phrases that state growth/tolerance at a NaCl concentration (one group: %).
_NACL_POSITIVE_PATTERNS = (
    r"(?:grows|growth occurs|growth observed|able to grow|tolerates|tolerant)\s+"
    r"(?:in|at|up to|to)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
    r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
)

# Names accepted on either side of a grouped "MR and VP <result>" phrase.
_MR_VP_NAME = r"mr(?: test)?|methyl red|vp(?: test)?|voges-proskauer"


def _core_keyword_value(kw: str, text_lc: str) -> str | None:
    """Return the result stated for test keyword *kw* in *text_lc*, or None."""
    name = re.escape(kw)
    # Tried in this order; the first phrasing that matches decides.
    phrasings = (
        rf"{name}[ \-]?{_CORE_PNV}",  # "catalase positive"
        rf"{_CORE_PNV}\s+(?:for\s+)?{name}",  # "positive for catalase"
        rf"{name}\s+is\s+{_CORE_PNV}",  # "catalase is positive"
        rf"{name}\s+reaction\s+is\s+{_CORE_PNV}",  # "indole reaction is negative"
        rf"{name}\s+reaction\s+{_CORE_PNV}",  # "indole reaction negative"
        rf"{name}\s+test\s+reaction\s+is\s+{_CORE_PNV}",
    )
    for phrasing in phrasings:
        m = re.search(phrasing, text_lc)
        if m:
            val = _value_from_pnv_token(m.group(1))
            if val:
                return val
    return None


def _parse_nacl_tolerance(text_lc: str, parsed: Dict[str, str]) -> None:
    """Decide "NaCl Tolerant (>=6%)" from growth / tolerance statements."""
    has_negative = any(re.search(p, text_lc) for p in _NACL_NEGATIVE_PATTERNS)

    # Growth or tolerance counts as positive only at 6 % NaCl or more. The
    # captured text is always a plain decimal number, so float() cannot fail.
    has_positive = any(
        float(m.group(1)) >= 6.0
        for pattern in _NACL_POSITIVE_PATTERNS
        for m in re.finditer(pattern, text_lc)
    )

    if has_negative:
        # An explicit statement of intolerance overrides any earlier value.
        parsed["NaCl Tolerant (>=6%)"] = "Negative"
    elif has_positive:
        _set_if_stronger(parsed, "NaCl Tolerant (>=6%)", "Positive")


def _parse_nitrate_h2s_dnase(text_lc: str, parsed: Dict[str, str]) -> None:
    """Handle verb-style phrasing for nitrate reduction, H2S and DNase."""
    if re.search(r"reduces nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
    if re.search(rf"{_CORE_DOES_NOT} reduce nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")

    if re.search(r"(produces|production of)\s+h2s", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(positive|pos|\+)", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if re.search(r"h2s production\s+is\s+(negative|neg|\-)", text_lc):
        _set_if_stronger(parsed, "H2S", "Negative")
    if (
        re.search(rf"{_CORE_DOES_NOT} produce\s+h2s", text_lc)
        or re.search(r"no h2s production", text_lc)
        or re.search(r"non[- ]h2s producing", text_lc)
    ):
        _set_if_stronger(parsed, "H2S", "Negative")

    dnase_rules = (
        (r"\bdnase(\s+test|\s+activity|\s+production)?\s*(positive|pos|\+)\b", "Positive"),
        (r"\b(positive|pos|\+)\s+dnase(\s+test|\s+activity|\s+production)?\b", "Positive"),
        (r"\bdnase(\s+test|\s+activity|\s+production)?\s*(negative|neg|\-)\b", "Negative"),
        (r"\b(negative|neg|\-)\s+dnase(\s+test|\s+activity|\s+production)?\b", "Negative"),
        (r"\bnon[- ]?dnase[- ]?producing\b", "Negative"),
    )
    for pattern, result in dnase_rules:
        if re.search(pattern, text_lc):
            _set_if_stronger(parsed, "DNase", result)


def _parse_mr_vp_pairs(text_lc: str, parsed: Dict[str, str]) -> None:
    """Handle grouped phrases such as "MR and VP negative"."""
    pair_pattern = re.compile(
        rf"\b({_MR_VP_NAME})\s*(?:test)?\s*(?:and|&)\s*({_MR_VP_NAME})\s+{_CORE_PNV}"
    )
    for m in pair_pattern.finditer(text_lc):
        val = _value_from_pnv_token(m.group(3))
        if not val:
            continue
        # Both named tests receive the shared result.
        for name in (m.group(1), m.group(2)):
            if "mr" in name or "methyl red" in name:
                _set_if_stronger(parsed, "Methyl Red", val)
            if "vp" in name or "voges" in name:
                _set_if_stronger(parsed, "VP", val)


def _parse_grouped_tests(text_lc: str, parsed: Dict[str, str]) -> None:
    """Apply one result to every test or sugar named in a grouped phrase.

    Examples: "gelatin and esculin hydrolysis negative",
    "lysine, ornithine and arginine negative".
    """
    grouped_pattern = re.compile(
        r"([a-z0-9 ,/&\-]+?)\s+"
        r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
        r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
    )
    for m in grouped_pattern.finditer(text_lc):
        val = _value_from_pnv_token(m.group(2))
        if not val:
            continue
        names = m.group(1).lower()

        # A segment naming no known test or sugar simply writes nothing.
        for field, keywords in CORE_BOOL_FIELDS.items():
            if any(re.search(rf"\b{re.escape(kw)}\b", names) for kw in keywords):
                _set_if_stronger(parsed, field, val)

        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", names):
                _set_if_stronger(parsed, field, val)


def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill the Positive / Negative / Variable test fields of CORE_BOOL_FIELDS.

    For each test, the keywords are tried in order against phrasings like:
    - "catalase positive"
    - "positive for catalase"
    - "catalase is positive"
    - "indole reaction is negative"
    - "indole reaction negative"
    - "indole test reaction is positive"

    Then, in this order:
    - NaCl tolerance with % values (an explicit negative overrides)
    - nitrate reduction, H2S production / non-production, DNase phrasing
    - grouped MR/VP: "MR and VP negative"
    - "all decarboxylases <result>"
    - generic grouped phrases, which may also set sugar fields:
        "gelatin and esculin hydrolysis negative"
        "lysine, ornithine and arginine negative"

    Except for the NaCl override, values are written through
    _set_if_stronger, so an earlier concrete value is never replaced.
    """
    for field, keywords in CORE_BOOL_FIELDS.items():
        for kw in keywords:
            val = _core_keyword_value(kw, text_lc)
            if val:
                _set_if_stronger(parsed, field, val)
                break  # the first keyword that yields a result decides

    _parse_nacl_tolerance(text_lc, parsed)
    _parse_nitrate_h2s_dnase(text_lc, parsed)
    _parse_mr_vp_pairs(text_lc, parsed)

    m_all_decarb = re.search(
        r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_all_decarb:
        val = _value_from_pnv_token(m_all_decarb.group(1))
        if val:
            for f in (
                "Lysine Decarboxylase",
                "Ornitihine Decarboxylase",
                "Arginine dihydrolase",
            ):
                _set_if_stronger(parsed, f, val)

    _parse_grouped_tests(text_lc, parsed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Motility", "Capsule" and "Spore Formation" from the lower-cased text.

    Motility:
    - "motile" without any negated form -> Positive
    - non-motile / non motile / nonmotile / immotile -> Negative
    - named motility types (tumbling, swarming, corkscrew, gliding,
      spreading) or "swarming" without "non-swarming" -> Positive

    Capsule:
    - capsulated / encapsulated (not negated), "capsule present",
      "capsule is present", "capsule positive" -> Positive
    - non-capsulated and similar negations, "no capsule", "capsule absent",
      "capsule is absent", "capsule negative" -> Negative

    Spores:
    - non-spore-forming / "no spores" -> Negative (and nothing else is tried)
    - spore-forming / "forms spores" -> Positive

    All writes go through _set_if_stronger: the first concrete value wins.
    """
    # --- Motility ---------------------------------------------------------
    if (
        re.search(r"\bmotile\b", text_lc)
        and not re.search(r"\bnon[- ]?motile\b", text_lc)
        and "nonmotile" not in text_lc
        and "immotile" not in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    if (
        "non-motile" in text_lc
        or "non motile" in text_lc
        or "nonmotile" in text_lc
        or "immotile" in text_lc
    ):
        _set_if_stronger(parsed, "Motility", "Negative")

    if (
        "tumbling motility" in text_lc
        or "swarming motility" in text_lc
        or "corkscrew motility" in text_lc
        or re.search(r"\b(gliding|spreading)\s+motility\b", text_lc)
        or ("swarming" in text_lc and "non-swarming" not in text_lc)
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    # --- Capsule ----------------------------------------------------------
    # A plain substring test for "capsulated" also fires inside
    # "non-capsulated" and would record a negated capsule as Positive.
    # Require a word start and reject a preceding "non-" / "non ".
    capsulated_word = re.search(
        r"(?<!non-)(?<!non )\b(?:en)?capsulated\b", text_lc
    )
    if (
        capsulated_word
        or "capsule present" in text_lc
        or re.search(r"capsule\s+is\s+present", text_lc)
        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
    ):
        _set_if_stronger(parsed, "Capsule", "Positive")

    if (
        "non-capsulated" in text_lc
        or re.search(r"\b(?:non[- ]?|un)(?:en)?capsulated\b", text_lc)
        or "no capsule" in text_lc
        or "capsule absent" in text_lc
        or re.search(r"capsule\s+is\s+absent", text_lc)
        or re.search(r"capsule[ \-]?(negative|neg|\-)", text_lc)
    ):
        _set_if_stronger(parsed, "Capsule", "Negative")

    # --- Spores -----------------------------------------------------------
    # The negated form is checked first because it contains the positive one.
    if (
        re.search(r"\bnon[-\s]?spore[-\s]?forming\b", text_lc)
        or "no spores" in text_lc
    ):
        _set_if_stronger(parsed, "Spore Formation", "Negative")
        return

    if re.search(r"\bspore[-\s]?forming\b", text_lc) or "forms spores" in text_lc:
        _set_if_stronger(parsed, "Spore Formation", "Positive")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Oxygen Requirement" from the lower-cased text.

    Candidates are evaluated in a fixed order and written through
    _set_if_stronger, so the first one that applies decides:
    1. Facultative Anaerobe ("facultative(ly) anaerob...")
    2. Anaerobic (obligate anaerobe, "anaerobic" without "facultative",
       or "anaerobically")
    3. Aerobic (obligate aerobe, "aerobic" / "aerobically" only when the
       matching anaerobic word is absent from the text)
    4. Microaerophilic
    5. Capnophilic ("capnophilic" or any mention of "co2")
    """

    def found(pattern: str) -> bool:
        return re.search(pattern, text_lc) is not None

    anaerobic = (
        found(r"\bobligate anaerob")
        or (found(r"\banaerobic\b") and "facultative" not in text_lc)
        or found(r"\banaerobically\b")
    )
    aerobic = (
        found(r"\bobligate aerobe\b")
        or (found(r"\baerobic\b") and "anaerobic" not in text_lc)
        or (found(r"\baerobically\b") and "anaerobically" not in text_lc)
    )

    candidates = (
        ("Facultative Anaerobe", found(r"facultative(ly)? anaerob")),
        ("Anaerobic", anaerobic),
        ("Aerobic", aerobic),
        (
            "Microaerophilic",
            "microaerophilic" in text_lc or "microaerophile" in text_lc,
        ),
        ("Capnophilic", "capnophilic" in text_lc or "co2" in text_lc),
    )
    for label, applies in candidates:
        if applies:
            _set_if_stronger(parsed, "Oxygen Requirement", label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Temperature unit: "c", "°c", "celsius", "centigrade", optionally preceded by
# "degrees". The trailing word boundary stops a bare "c" from matching the
# first letter of another word ("2-3 cm", "5 colonies", "3 catalase").
_TEMP_UNIT = r"(?:°\s*|degrees\s+)?(?:c|celsius|centigrade)\b"


def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Growth Temperature" from explicit temperatures such as
    "grows at 37 °C" or ranges such as "4–45 °C".

    The value is ALWAYS stored as "low//high". The first rule that matches
    decides and the rest are skipped:
    1. "between 30 and 37 °C"                 -> "30//37"
    2. a true range, "4-45 °C" / "30–37 °C"   -> "4//45"
    3. two or more temperatures in the text   -> min//max of all of them
    4. "grows / growth / optimum / optimal [at] 37 °C" -> "37//37"
    5. "grows at 37" with no unit             -> "37//37"
       (not when the number is a percentage, e.g. "grows at 6% nacl")
    6. any single "37 °C"                     -> "37//37"

    The value is written through _set_if_stronger, so an existing concrete
    value is kept.
    """

    def store(low, high) -> None:
        _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")

    # 1. "between X and Y" - the unit after the first number is optional.
    m_between = re.search(
        rf"between\s+(\d+)\s*(?:{_TEMP_UNIT})?\s*(?:and|to|-)\s*(\d+)",
        text_lc,
    )
    if m_between:
        store(m_between.group(1), m_between.group(2))
        return

    # 2. "X-Y <unit>"
    m_range = re.search(rf"(\d+)\s*[-–/]\s*(\d+)\s*{_TEMP_UNIT}", text_lc)
    if m_range:
        store(m_range.group(1), m_range.group(2))
        return

    # 3. Several separate temperatures: use the extremes.
    temps = [int(t) for t in re.findall(rf"(\d+)\s*{_TEMP_UNIT}", text_lc)]
    if len(temps) >= 2:
        store(min(temps), max(temps))
        return

    # 4. A single temperature introduced by a growth word.
    m_single = re.search(
        rf"(?:grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*{_TEMP_UNIT}",
        text_lc,
    )
    if m_single:
        store(m_single.group(1), m_single.group(1))
        return

    # 5. "grows at N" with no unit. A number that is part of a percentage
    #    ("6%", "6.5 %") is a concentration, not a temperature.
    m_simple_num = re.search(r"grows at (\d+)\b(?![.\d]*\s*%)", text_lc)
    if m_simple_num:
        store(m_simple_num.group(1), m_simple_num.group(1))
        return

    # 6. Any lone temperature with a unit.
    m_plain = re.search(rf"\b(\d+)\s*{_TEMP_UNIT}", text_lc)
    if m_plain:
        store(m_plain.group(1), m_plain.group(1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Canonical culture-medium name -> text patterns that indicate it.
# _parse_media() reports a medium when any of its patterns occurs as a
# substring of the text, so short patterns ("hk", "tma", "bhi", ...) also match
# inside longer words.
# NOTE(review): _parse_media() tests the patterns against `text_lc`; if that
# text is lower-cased (as the name suggests), the mixed-case patterns
# "Buffered Charcoal Yeast Extract Agar" and "HK Agar" can never match - confirm.
MEDIA_KEYWORDS: Dict[str, List[str]] = {
    "Blood Agar": [
        "blood agar",
        "blood-agar",
    ],
    "MacConkey Agar": [
        "macconkey agar",
        "mac conkey agar",
        "macconkey",
    ],
    "Chocolate Agar": [
        "chocolate agar",
        "chocolate-agar",
    ],
    "Nutrient Agar": [
        "nutrient agar",
        "nutrient-agar",
        "nut agar",
    ],
    "XLD Agar": [
        "xld agar",
        "xld",
    ],
    "TCBS Agar": [
        "tcbs agar",
        "tcbs",
    ],
    "ALOA": [
        "aloa agar",
        "aloa",
    ],
    "BCYE Agar": [
        "bcye agar",
        "bcye",
        "Buffered Charcoal Yeast Extract Agar",
        "buffered charcoal yeast extract agar"
    ],
    "MRS Agar": [
        "mrs agar",
    ],
    "Mannitol Salt Agar": [
        "msa agar",
        "ms agar",
    ],
    "Cycloserine Cefoxitin Fructose Agar": [
        "ccfa agar",
        "cycloserine cefoxitin fructose agar",
        "ccf agar",
    ],
    "Thayer Martin Agar": [
        "thayer martin agar",
        "tma agar",
        "tma",
    ],
    "Bordet-Gengou Agar": [
        "bordet gengou agar",
    ],
    "Cetrimide Agar": [
        "cetrimide agar",
    ],
    "Anaerobic Agar": [
        "anaerobic agar",
    ],
    # Text naming this medium also contains "blood agar", so "Blood Agar" is
    # reported alongside it.
    "Anaerobic Blood Agar": [
        "anaerobic blood agar",
    ],
    "Hektoen Enteric Agar": [
        "hektoen enteric agar",
        "HK Agar",
        "hk",
    ],
    "Tryptic Soy Agar": [
        "tryptic soy agar",
        "t-soy agar",
        "tsoy",
    ],
    "Brucella Agar": [
        "brucella agar",
    ],
    "Charcoal Agar": [
        "charcoal agar",
    ],
    "Yeast Extract Mannitol Agar": [
        "yeast extract mannitol agar",
    ],
    "Sabouraud Agar": [
        "sabouraud agar",
        "sabouraud dextrose agar",
    ],
    "BHI": [
        "bhi",
        "brain heart infusion agar",
        "brain heart infusion",
    ],
    "Columbia Blood Agar": [
        "columbia blood agar",
        "columbia agar",
        "columbia",
    ],
    "Lowenstein-Jensen Agar": [
        "lowenstein-jensen agar",
        "lowenstein jensen agar",
    ],
    "BSK Medium": [
        "bsk medium",
        "bsk",
        "bsk-ii medium",
        "bsk-h medium",
    ],
    "Ashby Agar": [
        "ashby agar",
        "ashby medium",
    ]
}
|
|
|
|
|
|
|
|
def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill "Media Grown On" with every medium from MEDIA_KEYWORDS that the text
    mentions.

    A medium counts as mentioned when any of its patterns is a substring of
    *text_lc*. The names are joined with "; " in MEDIA_KEYWORDS order and
    written through _set_if_stronger. Nothing is written when no medium is
    found.
    """
    mentioned: List[str] = [
        name
        for name, patterns in MEDIA_KEYWORDS.items()
        if any(pattern in text_lc for pattern in patterns)
    ]
    if not mentioned:
        return
    _set_if_stronger(parsed, "Media Grown On", "; ".join(mentioned))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Result tokens that may follow or precede a sugar name (one capturing group).
_SUGAR_PNV = r"(positive|negative|variable|pos|neg|\+|\-)"

# "does not" in its written forms: "does not", "doesn't", "does n't", "doesnt".
_SUGAR_DOES_NOT = r"does\s*(?:not|n['’]?t)"


def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill the sugar fermentation fields listed in SUGAR_FIELDS.

    Handles patterns like (applied in this order):
    - "glucose positive, mannitol negative" / "<sugar> is positive"
    - "<sugar> fermenter" (positive unless "non-<sugar> fermenter")
    - "ferments glucose, mannitol and sucrose but not lactose"
    - "does not ferment lactose or sucrose" (also "doesn't ferment ...")
    - "non-lactose fermenter" / "non-lactose fermenting"
    - "<sugar> fermentation positive", "positive for <sugar> fermentation",
      "<sugar> fermentation is positive"
    - global non-fermenter phrases -> all remaining sugars Negative
    - "asaccharolytic" -> all remaining sugars Negative
    - "all other sugars negative" -> all remaining sugars get that value

    Every write goes through _set_if_stronger, so a sugar keeps the first
    concrete value found for it and the global rules only fill the gaps.
    """
    # 1. "<sugar> <result>" and "<sugar> is <result>"
    for sugar_key, field in SUGAR_FIELDS.items():
        for pattern in (
            rf"{sugar_key}\s+{_SUGAR_PNV}",
            rf"{sugar_key}\s+is\s+{_SUGAR_PNV}",
        ):
            m = re.search(pattern, text_lc)
            if m:
                val = _value_from_pnv_token(m.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)

    # 2. "<sugar> fermenter" / "non-<sugar> fermenter"
    for sugar_key, field in SUGAR_FIELDS.items():
        negated = re.search(rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc)
        if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not negated:
            _set_if_stronger(parsed, field, "Positive")
        if negated:
            _set_if_stronger(parsed, field, "Negative")

    # 3. "ferments a, b and c but not d": sugars before "but not" are
    #    positive, sugars after it are negative.
    for m in re.finditer(r"ferments\s+([a-z0-9 ,;/&\-]+)", text_lc):
        halves = re.split(r"\bbut not\b", m.group(1), maxsplit=1)
        pos_part = halves[0]
        neg_part = halves[1] if len(halves) > 1 else ""
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", pos_part):
                _set_if_stronger(parsed, field, "Positive")
            if re.search(rf"\b{sugar_key}\b", neg_part):
                _set_if_stronger(parsed, field, "Negative")

    # 4. "does not ferment <list>" up to the next "but", ".", ";", "," or end.
    grouped_neg_pattern = re.compile(
        rf"{_SUGAR_DOES_NOT}\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
    )
    for m in grouped_neg_pattern.finditer(text_lc):
        seg = m.group(1)
        for sugar_key, field in SUGAR_FIELDS.items():
            if re.search(rf"\b{sugar_key}\b", seg):
                _set_if_stronger(parsed, field, "Negative")

    # 5. "does not ferment <sugar>" directly
    for sugar_key, field in SUGAR_FIELDS.items():
        if re.search(rf"{_SUGAR_DOES_NOT}\s+ferment\s+{sugar_key}\b", text_lc):
            _set_if_stronger(parsed, field, "Negative")

    # 6. "non-<sugar> fermenting / fermenter"
    for sugar_key, field in SUGAR_FIELDS.items():
        if re.search(rf"non[- ]{sugar_key}\s+ferment(ing|er)?", text_lc):
            _set_if_stronger(parsed, field, "Negative")

    # 7. "<sugar> fermentation ..." phrasings; the first that matches decides.
    for sugar_key, field in SUGAR_FIELDS.items():
        for pattern in (
            rf"{sugar_key}\s+fermentation[ \-]?{_SUGAR_PNV}",
            rf"{_SUGAR_PNV}\s+(?:for\s+)?{sugar_key}\s+fermentation",
            rf"{sugar_key}\s+fermentation\s+is\s+{_SUGAR_PNV}",
        ):
            m = re.search(pattern, text_lc)
            if m:
                val = _value_from_pnv_token(m.group(1))
                if val:
                    _set_if_stronger(parsed, field, val)
                break

    # 8. Global non-fermenter statements fill every sugar still undecided.
    if re.search(
        rf"{_SUGAR_DOES_NOT}\s+ferment\s+(carbohydrates|sugars)", text_lc
    ) or re.search(r"\bnon[- ]ferment(er|ing|ative)\b", text_lc):
        for field in SUGAR_FIELDS.values():
            _set_if_stronger(parsed, field, "Negative")

    # 9. Asaccharolytic organisms: same treatment.
    if (
        "asaccharolytic" in text_lc
        or "non-saccharolytic" in text_lc
        or "non saccharolytic" in text_lc
    ):
        for field in SUGAR_FIELDS.values():
            _set_if_stronger(parsed, field, "Negative")

    # 10. "all other sugars <result>" fills whatever is left.
    m_other = re.search(
        r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
        text_lc,
    )
    if m_other:
        val = _value_from_pnv_token(m_other.group(1))
        if val:
            for field in SUGAR_FIELDS.values():
                _set_if_stronger(parsed, field, val)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalise_colony_desc(desc: str) -> str: |
|
|
""" |
|
|
Take a raw colony descriptor and normalise into: |
|
|
"Smooth; Yellow; Opaque" etc. |
|
|
|
|
|
Tweaks: |
|
|
- Remove "-pigmented" → "yellow-pigmented" → "yellow" |
|
|
- Treat "and" like a separator for parts |
|
|
""" |
|
|
|
|
|
tmp = desc.replace("-pigmented", "") |
|
|
|
|
|
|
|
|
tmp = tmp.replace(" and ", ", ") |
|
|
|
|
|
parts = [s.strip() for s in re.split(r"[;,]", tmp) if s.strip()] |
|
|
pretty = "; ".join(p.capitalize() for p in parts) |
|
|
return pretty |
|
|
|
|
|
|
|
|
def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Very coarse extraction of "Colony Morphology".

    The phrasings below are tried in order; the first one that yields a
    non-empty normalised description is stored and ends the search:
    - "colonies are yellow, mucoid" / "colony is ..."
    - "colonies dry, white and irregular on nutrient agar"
    - "forming smooth, yellow-pigmented, opaque colonies"
    - "grey colonies", "large grey colonies" (no verb)

    The description is normalised with _normalise_colony_desc and written
    through _set_if_stronger.
    """
    # (pattern, index of the group that holds the description)
    phrasings = (
        (r"colon(y|ies)\s+(are|is)\s+([a-z0-9 ,;\-]+)", 3),
        (r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)", 1),
        (r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies", 2),
        (r"\b([a-z0-9 ,;\-]+?)\s+colonies\b", 1),
    )
    for pattern, group_index in phrasings:
        hit = re.search(pattern, text_lc)
        if hit is None:
            continue
        raw = hit.group(group_index).strip()
        if not raw:
            continue
        pretty = _normalise_colony_desc(raw)
        if pretty:
            _set_if_stronger(parsed, "Colony Morphology", pretty)
            return
|
|
|
|
|
def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
    """
    Apply fallback ("patch") rules on top of what the main parsers found.

    Scalar fields (Haemolysis, Haemolysis Type, Motility, Spore Formation,
    NaCl tolerance, Growth Temperature) are only written while they are
    missing or still equal to ``UNKNOWN``. The list-valued fields
    "Colony Morphology" and "Media Grown On" are merged with whatever is
    already stored, keeping first-seen order and dropping duplicates.

    Parameters:
        original_text: not used by any rule here; kept so the call signature
            stays stable for callers.
        text_lc: the text every pattern is matched against. All patterns are
            lower-case, so this is expected to be lower-cased already.
        parsed: field name -> value mapping, updated in place.

    Returns:
        The same ``parsed`` dict that was passed in.
    """

    def _pnv(x: str) -> Optional[str]:
        """Map a positive/negative/variable token to its canonical label."""
        x = x.strip().lower()
        if x in {"positive", "pos", "+", "strongly positive", "weakly positive"}:
            return "Positive"
        if x in {"negative", "neg", "-", "no"}:
            return "Negative"
        if x in {"variable", "var", "mixed"}:
            return "Variable"
        return None

    def _is_unknown(field: str) -> bool:
        """True when ``field`` is absent or still holds the UNKNOWN placeholder."""
        return parsed.get(field, UNKNOWN) == UNKNOWN

    def _fill(field: str, value: str) -> None:
        """Write ``value`` only if ``field`` has no real value yet."""
        if _is_unknown(field):
            parsed[field] = value

    def _merge_list(field: str, additions: List[str]) -> None:
        """Merge ``additions`` into the "; "-separated list stored in ``field``."""
        existing = parsed.get(field, "")
        merged: List[str] = []
        for entry in [e.strip() for e in existing.split(";")] + list(additions):
            # Skip empty fragments and the UNKNOWN placeholder so the result
            # never reads like "Unknown; Yellow".
            if entry and entry != UNKNOWN and entry not in merged:
                merged.append(entry)
        parsed[field] = "; ".join(merged)

    # Tokens accepted after "<test name> " in the simple "name value" rules.
    pnv_tokens = r"(positive|negative|variable|pos|neg|\+|\-)"

    # ------------------------------------------------------------------
    # Haemolysis type: alpha is checked before beta, so alpha wins when the
    # text mentions both. Both spellings (haemolysis / hemolysis) are
    # accepted, matching the non-haemolytic rule further down.
    # ------------------------------------------------------------------
    for names, label in (("alpha|α", "Alpha"), ("beta|β", "Beta")):
        hit = re.search(rf"({names})[-\s]*ha?emolysis", text_lc) or \
            re.search(rf"ha?emolysis type[: ]*({names})", text_lc)
        if hit:
            _fill("Haemolysis", "Positive")
            _fill("Haemolysis Type", label)

    # Gamma haemolysis and explicit "no haemolysis" / "non-haemolytic" both
    # mean a negative result with type "None".
    m_gamma = re.search(r"(gamma|γ)[-\s]*ha?emolysis", text_lc)
    m_none = re.search(r"(no ha?emolysis|non[- ]ha?emolytic)", text_lc)
    if m_gamma or m_none:
        _fill("Haemolysis", "Negative")
        _fill("Haemolysis Type", "None")

    # Plain "haemolysis positive/negative/..." statement.
    m_h = re.search(rf"ha?emolysis\s+{pnv_tokens}", text_lc)
    if m_h and _is_unknown("Haemolysis"):
        val = _pnv(m_h.group(1))
        if val:
            parsed["Haemolysis"] = val
            if val == "Positive" and _is_unknown("Haemolysis Type"):
                # Positive without a stated type: make sure the key exists.
                parsed["Haemolysis Type"] = UNKNOWN

    # ------------------------------------------------------------------
    # Motility / spore formation: "<name> <positive|negative|...>"
    # ------------------------------------------------------------------
    for phrase, field in (("motility", "Motility"), ("spore formation", "Spore Formation")):
        hit = re.search(rf"{phrase}\s+{pnv_tokens}", text_lc)
        if hit and _is_unknown(field):
            val = _pnv(hit.group(1))
            if val:
                parsed[field] = val

    # ------------------------------------------------------------------
    # NaCl tolerance: explicit value first, then "no growth", then "grows".
    # ------------------------------------------------------------------
    nacl_field = "NaCl Tolerant (>=6%)"

    if _is_unknown(nacl_field):
        # Whitespace before the value is optional here, so a bare "+" / "-"
        # only counts when it is followed by "ve", a non-alphanumeric
        # character or the end of the text. Without that check the hyphen in
        # "nacl-tolerant" would be read as a negative result.
        m_nacl = re.search(
            r"(?:nacl\s*(?:tolerant|tolerance)?|growth\s+in\s+6\%[\s]*nacl)"
            r"\s*(positive|negative|variable|pos|neg|[+\-](?=ve\b|[^a-z0-9]|$))",
            text_lc,
        )
        if m_nacl:
            val = _pnv(m_nacl.group(1))
            if val:
                parsed[nacl_field] = val

    if _is_unknown(nacl_field):
        # Negated phrasings are tested before the positive one so that
        # "does not grow in 6% nacl" is not mistaken for "grow(s) in 6% nacl".
        if re.search(
            r"(?:no\s+growth|(?:not|n't|unable\s+to|fails?\s+to)\s+grow)"
            r"\s+in\s+(?:>=)?\s*6\%?\s*nacl",
            text_lc,
        ):
            parsed[nacl_field] = "Negative"

    if _is_unknown(nacl_field):
        if re.search(r"grows?\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
            parsed[nacl_field] = "Positive"

    # ------------------------------------------------------------------
    # Growth temperature written as "low/high" or "low//high"; stored as
    # "low//high". The pattern accepts any slash-separated pair of 1-3 digit
    # numbers, so it is deliberately only a fallback.
    # ------------------------------------------------------------------
    m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
    if m_temp and _is_unknown("Growth Temperature"):
        parsed["Growth Temperature"] = f"{m_temp.group(1)}//{m_temp.group(2)}"

    # ------------------------------------------------------------------
    # Colony morphology: text following one of the trigger phrases is split
    # on ";", "," and "/" and merged into the existing list.
    # ------------------------------------------------------------------
    colony_triggers = (
        "colony morphology",
        "colonies are",
        "colonies appear",
        "colonies look",
        "colony appearance",
        "colony characteristics",
    )
    # The triggers are plain letters and spaces, so they can be joined into
    # a regex alternation without escaping.
    m_col = re.search(
        rf"(?:{'|'.join(colony_triggers)})[: ]+([a-z0-9 ,;/\-]+)",
        text_lc,
    )
    if m_col:
        parts = [x.strip() for x in re.split(r"[;,/]", m_col.group(1)) if x.strip()]
        # Single-character fragments are treated as noise.
        clean_desc = [p.capitalize() for p in parts if len(p) > 1]
        if clean_desc:
            _merge_list("Colony Morphology", clean_desc)

    # ------------------------------------------------------------------
    # Media: items after "grown on" are matched against MEDIA_KEYWORDS
    # (canonical media name -> substrings to look for).
    # ------------------------------------------------------------------
    mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
    if mm:
        raw_items = [x.strip() for x in re.split(r"[;,]", mm.group(1)) if x.strip()]

        detected_media: List[str] = []
        for item in raw_items:
            for media_name, patterns in MEDIA_KEYWORDS.items():
                if media_name not in detected_media and any(p in item for p in patterns):
                    detected_media.append(media_name)

        if detected_media:
            _merge_list("Media Grown On", detected_media)

    return parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_text_rules(text: str) -> Dict[str, Any]:
    """
    Main entry point for the rule-based core parser.

    The input is passed through ``_clean_text`` and lower-cased, then each
    field parser is run in a fixed order against that text, followed by
    ``_apply_patches``.

    The returned dict always contains:

    - "parsed_fields": the field -> value mapping collected so far
    - "source": the literal "rule_parser"
    - "raw": the input text ("" when ``text`` is falsy)

    If any parser raises, the fields gathered before the failure are still
    returned and an extra "error" entry holds "<ExceptionType>: <message>".
    """
    original = text if text else ""
    text_lc = _clean_text(original).lower()

    parsed: Dict[str, str] = {}
    failure = None

    # Top-level boundary: a failing rule must not lose the fields already
    # extracted, so the exception is captured and reported in the result.
    try:
        _parse_gram_and_shape(text_lc, parsed)
        _parse_haemolysis(text_lc, parsed)
        _parse_core_bool_tests(text_lc, parsed)
        _parse_motility_capsule_spores(text_lc, parsed)
        _parse_oxygen(text_lc, parsed)
        _parse_growth_temperature(text_lc, parsed)
        _parse_media(text_lc, parsed)
        _parse_sugars(text_lc, parsed)
        _parse_colony(text_lc, parsed)
        parsed = _apply_patches(original, text_lc, parsed)
    except Exception as exc:
        failure = f"{type(exc).__name__}: {exc}"

    outcome: Dict[str, Any] = {
        "parsed_fields": parsed,
        "source": "rule_parser",
        "raw": original,
    }
    if failure is not None:
        outcome["error"] = failure
    return outcome
|
|
|