Update engine/parser_rules.py
Browse files- engine/parser_rules.py +99 -26
engine/parser_rules.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
# ------------------------------------------------------------
|
| 3 |
# Rule-based core parser for microbiology descriptions.
|
| 4 |
#
|
| 5 |
-
# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J +
|
| 6 |
#
|
| 7 |
# - Always store Growth Temperature as "low//high"
|
| 8 |
# • single: 37 → "37//37"
|
|
@@ -26,7 +26,9 @@
|
|
| 26 |
# • grouped "does not ferment lactose and sucrose"
|
| 27 |
# (without nuking glucose in "but glucose positive")
|
| 28 |
# • global "non-fermenter" → all sugars Negative (Unknown-only)
|
| 29 |
-
# β’
|
|
|
|
|
|
|
| 30 |
#
|
| 31 |
# - Core tests:
|
| 32 |
# • "<kw> positive/negative"
|
|
@@ -37,14 +39,19 @@
|
|
| 37 |
# • "<kw> test reaction is positive/negative"
|
| 38 |
# • "ONPG is negative" handled via core patterns
|
| 39 |
# • "H2S production is positive/negative"
|
| 40 |
-
# β’
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
#
|
| 42 |
# - Decarboxylases:
|
| 43 |
# • "all decarboxylases negative/positive"
|
| 44 |
# → Lysine / Ornithine / Arginine dihydrolase set accordingly
|
| 45 |
# (Unknown-only; explicit values can override later)
|
| 46 |
-
# β’ "lysine, ornithine and arginine
|
| 47 |
-
#
|
|
|
|
| 48 |
#
|
| 49 |
# - Capsule / Motility:
|
| 50 |
# • "capsule present"/"capsule is present" → Capsule Positive
|
|
@@ -55,6 +62,10 @@
|
|
| 55 |
# - Gelatin / Esculin:
|
| 56 |
# • "gelatin positive/negative" → Gelatin Hydrolysis
|
| 57 |
# • "esculin positive/negative" → Esculin Hydrolysis
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# ------------------------------------------------------------
|
| 59 |
|
| 60 |
from __future__ import annotations
|
|
@@ -111,11 +122,12 @@ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
|
|
| 111 |
"Lipase Test": ["lipase"],
|
| 112 |
"Nitrate Reduction": ["nitrate reduction", "nitrate"],
|
| 113 |
"NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
|
|
|
|
| 114 |
"Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
|
| 115 |
"Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
|
| 116 |
"Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
|
|
|
|
| 117 |
"Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
|
| 118 |
-
# Esculin Hydrolysis: also match plain "esculin"
|
| 119 |
"Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
|
| 120 |
}
|
| 121 |
|
|
@@ -204,10 +216,14 @@ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 204 |
_set_if_stronger(parsed, "Gram Stain", "Variable")
|
| 205 |
|
| 206 |
# Shape
|
| 207 |
-
# Prefer "short rods" over generic rods
|
| 208 |
if "short rods" in text_lc:
|
| 209 |
_set_if_stronger(parsed, "Shape", "Short Rods")
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
# Cocci and variants (diplococci, tetracocci, etc.)
|
| 212 |
if re.search(r"\bcocci\b", text_lc):
|
| 213 |
_set_if_stronger(parsed, "Shape", "Cocci")
|
|
@@ -282,7 +298,10 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 282 |
- gelatinase / gelatin → Gelatin Hydrolysis
|
| 283 |
- esculin → Esculin Hydrolysis
|
| 284 |
- grouped MR/VP: "MR and VP negative"
|
| 285 |
-
- decarboxylase global
|
|
|
|
|
|
|
|
|
|
| 286 |
"""
|
| 287 |
for field, keywords in CORE_BOOL_FIELDS.items():
|
| 288 |
for kw in keywords:
|
|
@@ -470,9 +489,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 470 |
_assign_mr_vp(name1)
|
| 471 |
_assign_mr_vp(name2)
|
| 472 |
|
| 473 |
-
# --- Decarboxylases global
|
| 474 |
-
|
| 475 |
-
# 1) "all decarboxylases negative/positive"
|
| 476 |
m_all_decarb = re.search(
|
| 477 |
r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
|
| 478 |
text_lc,
|
|
@@ -483,24 +500,62 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 483 |
for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
|
| 484 |
_set_if_stronger(parsed, f, val)
|
| 485 |
|
| 486 |
-
#
|
| 487 |
-
#
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
seg = m.group(1)
|
| 494 |
val = _value_from_pnv_token(m.group(2))
|
| 495 |
if not val:
|
| 496 |
continue
|
| 497 |
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
|
| 505 |
|
| 506 |
# ------------------------------------------------------------
|
|
@@ -755,7 +810,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 755 |
- "<sugar> is positive/negative"
|
| 756 |
- "<sugar> fermentation is positive/negative"
|
| 757 |
- global non-fermenter phrases
|
| 758 |
-
-
|
|
|
|
| 759 |
"""
|
| 760 |
|
| 761 |
# 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
|
|
@@ -812,6 +868,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 812 |
_set_if_stronger(parsed, field, "Negative")
|
| 813 |
|
| 814 |
# 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
|
|
|
|
|
|
|
| 815 |
grouped_neg_pattern = re.compile(
|
| 816 |
r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
|
| 817 |
)
|
|
@@ -874,6 +932,8 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 874 |
continue
|
| 875 |
|
| 876 |
# 6) Global non-fermenter phrases
|
|
|
|
|
|
|
| 877 |
if (
|
| 878 |
re.search(
|
| 879 |
r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
|
|
@@ -894,9 +954,22 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 894 |
if field not in parsed or parsed[field] == UNKNOWN:
|
| 895 |
_set_if_stronger(parsed, field, "Negative")
|
| 896 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
|
| 898 |
# ------------------------------------------------------------
|
| 899 |
-
# Colony morphology
|
| 900 |
# ------------------------------------------------------------
|
| 901 |
|
| 902 |
def _normalise_colony_desc(desc: str) -> str:
|
|
|
|
| 2 |
# ------------------------------------------------------------
|
| 3 |
# Rule-based core parser for microbiology descriptions.
|
| 4 |
#
|
| 5 |
+
# Stage 11F (Option A ranges + fixes) + 11H + 11I + 11J + 11L + 11M:
|
| 6 |
#
|
| 7 |
# - Always store Growth Temperature as "low//high"
|
| 8 |
# • single: 37 → "37//37"
|
|
|
|
| 26 |
# • grouped "does not ferment lactose and sucrose"
|
| 27 |
# (without nuking glucose in "but glucose positive")
|
| 28 |
# • global "non-fermenter" → all sugars Negative (Unknown-only)
|
| 29 |
+
# • "asaccharolytic" → all sugars Negative (Unknown-only)
|
| 30 |
+
# • NEW (11M): "all other sugars negative" → all remaining sugars Negative
|
| 31 |
+
# (Unknown-only; no hard rewrite)
|
| 32 |
#
|
| 33 |
# - Core tests:
|
| 34 |
# • "<kw> positive/negative"
|
|
|
|
| 39 |
# • "<kw> test reaction is positive/negative"
|
| 40 |
# • "ONPG is negative" handled via core patterns
|
| 41 |
# • "H2S production is positive/negative"
|
| 42 |
+
# • "MR and VP negative/positive" → both set
|
| 43 |
+
# • NEW (11M): grouped phrases like
|
| 44 |
+
# "gelatin and esculin hydrolysis negative"
|
| 45 |
+
# "lysine, ornithine and arginine negative"
|
| 46 |
+
# → all mentioned tests / sugars set to the given value
|
| 47 |
#
|
| 48 |
# - Decarboxylases:
|
| 49 |
# • "all decarboxylases negative/positive"
|
| 50 |
# → Lysine / Ornithine / Arginine dihydrolase set accordingly
|
| 51 |
# (Unknown-only; explicit values can override later)
|
| 52 |
+
# • list-style "lysine, ornithine and arginine negative" now handled
|
| 53 |
+
# by the universal grouped-phrase logic (old decarb-specific
|
| 54 |
+
# list logic removed).
|
| 55 |
#
|
| 56 |
# - Capsule / Motility:
|
| 57 |
# • "capsule present"/"capsule is present" → Capsule Positive
|
|
|
|
| 62 |
# - Gelatin / Esculin:
|
| 63 |
# • "gelatin positive/negative" → Gelatin Hydrolysis
|
| 64 |
# • "esculin positive/negative" → Esculin Hydrolysis
|
| 65 |
+
# • grouped "gelatin and esculin hydrolysis negative" handled
|
| 66 |
+
#
|
| 67 |
+
# - Shape:
|
| 68 |
+
# • "coccobacilli / coccobacillus" → Shape = Short Rods
|
| 69 |
# ------------------------------------------------------------
|
| 70 |
|
| 71 |
from __future__ import annotations
|
|
|
|
| 122 |
"Lipase Test": ["lipase"],
|
| 123 |
"Nitrate Reduction": ["nitrate reduction", "nitrate"],
|
| 124 |
"NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
|
| 125 |
+
# Decarboxylases (also match plain amino acid words)
|
| 126 |
"Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb", "lysine"],
|
| 127 |
"Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb", "ornithine"],
|
| 128 |
"Arginine dihydrolase": ["arginine dihydrolase", "arginine decarboxylase", "arginine"],
|
| 129 |
+
# Gelatin / Esculin
|
| 130 |
"Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase", "gelatin"],
|
|
|
|
| 131 |
"Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
|
| 132 |
}
|
| 133 |
|
|
|
|
| 216 |
_set_if_stronger(parsed, "Gram Stain", "Variable")
|
| 217 |
|
| 218 |
# Shape
|
| 219 |
+
# Prefer "short rods" / coccobacilli over generic rods
|
| 220 |
if "short rods" in text_lc:
|
| 221 |
_set_if_stronger(parsed, "Shape", "Short Rods")
|
| 222 |
|
| 223 |
+
# NEW: coccobacilli → Short Rods
|
| 224 |
+
if re.search(r"\bcoccobacill(?:us|i)\b", text_lc):
|
| 225 |
+
_set_if_stronger(parsed, "Shape", "Short Rods")
|
| 226 |
+
|
| 227 |
# Cocci and variants (diplococci, tetracocci, etc.)
|
| 228 |
if re.search(r"\bcocci\b", text_lc):
|
| 229 |
_set_if_stronger(parsed, "Shape", "Cocci")
|
|
|
|
| 298 |
- gelatinase / gelatin → Gelatin Hydrolysis
|
| 299 |
- esculin → Esculin Hydrolysis
|
| 300 |
- grouped MR/VP: "MR and VP negative"
|
| 301 |
+
- decarboxylase global phrases
|
| 302 |
+
- NEW: generic grouped phrases
|
| 303 |
+
"gelatin and esculin hydrolysis negative"
|
| 304 |
+
"lysine, ornithine and arginine negative"
|
| 305 |
"""
|
| 306 |
for field, keywords in CORE_BOOL_FIELDS.items():
|
| 307 |
for kw in keywords:
|
|
|
|
| 489 |
_assign_mr_vp(name1)
|
| 490 |
_assign_mr_vp(name2)
|
| 491 |
|
| 492 |
+
# --- Decarboxylases global "all decarboxylases negative/positive" ---
|
|
|
|
|
|
|
| 493 |
m_all_decarb = re.search(
|
| 494 |
r"all\s+decarboxylases?\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
|
| 495 |
text_lc,
|
|
|
|
| 500 |
for f in ("Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase"):
|
| 501 |
_set_if_stronger(parsed, f, val)
|
| 502 |
|
| 503 |
+
# --- NEW (11M): Generic grouped list logic for tests & sugars ---
|
| 504 |
+
#
|
| 505 |
+
# Handles things like:
|
| 506 |
+
# "gelatin and esculin hydrolysis negative"
|
| 507 |
+
# "lysine, ornithine and arginine negative"
|
| 508 |
+
# "indole, urease and citrate positive"
|
| 509 |
+
# "raffinose and inositol negative"
|
| 510 |
+
#
|
| 511 |
+
# Strategy:
|
| 512 |
+
# 1) Capture a list segment and a trailing P/N token.
|
| 513 |
+
# 2) Only keep it if the segment contains at least one known
|
| 514 |
+
# CORE_BOOL_FIELDS keyword or sugar name.
|
| 515 |
+
# 3) Apply the value to all matching tests + sugars in that segment.
|
| 516 |
+
#
|
| 517 |
+
grouped_tests_pattern = re.compile(
|
| 518 |
+
r"([a-z0-9 ,/&\-]+?)\s+"
|
| 519 |
+
r"(?:hydrolysis|decarboxylases?|dihydrolases?|tests?|reactions?)?"
|
| 520 |
+
r"\s*(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)"
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
for m in grouped_tests_pattern.finditer(text_lc):
|
| 524 |
seg = m.group(1)
|
| 525 |
val = _value_from_pnv_token(m.group(2))
|
| 526 |
if not val:
|
| 527 |
continue
|
| 528 |
|
| 529 |
+
seg_lc = seg.lower()
|
| 530 |
+
|
| 531 |
+
# Quick filter: does this segment contain any known test/sugar keyword?
|
| 532 |
+
has_any = False
|
| 533 |
+
|
| 534 |
+
for _, keywords in CORE_BOOL_FIELDS.items():
|
| 535 |
+
if any(re.search(rf"\b{re.escape(kw)}\b", seg_lc) for kw in keywords):
|
| 536 |
+
has_any = True
|
| 537 |
+
break
|
| 538 |
+
|
| 539 |
+
if not has_any:
|
| 540 |
+
for sugar_key in SUGAR_FIELDS.keys():
|
| 541 |
+
if re.search(rf"\b{sugar_key}\b", seg_lc):
|
| 542 |
+
has_any = True
|
| 543 |
+
break
|
| 544 |
+
|
| 545 |
+
if not has_any:
|
| 546 |
+
continue # ignore segments unrelated to tests/sugars
|
| 547 |
+
|
| 548 |
+
# Apply to all matching core boolean tests
|
| 549 |
+
for field, keywords in CORE_BOOL_FIELDS.items():
|
| 550 |
+
for kw in keywords:
|
| 551 |
+
if re.search(rf"\b{re.escape(kw)}\b", seg_lc):
|
| 552 |
+
_set_if_stronger(parsed, field, val)
|
| 553 |
+
break
|
| 554 |
+
|
| 555 |
+
# Apply to all matching sugars
|
| 556 |
+
for sugar_key, field in SUGAR_FIELDS.items():
|
| 557 |
+
if re.search(rf"\b{sugar_key}\b", seg_lc):
|
| 558 |
+
_set_if_stronger(parsed, field, val)
|
| 559 |
|
| 560 |
|
| 561 |
# ------------------------------------------------------------
|
|
|
|
| 810 |
- "<sugar> is positive/negative"
|
| 811 |
- "<sugar> fermentation is positive/negative"
|
| 812 |
- global non-fermenter phrases
|
| 813 |
+
- "asaccharolytic" → all sugars Negative (Unknown-only)
|
| 814 |
+
- "all other sugars negative" → remaining sugars Negative
|
| 815 |
"""
|
| 816 |
|
| 817 |
# 0) Simple "<sugar> positive/negative" and "<sugar> is positive"
|
|
|
|
| 868 |
_set_if_stronger(parsed, field, "Negative")
|
| 869 |
|
| 870 |
# 2) Grouped "does not ferment X, Y and Z" (stop at but/punctuation)
|
| 871 |
+
# Prevents glucose being accidentally marked negative in:
|
| 872 |
+
# "does not ferment lactose or sucrose, but glucose fermentation is positive"
|
| 873 |
grouped_neg_pattern = re.compile(
|
| 874 |
r"does\s+(?:not|n't)\s+ferment\s+([a-z0-9 ,;/&\-]+?)(?:\s+but\b|\.|;|,|$)"
|
| 875 |
)
|
|
|
|
| 932 |
continue
|
| 933 |
|
| 934 |
# 6) Global non-fermenter phrases
|
| 935 |
+
# e.g. "non-fermenter", "does not ferment sugars"
|
| 936 |
+
# → set all sugars Negative *unless* already set by a more specific rule.
|
| 937 |
if (
|
| 938 |
re.search(
|
| 939 |
r"does\s+(?:not|n't)\s+ferment\s+(carbohydrates|sugars)", text_lc
|
|
|
|
| 954 |
if field not in parsed or parsed[field] == UNKNOWN:
|
| 955 |
_set_if_stronger(parsed, field, "Negative")
|
| 956 |
|
| 957 |
+
# 8) NEW (11M): "all other sugars negative/positive"
|
| 958 |
+
# we treat "other sugars" as "any sugar not already explicitly set"
|
| 959 |
+
m_other = re.search(
|
| 960 |
+
r"all\s+other\s+sugars\s+(?:are\s+)?(positive|negative|variable|pos|neg|\+|\-)",
|
| 961 |
+
text_lc,
|
| 962 |
+
)
|
| 963 |
+
if m_other:
|
| 964 |
+
val = _value_from_pnv_token(m_other.group(1))
|
| 965 |
+
if val:
|
| 966 |
+
for field in SUGAR_FIELDS.values():
|
| 967 |
+
if field not in parsed or parsed[field] == UNKNOWN:
|
| 968 |
+
_set_if_stronger(parsed, field, val)
|
| 969 |
+
|
| 970 |
|
| 971 |
# ------------------------------------------------------------
|
| 972 |
+
# Colony morphology (coarse, optional)
|
| 973 |
# ------------------------------------------------------------
|
| 974 |
|
| 975 |
def _normalise_colony_desc(desc: str) -> str:
|