Update engine/parser_rules.py
Browse files- engine/parser_rules.py +22 -1
engine/parser_rules.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
# ------------------------------------------------------------
|
| 3 |
# Rule-based core parser for microbiology descriptions.
|
| 4 |
#
|
| 5 |
-
# Stage 11F (Option A ranges + fixes):
|
| 6 |
# - Always store Growth Temperature as "low//high"
|
| 7 |
# • single: 37 → "37//37"
|
| 8 |
# • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
|
|
@@ -12,6 +12,9 @@
|
|
| 12 |
# - "aerobically" / "anaerobically" → Aerobic / Anaerobic
|
| 13 |
# - NaCl tolerance phrases improved
|
| 14 |
# - Colony morphology from "colonies dry, white and irregular on nutrient agar"
|
|
|
|
|
|
|
|
|
|
| 15 |
# ------------------------------------------------------------
|
| 16 |
|
| 17 |
from __future__ import annotations
|
|
@@ -210,6 +213,7 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 210 |
- Nitrate reduction text
|
| 211 |
- H2S production / non-production
|
| 212 |
- DNase universal coverage
|
|
|
|
| 213 |
"""
|
| 214 |
for field, keywords in CORE_BOOL_FIELDS.items():
|
| 215 |
for kw in keywords:
|
|
@@ -316,6 +320,14 @@ def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 316 |
if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
|
| 317 |
_set_if_stronger(parsed, "DNase", "Negative")
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# ------------------------------------------------------------
|
| 321 |
# Motility / Capsule / Spores
|
|
@@ -551,6 +563,7 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 551 |
- "ferments glucose, mannitol and sucrose but not lactose"
|
| 552 |
- "does not ferment lactose"
|
| 553 |
- "non-lactose fermenter"
|
|
|
|
| 554 |
- global non-fermenter phrases
|
| 555 |
"""
|
| 556 |
|
|
@@ -565,6 +578,14 @@ def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 565 |
if val:
|
| 566 |
_set_if_stronger(parsed, field, val)
|
| 567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
# 1) "ferments X, Y and Z but not A, B"
|
| 569 |
ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
|
| 570 |
for m in ferments_pattern.finditer(text_lc):
|
|
|
|
| 2 |
# ------------------------------------------------------------
|
| 3 |
# Rule-based core parser for microbiology descriptions.
|
| 4 |
#
|
| 5 |
+
# Stage 11F (Option A ranges + fixes) + Stage 11H additions:
|
| 6 |
# - Always store Growth Temperature as "low//high"
|
| 7 |
# • single: 37 → "37//37"
|
| 8 |
# • two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
|
|
|
|
| 12 |
# - "aerobically" / "anaerobically" → Aerobic / Anaerobic
|
| 13 |
# - NaCl tolerance phrases improved
|
| 14 |
# - Colony morphology from "colonies dry, white and irregular on nutrient agar"
|
| 15 |
+
# - NEW (11H):
|
| 16 |
+
# • "Gelatinase positive/negative" → Gelatin Hydrolysis Positive/Negative
|
| 17 |
+
# • "<sugar> fermenter" → <Sugar> Fermentation = Positive
|
| 18 |
# ------------------------------------------------------------
|
| 19 |
|
| 20 |
from __future__ import annotations
|
|
|
|
| 213 |
- Nitrate reduction text
|
| 214 |
- H2S production / non-production
|
| 215 |
- DNase universal coverage
|
| 216 |
+
- NEW (11H): explicit gelatinase → Gelatin Hydrolysis mapping
|
| 217 |
"""
|
| 218 |
for field, keywords in CORE_BOOL_FIELDS.items():
|
| 219 |
for kw in keywords:
|
|
|
|
| 320 |
if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
|
| 321 |
_set_if_stronger(parsed, "DNase", "Negative")
|
| 322 |
|
| 323 |
+
# --- NEW: Gelatinase → Gelatin Hydrolysis ---
|
| 324 |
+
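# Explicit mapping in case the generic keyword patterns miss it.
# NOTE(review): the trailing \b after "\+" / "\-" only matches when a word character follows the sign, so "gelatinase +" (followed by a space or end of text) is not matched, and "gelatinase-positive" matches the negative pattern through its hyphen — confirm and tighten.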
|
| 325 |
+
if re.search(r"\bgelatinase\s*(positive|pos|\+)\b", text_lc):
|
| 326 |
+
_set_if_stronger(parsed, "Gelatin Hydrolysis", "Positive")
|
| 327 |
+
|
| 328 |
+
if re.search(r"\bgelatinase\s*(negative|neg|\-)\b", text_lc):
|
| 329 |
+
_set_if_stronger(parsed, "Gelatin Hydrolysis", "Negative")
|
| 330 |
+
|
| 331 |
|
| 332 |
# ------------------------------------------------------------
|
| 333 |
# Motility / Capsule / Spores
|
|
|
|
| 563 |
- "ferments glucose, mannitol and sucrose but not lactose"
|
| 564 |
- "does not ferment lactose"
|
| 565 |
- "non-lactose fermenter"
|
| 566 |
+
- "<sugar> fermenter" (positive)
|
| 567 |
- global non-fermenter phrases
|
| 568 |
"""
|
| 569 |
|
|
|
|
| 578 |
if val:
|
| 579 |
_set_if_stronger(parsed, field, val)
|
| 580 |
|
| 581 |
+
# 0b) NEW: "<sugar> fermenter" → Positive, skipped when "non-<sugar> fermenter" (hyphen or space after "non") occurs anywhere in the text
|
| 582 |
+
for sugar_key, field in SUGAR_FIELDS.items():
|
| 583 |
+
# positive: "lactose fermenter"
|
| 584 |
+
if re.search(rf"\b{sugar_key}\s+fermenter\b", text_lc) and not re.search(
|
| 585 |
+
rf"\bnon[- ]{sugar_key}\s+fermenter\b", text_lc
|
| 586 |
+
):
|
| 587 |
+
_set_if_stronger(parsed, field, "Positive")
|
| 588 |
+
|
| 589 |
# 1) "ferments X, Y and Z but not A, B"
|
| 590 |
ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
|
| 591 |
for m in ferments_pattern.finditer(text_lc):
|