Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Nov 17, 2025

Commit

b6dd794

verified ·

1 Parent(s): db752eb

Update engine/parser_ext.py

Browse files

Files changed (1) hide show

engine/parser_ext.py +324 -111

engine/parser_ext.py CHANGED Viewed

@@ -1,19 +1,28 @@
 # engine/parser_ext.py
 # ------------------------------------------------------------
-# Extended test parser (Stage 11C)
 #
-# - Focuses on *extended* tests (disc tests, rare biochemicals, etc.)
-# - Uses extended_schema.json dynamically
-# - Ignores core DB fields (those are handled by parser_rules)
-# - Adds robust patterns for:
-#       CAMP, PYR, Optochin, Bacitracin, Novobiocin
 #
-# Returns:
-# {
-#   "parsed_fields": { ... },
-#   "source": "extended_parser",
-#   "raw": original_text
-# }
 # ------------------------------------------------------------
 from __future__ import annotations
@@ -23,6 +32,10 @@ import os
 import re
 from typing import Dict, Any, List
 EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
 ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
@@ -77,9 +90,146 @@ CORE_FIELDS = {
     "Inositol Fermentation",
 }
 # ------------------------------------------------------------
-# Helpers: load extended schema & alias maps
 # ------------------------------------------------------------
 def _load_extended_schema(path: str = EXTENDED_SCHEMA_PATH) -> Dict[str, Any]:
@@ -97,6 +247,7 @@ def _load_alias_map(path: str = ALIAS_MAP_PATH) -> Dict[str, str]:
     """
     alias_maps.json is assumed to be a simple dict like:
       { "Field:raw_value": "canonical_value", ... }
     We keep this optional and conservative.
     """
     if not os.path.exists(path):
@@ -126,32 +277,60 @@ def _apply_field_value_alias(field: str, value: str, alias_map: Dict[str, str])
 # ------------------------------------------------------------
-# Value normalisation helpers
 # ------------------------------------------------------------
 def _bool_from_tokens(tokens: List[str]) -> str:
     """
-    Map "positive/sensitive/susceptible" vs "negative/resistant"
-    into Positive / Negative where appropriate.
     """
     t = " ".join(tokens).lower()
     # Strong negative signals
     neg_tokens = [
-        "negative", "no", "not", "resistant", "no zone",
-        "no growth", "fails to", "does not"
     ]
     if any(nt in t for nt in neg_tokens):
         return "Negative"
     # Strong positive signals
     pos_tokens = [
-        "positive", "pos", "sensitive", "susceptible",
-        "clear zone", "zone of inhibition"
     ]
     if any(pt in t for pt in pos_tokens):
         return "Positive"
     return UNKNOWN
@@ -160,6 +339,7 @@ def _disc_result_from_phrase(phrase: str) -> str:
     For disc tests like Novobiocin / Optochin / Bacitracin, interpret:
     - 'sensitive', 'susceptible' as Positive
     - 'resistant', 'no zone' as Negative
     - default -> Unknown
     """
     ph = phrase.lower()
@@ -170,7 +350,6 @@ def _disc_result_from_phrase(phrase: str) -> str:
     if any(w in ph for w in ["sensitive", "susceptible", "zone of inhibition", "clear zone"]):
         return "Positive"
-    # If explicit 'positive'/'negative' appears, handle that
     if "positive" in ph:
         return "Positive"
     if "negative" in ph:
@@ -180,117 +359,150 @@ def _disc_result_from_phrase(phrase: str) -> str:
 # ------------------------------------------------------------
-# Core pattern logic for extended tests
 # ------------------------------------------------------------
 def _parse_disc_tests(text: str, parsed: Dict[str, str]) -> None:
     """
-    Handle disc tests:
       - Optochin
       - Bacitracin
       - Novobiocin
-    with phrasing like 'optochin sensitive', 'bacitracin resistant', etc.
     """
     lower = text.lower()
-    disc_fields = ["Optochin", "Bacitracin", "Novobiocin"]
-    for test_name in disc_fields:
         key = test_name.lower()
-        # Find segments surrounding the keyword
-        for m in re.finditer(rf"\b{re.escape(key)}\b[^\.,;]*", lower):
             segment = lower[m.start():m.end()]
             val = _disc_result_from_phrase(segment)
             if val != UNKNOWN:
-                parsed[test_name] = val
-        # Also handle "<test> test positive/negative"
-        for m in re.finditer(rf"\b{re.escape(key)}\s+test[^\.,;]*", lower):
             segment = lower[m.start():m.end()]
             val = _disc_result_from_phrase(segment)
             if val != UNKNOWN:
-                parsed[test_name] = val
-def _parse_simple_PNV_test(
-    text: str,
-    test_name: str,
-    parsed: Dict[str, str],
-    extra_keywords: List[str] | None = None,
-) -> None:
     """
-    Generic P/N/V parser for named tests (e.g. CAMP, PYR, Hippurate).
-    Looks for patterns like:
-      'CAMP positive', 'PYR test negative'
-    and maps to Positive / Negative / Variable.
     """
-    if extra_keywords is None:
-        extra_keywords = []
-    label = test_name.lower()
-    lower = text.lower()
-    # Basic patterns: "<name> positive/negative/variable"
-    pat_direct = rf"\b{re.escape(label)}\b[^\.,;]*"
-    for m in re.finditer(pat_direct, lower):
-        segment = lower[m.start():m.end()]
-        val = _bool_from_tokens(segment.split())
-        if val != UNKNOWN:
-            parsed[test_name] = val
-    # Patterns like "<name> test positive/negative"
-    pat_test = rf"\b{re.escape(label)}\s+test[^\.,;]*"
-    for m in re.finditer(pat_test, lower):
-        segment = lower[m.start():m.end()]
-        val = _bool_from_tokens(segment.split())
-        if val != UNKNOWN:
-            parsed[test_name] = val
-    # Extra synonyms if any (e.g. "CAMP reaction", "PYR activity")
-    for kw in extra_keywords:
-        k = kw.lower()
-        pat_kw = rf"\b{re.escape(k)}\b[^\.,;]*"
-        for m in re.finditer(pat_kw, lower):
-            segment = lower[m.start():m.end()]
-            val = _bool_from_tokens(segment.split())
-            if val != UNKNOWN:
-                parsed[test_name] = val
-def _parse_extended_from_schema(
     text: str,
     ext_schema: Dict[str, Any],
     alias_map: Dict[str, str],
     parsed: Dict[str, str],
 ) -> None:
     """
-    Generic extended parser driven by extended_schema.json.
-    For each field where value_type == "enum_PNV" and not in CORE_FIELDS:
-      - looks for '<field> positive/negative/variable' style patterns.
-      - applies alias map for (field, value).
     """
     lower = text.lower()
-    for field_name, meta in ext_schema.items():
-        if not isinstance(meta, dict):
-            continue
-        if meta.get("value_type") != "enum_PNV":
-            continue
-        if field_name in CORE_FIELDS:
-            # We never treat core DB tests as "extended"
-            continue
         label = field_name.lower()
-        # Very simple pattern: "<field_name> positive/negative/variable"
-        pat = rf"\b{re.escape(label)}\b[^\.,;]*"
-        for m in re.finditer(pat, lower):
-            segment = lower[m.start():m.end()]
-            val = _bool_from_tokens(segment.split())
-            if val == UNKNOWN:
-                continue
-            val = _apply_field_value_alias(field_name, val, alias_map)
-            parsed[field_name] = val
 # ------------------------------------------------------------
@@ -301,10 +513,13 @@ def parse_text_extended(text: str) -> Dict[str, Any]:
     """
     Parse extended-only tests from the description.
-    This is intentionally conservative:
-      - Only sets a field if reasonably confident from text
-      - Never overwrites core parser behaviour directly
-      - Plays nicely with alias maps and extended_schema
     Returns:
       {
@@ -313,31 +528,29 @@ def parse_text_extended(text: str) -> Dict[str, Any]:
         "raw": original_text
       }
     """
-    if not text:
         return {
             "parsed_fields": {},
             "source": "extended_parser",
-            "raw": text or "",
         }
     ext_schema = _load_extended_schema(EXTENDED_SCHEMA_PATH)
     alias_map = _load_alias_map(ALIAS_MAP_PATH)
     parsed: Dict[str, str] = {}
     # 1) Disc tests (Novobiocin / Optochin / Bacitracin) with rich language
-    _parse_disc_tests(text, parsed)
-    # 2) CAMP & PYR & Hippurate (if present in schema/gold tests)
-    _parse_simple_PNV_test(text, "CAMP", parsed, extra_keywords=["CAMP reaction"])
-    _parse_simple_PNV_test(text, "PYR", parsed, extra_keywords=["PYR activity"])
-    _parse_simple_PNV_test(text, "Hippurate Hydrolysis", parsed, extra_keywords=["hippurate"])
-    # 3) Any other enum_PNV extended tests from extended_schema.json
-    _parse_extended_from_schema(text, ext_schema, alias_map, parsed)
     return {
         "parsed_fields": parsed,
         "source": "extended_parser",
-        "raw": text,
     }

 # engine/parser_ext.py
 # ------------------------------------------------------------
+# Extended test parser — Stage 11G (Option A: high-recall, SOTA-style)
 #
+# Goals:
+#   - Focus ONLY on *extended* tests (NOT in the core DB schema).
+#   - Drive behaviour entirely from extended_schema.json
+#       • value_type == "enum_PNV"
+#       • honour per-field aliases
+#       • additional hard-coded microbiology aliases where useful
+#   - Aggressive, high-recall extraction:
+#       • "<test> positive/negative/variable"
+#       • "<test> test positive/negative"
+#       • disc tests: "novobiocin sensitive", "bacitracin resistant", etc.
+#       • "non-<test>" / "no <test>" / "non acid-fast" → Negative
+#       • "sensitive/susceptible" vs "resistant" → mapped to P/N
+#   - Never touch core DB fields (delegate to parser_rules).
+#   - Respect alias_maps.json for field/value canonicalisation.
 #
+# Output:
+#   {
+#     "parsed_fields": { field: value, ... },
+#     "source": "extended_parser",
+#     "raw": original_text
+#   }
 # ------------------------------------------------------------
 from __future__ import annotations
 import re
 from typing import Dict, Any, List
+# ------------------------------------------------------------
+# Paths & constants
+# ------------------------------------------------------------
 EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
 ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
     "Inositol Fermentation",
 }
# Hard-coded, domain-specific synonyms per extended field. They supplement
# the field name and the "aliases" array from extended_schema.json when the
# keyword list for a field is built.
EXTRA_FIELD_ALIASES: Dict[str, List[str]] = {
    # --- named tests / stains ---
    "Hippurate Hydrolysis": ["hippurate", "hippurate test"],
    "CAMP": ["camp test", "camp reaction"],
    "PYR": ["pyr test", "pyr reaction", "pyr activity"],
    "Acid Fast": ["acid-fast", "acid fast", "acid-fast stain", "acid fast stain"],
    "Gas Production": ["gas production", "gas-producing", "gas producing", "gas producer"],
    "Bile Solubility": ["bile solubility", "bile soluble"],
    "Bile Resistance": ["bile resistance", "bile resistant"],
    "Glucose Oxidation": ["oxidation of glucose", "glucose oxidation"],
    # --- carbohydrate fermentation phrasings ---
    "Mannose Fermentation": [
        "mannose positive", "mannose negative",
        "ferments mannose", "does not ferment mannose",
    ],
    "Fructose Fermentation": [
        "fructose positive", "fructose negative",
        "ferments fructose", "does not ferment fructose",
    ],
    "Inulin Fermentation": [
        "inulin positive", "inulin negative",
        "ferments inulin", "does not ferment inulin",
    ],
    "Glycerol Fermentation": [
        "glycerol positive", "glycerol negative",
        "ferments glycerol", "does not ferment glycerol",
    ],
    "Cellobiose Fermentation": [
        "cellobiose positive", "cellobiose negative",
        "ferments cellobiose", "does not ferment cellobiose",
    ],
    # --- hydrolysis / oxidation / utilisation ---
    "Casein Hydrolysis": ["caseinase", "casein hydrolysis"],
    "Tyrosine Hydrolysis": ["tyrosine hydrolysis"],
    "Iron Oxidation": ["iron oxidation"],
    "Sulfur Utilization": [
        "sulfur utilization", "sulphur utilization",
        "sulfur utilisation", "sulphur utilisation",
    ],
    "Antibiotic Resistance": [
        "antibiotic resistance", "antibiotic-resistant", "antibiotic resistant",
    ],
}

# Fields interpreted with the dedicated "sensitive / resistant" disc logic.
DISC_TEST_FIELDS = {"Novobiocin", "Bacitracin", "Optochin"}
+# ------------------------------------------------------------
+# Basic helpers
+# ------------------------------------------------------------
+def _clean_text(text: str) -> str:
+    """
+    Normalise a few unicode oddities and collapse whitespace.
+    We keep case sensitive content for matching, but most
+    logic will run on .lower() views.
+    Also:
+      - strip degree symbols (not vital for extended tests, but harmless)
+      - normalise subscript ₂ → 2 if ever encountered
+    """
+    if not text:
+        return ""
+    s = text.replace("°", "").replace("º", "")
+    s = s.replace("₂", "2")
+    # collapse whitespace
+    return " ".join(s.split())
+def _norm(s: str) -> str:
+    return s.strip().lower()
+def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
+    """
+    Write value to parsed[field] if:
+      - field not present, or
+      - we are replacing Unknown with a concrete value.
+    Extended parser only ever writes extended tests, so this is mostly
+    about avoiding weaker overwrites (e.g. UNKNOWN over Positive).
+    """
+    if not value:
+        return
+    if field not in parsed or parsed[field] == UNKNOWN:
+        parsed[field] = value
 # ------------------------------------------------------------
+# Loading schema & alias maps
 # ------------------------------------------------------------
 def _load_extended_schema(path: str = EXTENDED_SCHEMA_PATH) -> Dict[str, Any]:
     """
     alias_maps.json is assumed to be a simple dict like:
       { "Field:raw_value": "canonical_value", ... }
     We keep this optional and conservative.
     """
     if not os.path.exists(path):
 # ------------------------------------------------------------
+# Value → P/N/V logic
 # ------------------------------------------------------------
 def _bool_from_tokens(tokens: List[str]) -> str:
     """
+    Map text tokens into Positive / Negative / Variable.
+    This is intentionally high-recall and slightly aggressive.
+    We consider a phrase "negative" if any strong negative token appears,
+    and "positive" if any strong positive token appears, with a bias:
+      - negative has priority when both appear.
     """
     t = " ".join(tokens).lower()
     # Strong negative signals
     neg_tokens = [
+        "negative",
+        "neg",
+        "-",
+        "no",
+        "not",
+        "non",
+        "resistant",
+        "no zone",
+        "no growth",
+        "fails to",
+        "does not",
+        "doesn't",
+        "without",
+        "lacks",
+        "absent",
     ]
     if any(nt in t for nt in neg_tokens):
         return "Negative"
     # Strong positive signals
     pos_tokens = [
+        "positive",
+        "pos",
+        "+",
+        "sensitive",
+        "susceptible",
+        "clear zone",
+        "zone of inhibition",
+        "with growth",
     ]
     if any(pt in t for pt in pos_tokens):
         return "Positive"
+    # Variable
+    var_tokens = ["variable", "var", "v"]
+    if any(vt in t for vt in var_tokens):
+        return "Variable"
     return UNKNOWN
     For disc tests like Novobiocin / Optochin / Bacitracin, interpret:
     - 'sensitive', 'susceptible' as Positive
     - 'resistant', 'no zone' as Negative
+    - explicit 'positive/negative' if present
     - default -> Unknown
     """
     ph = phrase.lower()
     if any(w in ph for w in ["sensitive", "susceptible", "zone of inhibition", "clear zone"]):
         return "Positive"
     if "positive" in ph:
         return "Positive"
     if "negative" in ph:
 # ------------------------------------------------------------
+# Disc test parsing
 # ------------------------------------------------------------
 def _parse_disc_tests(text: str, parsed: Dict[str, str]) -> None:
     """
+    Handle disc tests explicitly:
       - Optochin
       - Bacitracin
       - Novobiocin
+    We look for segments like:
+      'optochin sensitive', 'bacitracin resistant', 'novobiocin sensitive',
+      '<test> test positive', etc.
     """
     lower = text.lower()
+    for test_name in DISC_TEST_FIELDS:
         key = test_name.lower()
+        # Segments like "optochin sensitive..."
+        for m in re.finditer(rf"\b{re.escape(key)}\b[^\.;,\n]*", lower):
             segment = lower[m.start():m.end()]
             val = _disc_result_from_phrase(segment)
             if val != UNKNOWN:
+                _set_if_stronger(parsed, test_name, val)
+        # Segments like "optochin test positive..."
+        for m in re.finditer(rf"\b{re.escape(key)}\s+test[^\.;,\n]*", lower):
             segment = lower[m.start():m.end()]
             val = _disc_result_from_phrase(segment)
             if val != UNKNOWN:
+                _set_if_stronger(parsed, test_name, val)
+# ------------------------------------------------------------
+# Schema-driven extended parsing
+# ------------------------------------------------------------
+def _build_field_keywords(ext_schema: Dict[str, Any]) -> Dict[str, List[str]]:
     """
+    Build a mapping:
+      field_name -> list of keywords/synonyms (lowercased)
+    Sources:
+      - field name itself
+      - 'aliases' array in extended_schema.json
+      - EXTRA_FIELD_ALIASES hard-coded here
     """
+    field_kw: Dict[str, List[str]] = {}
+    for field_name, meta in ext_schema.items():
+        if not isinstance(meta, dict):
+            continue
+        if meta.get("value_type") != "enum_PNV":
+            continue
+        if field_name in CORE_FIELDS:
+            # Extended parser never touches core DB fields.
+            continue
+        kws: List[str] = []
+        # Canonical field label
+        kws.append(field_name)
+        # schema-defined aliases
+        aliases = meta.get("aliases", [])
+        if isinstance(aliases, list):
+            for a in aliases:
+                if isinstance(a, str) and a.strip():
+                    kws.append(a)
+        # extra hard-coded aliases
+        extra = EXTRA_FIELD_ALIASES.get(field_name, [])
+        for a in extra:
+            if isinstance(a, str) and a.strip():
+                kws.append(a)
+        # de-duplicate & normalise spacing
+        normed: List[str] = []
+        seen = set()
+        for k in kws:
+            kk = " ".join(k.strip().split())
+            if kk and kk.lower() not in seen:
+                seen.add(kk.lower())
+                normed.append(kk)
+        field_kw[field_name] = normed
+    return field_kw
def _parse_schema_enum_pnv(
    text: str,
    ext_schema: Dict[str, Any],
    alias_map: Dict[str, str],
    parsed: Dict[str, str],
) -> None:
    """
    High-recall parsing for enum_PNV extended tests driven by schema.

    The fields and their keywords come from _build_field_keywords. For
    each field, and for each keyword/synonym in turn:
      - "<kw> ..."       (phrase up to the next '.', ';', ',' or newline)
      - "<kw> test ..."
    Every matched phrase is classified by _bool_from_tokens. Results other
    than UNKNOWN are canonicalised with _apply_field_value_alias and
    written through _set_if_stronger.

    After the keyword scan, an explicit negation immediately before the
    label or any keyword ("non-<kw>", "non <kw>", "no <kw>") records
    Negative, unless a concrete value has already been stored.

    *parsed* is updated in place; nothing is returned.
    """
    lower = text.lower()
    field_keywords = _build_field_keywords(ext_schema)

    for field_name, keywords in field_keywords.items():
        keys = [re.escape(kw.lower()) for kw in keywords]

        # NOTE(review): the classified phrase includes the keyword itself, so
        # an alias that contains a result word (e.g. "bile resistant",
        # "mannose negative") feeds that word into _bool_from_tokens.
        # Confirm this is intended for every such alias.
        for key in keys:
            patterns = (
                rf"\b{key}\b[^\.;,\n]*",       # "<kw> ..."
                rf"\b{key}\s+test[^\.;,\n]*",  # "<kw> test ..."
            )
            for pattern in patterns:
                for m in re.finditer(pattern, lower):
                    val = _bool_from_tokens(m.group(0).split())
                    if val != UNKNOWN:
                        val = _apply_field_value_alias(field_name, val, alias_map)
                        _set_if_stronger(parsed, field_name, val)

        # Explicit negation in front of the test name. Checked against the
        # canonical label AND every keyword: texts usually use an alias
        # spelling ("non acid-fast"), which the label alone ("acid fast")
        # would miss.
        negation_keys = [re.escape(field_name.lower())]
        negation_keys.extend(k for k in keys if k not in negation_keys)
        negated = any(
            re.search(rf"\bnon[- ]{key}\b", lower)
            or re.search(rf"\bno\s+{key}\b", lower)
            for key in negation_keys
        )
        if negated:
            val = _apply_field_value_alias(field_name, "Negative", alias_map)
            _set_if_stronger(parsed, field_name, val)
 # ------------------------------------------------------------
     """
     Parse extended-only tests from the description.
+    Behaviour:
+      - If text empty → return empty parsed_fields.
+      - Loads extended_schema.json + alias_maps.json
+      - Runs:
+          1) disc test parsing (Optochin / Bacitracin / Novobiocin)
+          2) generic schema-driven enum_PNV parsing for all ext fields
+      - Never touches core DB fields.
     Returns:
       {
         "raw": original_text
       }
     """
+    original = text or ""
+    if not original.strip():
         return {
             "parsed_fields": {},
             "source": "extended_parser",
+            "raw": original,
         }
+    text_clean = _clean_text(original)
     ext_schema = _load_extended_schema(EXTENDED_SCHEMA_PATH)
     alias_map = _load_alias_map(ALIAS_MAP_PATH)
     parsed: Dict[str, str] = {}
     # 1) Disc tests (Novobiocin / Optochin / Bacitracin) with rich language
+    _parse_disc_tests(text_clean, parsed)
+    # 2) All other enum_PNV extended tests from extended_schema.json
+    _parse_schema_enum_pnv(text_clean, ext_schema, alias_map, parsed)
     return {
         "parsed_fields": parsed,
         "source": "extended_parser",
+        "raw": original,
     }