Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Nov 16, 2025

Commit

b1deeea

verified ·

1 Parent(s): 8784ac2

Update engine/parser_ext.py

Browse files

Files changed (1) hide show

engine/parser_ext.py +263 -218

engine/parser_ext.py CHANGED Viewed

@@ -1,27 +1,19 @@
 # engine/parser_ext.py
 # ------------------------------------------------------------
-# Extended parser for non-core / specialty tests.
 #
-# Focus: fields that are NOT part of the strict core schema
-# and usually live in data/extended_schema.json, e.g.:
 #
-#   - CAMP, PYR, Hippurate Hydrolysis
-#   - Bile Solubility, Bile Resistance
-#   - Optochin, Bacitracin, Novobiocin (disc tests)
-#   - Odour
-#   - NaCl Tolerant (>=10%), NaCl Tolerant (>=15%)
-#   - Lipase, Lecithinase, etc. IF present in extended_schema
-#
-# It returns:
-#   {
-#     "parsed_fields": { field: value, ... },
-#     "source": "extended_parser",
-#     "raw": original_text,
-#     "error": optional_error_message
-#   }
-#
-# Stage 11B: safer alias usage, better coverage for disc tests
-# and extended biochemical tests, without touching core schema.
 # ------------------------------------------------------------
 from __future__ import annotations
@@ -31,268 +23,321 @@ import os
 import re
 from typing import Dict, Any, List
 UNKNOWN = "Unknown"
-EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
-ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
 # ------------------------------------------------------------
-# Loading helpers
 # ------------------------------------------------------------
-def _load_extended_schema() -> Dict[str, Any]:
-    if not os.path.exists(EXTENDED_SCHEMA_PATH):
         return {}
     try:
-        with open(EXTENDED_SCHEMA_PATH, "r", encoding="utf-8") as f:
             obj = json.load(f)
-            return obj if isinstance(obj, dict) else {}
     except Exception:
         return {}
-def _load_alias_maps() -> Dict[str, str]:
-    if not os.path.exists(ALIAS_MAP_PATH):
         return {}
     try:
-        with open(ALIAS_MAP_PATH, "r", encoding="utf-8") as f:
             obj = json.load(f)
-            return obj if isinstance(obj, dict) else {}
     except Exception:
         return {}
-def _apply_alias(field: str, value: str, alias_maps: Dict[str, str]) -> str:
     """
-    Apply alias maps in a SAFE way:
-    - key format expected: "Field:Value" -> "NormalizedValue"
-    - if no match, return original value
-    - we NEVER map values to some other field name here.
     """
     key = f"{field}:{value}"
-    mapped = alias_maps.get(key)
-    if mapped:
-        return mapped
     return value
 # ------------------------------------------------------------
-# Generic helpers
 # ------------------------------------------------------------
-def _clean_text(text: str) -> str:
-    return " ".join(text.split())
-def _norm(s: str) -> str:
-    return s.strip().lower()
-def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
-    if not value:
-        return
-    if field not in parsed or parsed[field] == UNKNOWN:
-        parsed[field] = value
-def _value_from_pnv_context(segment: str) -> str | None:
-    seg = _norm(segment)
-    if seg in ["positive", "pos", "+"]:
-        return "Positive"
-    if seg in ["negative", "neg", "-"]:
         return "Negative"
-    if seg in ["variable", "var", "v"]:
-        return "Variable"
-    return None
-# ------------------------------------------------------------
-# Extended test patterns
-# ------------------------------------------------------------
-EXT_BOOL_FIELDS: Dict[str, List[str]] = {
-    # extended enum_PNV style tests that often appear in gold tests
-    "CAMP": ["camp", "camp test"],
-    "PYR": ["pyr", "pyr test"],
-    "Hippurate Hydrolysis": ["hippurate", "hippurate hydrolysis"],
-    "Bile Solubility": ["bile soluble", "bile solubility"],
-    "Bile Resistance": ["bile resistant", "bile resistance"],
-    "Lipase": ["lipase"],
-    "Lecithinase": ["lecithinase"],
-    "Casein Hydrolysis": ["casein hydrolysis"],
-    "Tyrosine Hydrolysis": ["tyrosine hydrolysis"],
-}
-# Disc tests with sensitive/resistant semantics
-DISC_TESTS = {
-    "Optochin": ["optochin"],
-    "Bacitracin": ["bacitracin"],
-    "Novobiocin": ["novobiocin"],
-}
-# NaCl tolerance (high salt)
-EXT_SALT_FIELDS = {
-    "NaCl Tolerant (>=10%)": ["10% nacl", "10 % nacl"],
-    "NaCl Tolerant (>=15%)": ["15% nacl", "15 % nacl"],
-}
-# Odour
-ODOUR_VALUES = {
-    "Fruity": ["fruity odour", "fruity odor"],
-    "Horse": ["horse odour", "horse odor", "horse stable smell"],
-    "Foul": ["foul odour", "foul odor"],
-    "Butyric": ["butyric odour", "butyric odor"],
-    "Earthy": ["earthy odour", "earthy odor"],
-}
 # ------------------------------------------------------------
-# Parsing functions
 # ------------------------------------------------------------
-def _parse_ext_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
-    for field, keywords in EXT_BOOL_FIELDS.items():
-        for kw in keywords:
-            # "CAMP test positive"
-            m1 = re.search(rf"{re.escape(kw)}[ \-]?(positive|negative|variable|pos|neg|\+|\-)", text_lc)
-            if m1:
-                val = _value_from_pnv_context(m1.group(1))
-                if val:
-                    _set_if_stronger(parsed, field, val)
-                    break
-            # "positive for CAMP test"
-            m2 = re.search(
-                rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{re.escape(kw)}",
-                text_lc,
-            )
-            if m2:
-                val = _value_from_pnv_context(m2.group(1))
-                if val:
-                    _set_if_stronger(parsed, field, val)
-                    break
-def _parse_disc_tests(text_lc: str, parsed: Dict[str, str]) -> None:
     """
-    Disc tests like Optochin, Bacitracin, Novobiocin.
-    Convention:
-      - "sensitive"  -> Positive
-      - "susceptible" -> Positive
-      - "resistant"  -> Negative
     """
-    for field, keywords in DISC_TESTS.items():
-        for kw in keywords:
-            # "optochin sensitive" / "optochin resistant"
-            m = re.search(
-                rf"{re.escape(kw)}[ \-]?(sensitive|susceptible|resistant)",
-                text_lc,
-            )
-            if not m:
-                continue
-            word = m.group(1).lower()
-            if word in ["sensitive", "susceptible"]:
-                _set_if_stronger(parsed, field, "Positive")
-            elif word == "resistant":
-                _set_if_stronger(parsed, field, "Negative")
-            break
-def _parse_salt_tolerance(text_lc: str, parsed: Dict[str, str]) -> None:
-    for field, patterns in EXT_SALT_FIELDS.items():
-        for p in patterns:
-            if p in text_lc:
-                # If explicitly says "tolerant" or "growth at"
-                if re.search(rf"(tolerant|grows at|growth at)\s*{re.escape(p)}", text_lc):
-                    _set_if_stronger(parsed, field, "Positive")
-                # If "no growth at 15% NaCl" etc.
-                if re.search(rf"no growth at\s*{re.escape(p)}", text_lc):
-                    _set_if_stronger(parsed, field, "Negative")
-def _parse_odour(text_lc: str, parsed: Dict[str, str]) -> None:
-    for value, patterns in ODOUR_VALUES.items():
-        for p in patterns:
-            if p in text_lc:
-                _set_if_stronger(parsed, "Odour", value)
-                break
-def _parse_misc_extended(text_lc: str, parsed: Dict[str, str]) -> None:
     """
-    Place-holder for any additional extended patterns you want.
-    For now, we keep it minimal to avoid accidental conflicts.
     """
-    # Example: "acid fast" partial positivity is handled as extended field
-    if "partial acid fast" in text_lc or "partially acid fast" in text_lc:
-        _set_if_stronger(parsed, "Acid Fast", "Partial")
 # ------------------------------------------------------------
-# PUBLIC API
 # ------------------------------------------------------------
 def parse_text_extended(text: str) -> Dict[str, Any]:
     """
-    Parse extended (non-core) tests from description, guided by extended_schema.json.
     Returns:
       {
-        "parsed_fields": {...},
         "source": "extended_parser",
-        "raw": original_text,
-        "error": optional_error
       }
     """
-    original = text or ""
-    text_clean = _clean_text(original)
-    text_lc = text_clean.lower()
     parsed: Dict[str, str] = {}
-    try:
-        ext_schema = _load_extended_schema()
-        alias_maps = _load_alias_maps()
-        # Run pattern-based extraction
-        _parse_ext_bool_tests(text_lc, parsed)
-        _parse_disc_tests(text_lc, parsed)
-        _parse_salt_tolerance(text_lc, parsed)
-        _parse_odour(text_lc, parsed)
-        _parse_misc_extended(text_lc, parsed)
-        # Apply alias maps (non-destructive, only value changes)
-        if alias_maps:
-            for field in list(parsed.keys()):
-                val = parsed[field]
-                mapped_val = _apply_alias(field, val, alias_maps)
-                parsed[field] = mapped_val
-        # Filter out any fields that are clearly core and should
-        # NOT live in extended land (safety).
-        # We let the main engine ignore unknown fields anyway.
-        if ext_schema:
-            valid_ext_fields = set(ext_schema.keys())
-            parsed = {
-                f: v for (f, v) in parsed.items()
-                if f in valid_ext_fields
-            }
-        return {
-            "parsed_fields": parsed,
-            "source": "extended_parser",
-            "raw": original,
-        }
-    except Exception as e:
-        return {
-            "parsed_fields": parsed,
-            "source": "extended_parser",
-            "raw": original,
-            "error": f"{type(e).__name__}: {e}",
-        }

 # engine/parser_ext.py
 # ------------------------------------------------------------
+# Extended test parser (Stage 11C)
 #
+# - Focuses on *extended* tests (disc tests, rare biochemicals, etc.)
+# - Uses extended_schema.json dynamically
+# - Ignores core DB fields (those are handled by parser_rules)
+# - Adds robust patterns for:
+#       CAMP, PYR, Optochin, Bacitracin, Novobiocin
 #
+# Returns:
+# {
+#   "parsed_fields": { ... },
+#   "source": "extended_parser",
+#   "raw": original_text
+# }
 # ------------------------------------------------------------
 from __future__ import annotations
 import re
 from typing import Dict, Any, List
+EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
+ALIAS_MAP_PATH = os.path.join("data", "alias_maps.json")
 UNKNOWN = "Unknown"
# Core / DB field names. A schema entry whose name appears here is skipped by
# the extended parser, so it is never reported as an "extended" test.
# (Must match your bacteria_db.xlsx columns.)
# NOTE(review): odd spellings such as "Ornitihine Decarboxylase" presumably
# mirror the spreadsheet headers exactly -- confirm before "correcting" them.
CORE_FIELDS = {
    # Identity, staining and morphology
    "Genus", "Species", "Gram Stain", "Shape", "Colony Morphology",
    "Haemolysis", "Haemolysis Type", "Motility", "Capsule", "Spore Formation",
    # Growth conditions
    "Growth Temperature", "Oxygen Requirement", "Media Grown On",
    # Enzyme / biochemical tests
    "Catalase", "Oxidase", "Indole", "Urease", "Citrate", "Methyl Red", "VP",
    "H2S", "DNase", "ONPG", "Coagulase", "Lipase Test", "Nitrate Reduction",
    "NaCl Tolerant (>=6%)",
    # Amino-acid reactions and hydrolysis tests
    "Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase",
    "Gelatin Hydrolysis", "Esculin Hydrolysis",
    # Fermentation tests
    "Glucose Fermentation", "Lactose Fermentation", "Sucrose Fermentation",
    "Mannitol Fermentation", "Sorbitol Fermentation", "Maltose Fermentation",
    "Xylose Fermentation", "Rhamnose Fermentation", "Arabinose Fermentation",
    "Raffinose Fermentation", "Trehalose Fermentation", "Inositol Fermentation",
}
 # ------------------------------------------------------------
+# Helpers: load extended schema & alias maps
 # ------------------------------------------------------------
def _load_extended_schema(path: str = EXTENDED_SCHEMA_PATH) -> Dict[str, Any]:
    """
    Read the extended schema JSON file and return it as a dict.

    This loader is best-effort: a missing file, an unreadable or malformed
    file, or a top-level JSON value that is not an object all yield {}.
    """
    schema: Dict[str, Any] = {}
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)
        except Exception:
            # Deliberately swallowed: the parser must keep working without
            # a usable schema file.
            return {}
        if isinstance(loaded, dict):
            schema = loaded
    return schema
def _load_alias_map(path: str = ALIAS_MAP_PATH) -> Dict[str, str]:
    """
    Read the alias map JSON file and return it as a dict.

    The file is assumed to hold a flat mapping such as:
      { "Field:raw_value": "canonical_value", ... }
    Loading is optional and conservative: a missing file, any read/parse
    failure, or a non-object top-level value all yield {}.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except Exception:
        # Best-effort: alias maps are optional, so failures are ignored.
        return {}
    return loaded if isinstance(loaded, dict) else {}
+def _apply_field_value_alias(field: str, value: str, alias_map: Dict[str, str]) -> str:
     """
+    Apply alias mapping of the form:
+      "Field:raw_value" -> "canonical"
+    Case-insensitive on the key; value returned as-is if no mapping.
     """
     key = f"{field}:{value}"
+    key_lower = key.lower()
+    for k, v in alias_map.items():
+        if k.lower() == key_lower:
+            return v
     return value
 # ------------------------------------------------------------
+# Value normalisation helpers
 # ------------------------------------------------------------
+def _bool_from_tokens(tokens: List[str]) -> str:
+    """
+    Map "positive/sensitive/susceptible" vs "negative/resistant"
+    into Positive / Negative where appropriate.
+    """
+    t = " ".join(tokens).lower()
+    # Strong negative signals
+    neg_tokens = [
+        "negative", "no", "not", "resistant", "no zone",
+        "no growth", "fails to", "does not"
+    ]
+    if any(nt in t for nt in neg_tokens):
+        return "Negative"
+    # Strong positive signals
+    pos_tokens = [
+        "positive", "pos", "sensitive", "susceptible",
+        "clear zone", "zone of inhibition"
+    ]
+    if any(pt in t for pt in pos_tokens):
+        return "Positive"
+    return UNKNOWN
+def _disc_result_from_phrase(phrase: str) -> str:
+    """
+    For disc tests like Novobiocin / Optochin / Bacitracin, interpret:
+    - 'sensitive', 'susceptible' as Positive
+    - 'resistant', 'no zone' as Negative
+    - default -> Unknown
+    """
+    ph = phrase.lower()
+    if any(w in ph for w in ["resistant", "no zone", "no inhibition", "no clear zone"]):
         return "Negative"
+    if any(w in ph for w in ["sensitive", "susceptible", "zone of inhibition", "clear zone"]):
+        return "Positive"
+    # If explicit 'positive'/'negative' appears, handle that
+    if "positive" in ph:
+        return "Positive"
+    if "negative" in ph:
+        return "Negative"
+    return UNKNOWN
 # ------------------------------------------------------------
+# Core pattern logic for extended tests
 # ------------------------------------------------------------
+def _parse_disc_tests(text: str, parsed: Dict[str, str]) -> None:
     """
+    Handle disc tests:
+      - Optochin
+      - Bacitracin
+      - Novobiocin
+    with phrasing like 'optochin sensitive', 'bacitracin resistant', etc.
     """
+    lower = text.lower()
+    disc_fields = ["Optochin", "Bacitracin", "Novobiocin"]
+    for test_name in disc_fields:
+        key = test_name.lower()
+        # Find segments surrounding the keyword
+        for m in re.finditer(rf"\b{re.escape(key)}\b[^\.,;]*", lower):
+            segment = lower[m.start():m.end()]
+            val = _disc_result_from_phrase(segment)
+            if val != UNKNOWN:
+                parsed[test_name] = val
+        # Also handle "<test> test positive/negative"
+        for m in re.finditer(rf"\b{re.escape(key)}\s+test[^\.,;]*", lower):
+            segment = lower[m.start():m.end()]
+            val = _disc_result_from_phrase(segment)
+            if val != UNKNOWN:
+                parsed[test_name] = val
def _parse_simple_PNV_test(
    text: str,
    test_name: str,
    parsed: Dict[str, str],
    extra_keywords: List[str] | None = None,
) -> None:
    """
    Generic parser for a single named test (e.g. CAMP, PYR, Hippurate).

    The lower-cased text is searched for clauses that begin with the test
    name, with '<name> test', or with one of extra_keywords, each clause
    running up to the next '.', ',' or ';'. Every clause is classified by
    _bool_from_tokens; any verdict other than UNKNOWN is stored under
    parsed[test_name], later matches overwriting earlier ones.
    """
    haystack = text.lower()
    name = re.escape(test_name.lower())

    def _patterns():
        # Same evaluation order as the lookups are meant to be applied:
        # plain name, "<name> test", then each synonym in turn.
        yield rf"\b{name}\b[^\.,;]*"
        yield rf"\b{name}\s+test[^\.,;]*"
        for synonym in extra_keywords or []:
            yield rf"\b{re.escape(synonym.lower())}\b[^\.,;]*"

    for pattern in _patterns():
        for hit in re.finditer(pattern, haystack):
            verdict = _bool_from_tokens(hit.group(0).split())
            if verdict != UNKNOWN:
                parsed[test_name] = verdict
def _parse_extended_from_schema(
    text: str,
    ext_schema: Dict[str, Any],
    alias_map: Dict[str, str],
    parsed: Dict[str, str],
) -> None:
    """
    Generic extended parser driven by extended_schema.json.

    For each schema field whose metadata is a dict with
    value_type == "enum_PNV" and whose name is not in CORE_FIELDS:
      - finds clauses that start at the field name and run up to the next
        '.', ',' or ';' in the lower-cased text,
      - classifies each clause with _bool_from_tokens,
      - passes any verdict other than UNKNOWN through the alias map and
        stores it in parsed[field_name] (later matches overwrite earlier).

    Field names are bounded with lookarounds rather than \\b: a trailing \\b
    cannot match after a name that ends in a non-word character (such as a
    closing parenthesis) when whitespace follows, which would make such
    fields unmatchable.
    """
    lower = text.lower()

    for field_name, meta in ext_schema.items():
        if not isinstance(meta, dict):
            continue
        if meta.get("value_type") != "enum_PNV":
            continue
        if field_name in CORE_FIELDS:
            # We never treat core DB tests as "extended"
            continue

        label = field_name.lower()
        # "<field_name> positive/negative/variable"; equivalent to \b...\b
        # for names that start and end with a word character.
        pat = rf"(?<!\w){re.escape(label)}(?!\w)[^\.,;]*"
        for m in re.finditer(pat, lower):
            val = _bool_from_tokens(m.group(0).split())
            if val == UNKNOWN:
                continue
            parsed[field_name] = _apply_field_value_alias(field_name, val, alias_map)
 # ------------------------------------------------------------
+# MAIN ENTRY
 # ------------------------------------------------------------
 def parse_text_extended(text: str) -> Dict[str, Any]:
     """
+    Parse extended-only tests from the description.
+    This is intentionally conservative:
+      - Only sets a field if reasonably confident from text
+      - Never overwrites core parser behaviour directly
+      - Plays nicely with alias maps and extended_schema
     Returns:
       {
+        "parsed_fields": { ... },
         "source": "extended_parser",
+        "raw": original_text
       }
     """
+    if not text:
+        return {
+            "parsed_fields": {},
+            "source": "extended_parser",
+            "raw": text or "",
+        }
+    ext_schema = _load_extended_schema(EXTENDED_SCHEMA_PATH)
+    alias_map = _load_alias_map(ALIAS_MAP_PATH)
     parsed: Dict[str, str] = {}
+    # 1) Disc tests (Novobiocin / Optochin / Bacitracin) with rich language
+    _parse_disc_tests(text, parsed)
+    # 2) CAMP & PYR & Hippurate (if present in schema/gold tests)
+    _parse_simple_PNV_test(text, "CAMP", parsed, extra_keywords=["CAMP reaction"])
+    _parse_simple_PNV_test(text, "PYR", parsed, extra_keywords=["PYR activity"])
+    _parse_simple_PNV_test(text, "Hippurate Hydrolysis", parsed, extra_keywords=["hippurate"])
+    # 3) Any other enum_PNV extended tests from extended_schema.json
+    _parse_extended_from_schema(text, ext_schema, alias_map, parsed)
+    return {
+        "parsed_fields": parsed,
+        "source": "extended_parser",
+        "raw": text,
+    }