Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Dec 16, 2025

Commit

8f3c0ec

verified ·

1 Parent(s): e11bc4c

Update engine/parser_ext.py

Browse files

Files changed (1) hide show

engine/parser_ext.py +142 -56

engine/parser_ext.py CHANGED Viewed

@@ -1,14 +1,45 @@
 # engine/parser_ext.py
 # ======================================================================
 # Extended test parser — Stage 12C-fix2
 # ======================================================================
 from __future__ import annotations
-import os, re
-from typing import Dict, Any
 UNKNOWN = "Unknown"
 # ======================================================================
 # Helpers
 # ======================================================================
@@ -25,113 +56,155 @@ def _set_if_stronger(parsed: Dict[str,str], field: str, value: str):
     if field not in parsed or parsed[field] == UNKNOWN:
         parsed[field] = value
-def _parse_pnv_after_anchor(text: str, parsed: Dict[str,str], field: str):
     m = re.search(
-        rf"\b{re.escape(field.lower())}\b\s*(positive|negative|variable|unknown)",
-        text.lower()
     )
     if m:
         _set_if_stronger(parsed, field, m.group(1).capitalize())
 # ======================================================================
 # 1. Gas Production
 # ======================================================================
 def _parse_gas_production(text: str, parsed: Dict[str,str]):
     t = text.lower()
-    if any(x in t for x in [
         "produces gas","gas produced","with gas",
         "gas production positive","gas producer",
-        "production of gas","ferments glucose with gas"
-    ]):
-        _set_if_stronger(parsed,"Gas Production","Positive")
-    elif any(x in t for x in [
         "does not produce gas","no gas",
-        "absence of gas","gas production negative"
-    ]):
         _set_if_stronger(parsed,"Gas Production","Negative")
 # ======================================================================
-# 2. Motility Type (fixed)
 # ======================================================================
-MOTILITY_TYPES = {
     "Peritrichous","Monotrichous","Polytrichous","Polar",
-    "Swarming","Tumbling","Gliding","Corkscrew","Axial"
-}
 def _parse_motility_type(text: str, parsed: Dict[str,str]):
     t = text.lower()
-    # Explicit negative
-    if re.search(r"\bmotility type\b\s*(negative|none)", t):
-        _set_if_stronger(parsed,"Motility Type","Negative")
         return
-    # Anchor-based
-    m = re.search(r"\bmotility type\b\s*[:\-]?\s*([a-z]+)", t)
     if m:
         val = m.group(1).capitalize()
         if val in MOTILITY_TYPES:
             _set_if_stronger(parsed,"Motility Type",val)
             return
-    # Free word
     for mt in MOTILITY_TYPES:
         if re.search(rf"\b{mt.lower()}\b", t):
             _set_if_stronger(parsed,"Motility Type",mt)
             return
 # ======================================================================
-# 3. Pigment (unchanged)
 # ======================================================================
 def _parse_pigment(text: str, parsed: Dict[str,str]):
     t = text.lower()
-    if not re.search(r"\b(pigment|pigmentation)\b", t):
-        return
-    if "no pigmentation" in t or "pigment none" in t:
-        _set_if_stronger(parsed,"Pigment","None")
         return
-    pigments = []
-    for p in [
-        "pyocyanin","pyoverdine","pyovacin",
-        "green","yellow","pink","red","orange",
-        "brown","black","violet","cream"
-    ]:
         if re.search(rf"\b{p}\b", t):
-            pigments.append(p.capitalize())
-    if pigments:
-        _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(pigments))))
 # ======================================================================
 # 4. Colony Pattern (explicit only)
 # ======================================================================
 def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\bcolony pattern\b", t):
         return
-    m = re.search(r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)", t)
     if m:
-        _set_if_stronger(parsed,"Colony Pattern",m.group(1).capitalize())
 # ======================================================================
 # 5. Odor (anchor-based)
 # ======================================================================
 def _parse_odor(text: str, parsed: Dict[str,str]):
     m = re.search(
         r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
-        text.lower()
     )
     if not m:
         return
-    vals = [v.strip().capitalize() for v in m.group(2).split(";") if v.strip()]
     if vals:
         _set_if_stronger(parsed,"Odor","; ".join(vals))
@@ -141,6 +214,7 @@ def _parse_odor(text: str, parsed: Dict[str,str]):
 def _parse_tsi(text: str, parsed: Dict[str,str]):
     t = text.upper()
     if "TSI" in t and "UNKNOWN" in t:
         _set_if_stronger(parsed,"TSI Pattern","Unknown")
         return
@@ -148,38 +222,48 @@ def _parse_tsi(text: str, parsed: Dict[str,str]):
     m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
     if m:
         base = m.group(1)
-        _set_if_stronger(parsed,"TSI Pattern", base + ("+H2S" if m.group(2) else ""))
 # ======================================================================
 # 7. NaCl Tolerant (>=6%)
 # ======================================================================
 def _parse_nacl(text: str, parsed: Dict[str,str]):
-    _parse_pnv_after_anchor(text, parsed, "NaCl Tolerant (>=6%)")
 # ======================================================================
-# 8. Haemolysis Type (fixed)
 # ======================================================================
 def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
     m = re.search(
         r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)",
-        text.lower()
     )
     if m:
         _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
 # ======================================================================
-# 9. Ornithine / Ornitihine Decarboxylase alias sync
 # ======================================================================
-def _sync_ornithine(parsed: Dict[str,str]):
-    if "Ornitihine Decarboxylase" in parsed:
-        _set_if_stronger(
-            parsed,
-            "Ornithine Decarboxylase",
-            parsed["Ornitihine Decarboxylase"]
-        )
 # ======================================================================
 # MAIN
@@ -201,11 +285,13 @@ def parse_text_extended(text: str) -> Dict[str,Any]:
     _parse_tsi(cleaned, parsed)
     _parse_nacl(cleaned, parsed)
     _parse_haemolysis_type(cleaned, parsed)
-    _sync_ornithine(parsed)
-    # If Motility is explicitly negative anywhere → Motility Type negative
-    if parsed.get("Motility") == "Negative":
-        _set_if_stronger(parsed,"Motility Type","Negative")
     return {
         "parsed_fields": parsed,

 # engine/parser_ext.py
 # ======================================================================
 # Extended test parser — Stage 12C-fix2
+#
+# Fixes added after eval_parsers (~0.9045 accuracy):
+#   ✔ Haemolysis Type: supports "None"
+#   ✔ Ornithine Decarboxylase: supports correct spelling + typo alias sync
+#   ✔ Motility Type: supports "Negative"/"None" when explicitly stated
+#
+# GOAL:
+#   • Explicit-only parsing
+#   • ML-safe
+#   • Deterministic
+#   • No inference
 # ======================================================================
 from __future__ import annotations
+import os, re, json
+from typing import Dict, Any, List
+EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
 UNKNOWN = "Unknown"
+# ======================================================================
+# Fields NOT parsed here
+# ======================================================================
+CORE_FIELDS = {
+    "Genus","Species",
+    "Gram Stain","Shape","Colony Morphology",
+    "Haemolysis","Motility","Capsule","Spore Formation",
+    "Growth Temperature","Oxygen Requirement","Media Grown On",
+    "Catalase","Oxidase","Indole","Urease","Citrate","Methyl Red","VP",
+    "H2S","DNase","ONPG","Coagulase","Lipase Test","Nitrate Reduction",
+    "Lysine Decarboxylase","Arginine dihydrolase",
+    "Gelatin Hydrolysis","Esculin Hydrolysis",
+    "Glucose Fermentation","Lactose Fermentation","Sucrose Fermentation",
+    "Mannitol Fermentation","Sorbitol Fermentation","Maltose Fermentation",
+    "Xylose Fermentation","Rhamnose Fermentation","Arabinose Fermentation",
+    "Raffinose Fermentation","Trehalose Fermentation","Inositol Fermentation",
+}
 # ======================================================================
 # Helpers
 # ======================================================================
     if field not in parsed or parsed[field] == UNKNOWN:
         parsed[field] = value
+def _parse_pnv_after_anchor(
+    text: str,
+    parsed: Dict[str,str],
+    field: str,
+    anchor: str
+):
     m = re.search(
+        rf"\b{re.escape(anchor)}\b\s*(positive|negative|variable|unknown)",
+        text,
+        re.IGNORECASE,
     )
     if m:
         _set_if_stronger(parsed, field, m.group(1).capitalize())
+def _load_extended_schema(path: str) -> Dict[str, Any]:
+    if not os.path.exists(path):
+        return {}
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            obj = json.load(f)
+        return obj if isinstance(obj, dict) else {}
+    except Exception:
+        return {}
 # ======================================================================
 # 1. Gas Production
 # ======================================================================
 def _parse_gas_production(text: str, parsed: Dict[str,str]):
     t = text.lower()
+    POS = [
         "produces gas","gas produced","with gas",
         "gas production positive","gas producer",
+        "production of gas","ferments glucose with gas",
+    ]
+    NEG = [
         "does not produce gas","no gas",
+        "absence of gas","gas production negative",
+    ]
+    if any(p in t for p in POS):
+        _set_if_stronger(parsed,"Gas Production","Positive")
+    elif any(n in t for n in NEG):
         _set_if_stronger(parsed,"Gas Production","Negative")
 # ======================================================================
+# 2. Motility Type (explicit)
 # ======================================================================
+MOTILITY_TYPES = [
     "Peritrichous","Monotrichous","Polytrichous","Polar",
+    "Swarming","Tumbling","Gliding","Corkscrew","Axial",
+]
 def _parse_motility_type(text: str, parsed: Dict[str,str]):
     t = text.lower()
+    # Explicit negative / none:
+    # "Motility Type Negative" / "Motility Type None"
+    mneg = re.search(r"\bmotility type\b\s*[:\-]?\s*(negative|none)\b", t)
+    if mneg:
+        _set_if_stronger(parsed, "Motility Type", mneg.group(1).capitalize())
         return
+    # Anchor-based: "Motility Type Swarming"
+    m = re.search(
+        r"\bmotility type\b\s*[:\-]?\s*([a-z]+)",
+        t
+    )
     if m:
         val = m.group(1).capitalize()
         if val in MOTILITY_TYPES:
             _set_if_stronger(parsed,"Motility Type",val)
             return
+    # Free explicit words (only these)
     for mt in MOTILITY_TYPES:
         if re.search(rf"\b{mt.lower()}\b", t):
             _set_if_stronger(parsed,"Motility Type",mt)
             return
 # ======================================================================
+# 3. Pigment (explicit only)
 # ======================================================================
+PIGMENT_TERMS = [
+    "pyocyanin","pyoverdine","pyovacin",
+    "green","yellow","pink","red","orange",
+    "brown","black","violet","cream",
+]
 def _parse_pigment(text: str, parsed: Dict[str,str]):
     t = text.lower()
+    if not re.search(r"\b(pigment|pigmentation)\b", t):
         return
+    found = []
+    for p in PIGMENT_TERMS:
         if re.search(rf"\b{p}\b", t):
+            found.append(p.capitalize())
+    if "no pigmentation" in t or "pigment none" in t:
+        _set_if_stronger(parsed,"Pigment","None")
+    elif found:
+        _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(found))))
 # ======================================================================
 # 4. Colony Pattern (explicit only)
 # ======================================================================
+COLONY_PATTERNS = [
+    "Mucoid","Smooth","Rough","Filamentous",
+    "Spreading","Swarming","Sticky",
+    "Ground-glass","Molar-tooth","Chalky","Corroding",
+]
 def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\bcolony pattern\b", t):
         return
+    m = re.search(
+        r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)",
+        t
+    )
     if m:
+        val = m.group(1).capitalize()
+        if val in COLONY_PATTERNS:
+            _set_if_stronger(parsed,"Colony Pattern",val)
 # ======================================================================
 # 5. Odor (anchor-based)
 # ======================================================================
 def _parse_odor(text: str, parsed: Dict[str,str]):
+    t = text.lower()
     m = re.search(
         r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
+        t
     )
     if not m:
         return
+    raw = m.group(2)
+    vals = [v.strip().capitalize() for v in raw.split(";") if v.strip()]
     if vals:
         _set_if_stronger(parsed,"Odor","; ".join(vals))
 def _parse_tsi(text: str, parsed: Dict[str,str]):
     t = text.upper()
     if "TSI" in t and "UNKNOWN" in t:
         _set_if_stronger(parsed,"TSI Pattern","Unknown")
         return
     m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
     if m:
         base = m.group(1)
+        if m.group(2):
+            _set_if_stronger(parsed,"TSI Pattern",f"{base}+H2S")
+        else:
+            _set_if_stronger(parsed,"TSI Pattern",base)
 # ======================================================================
 # 7. NaCl Tolerant (>=6%)
 # ======================================================================
 def _parse_nacl(text: str, parsed: Dict[str,str]):
+    _parse_pnv_after_anchor(
+        text, parsed,
+        "NaCl Tolerant (>=6%)",
+        "NaCl Tolerant (>=6%)"
+    )
 # ======================================================================
+# 8. Haemolysis Type override (supports None)
 # ======================================================================
 def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
     m = re.search(
         r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)",
+        text,
+        re.IGNORECASE,
     )
     if m:
         _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
 # ======================================================================
+# 9. Ornithine Decarboxylase: accept both spellings + alias sync
 # ======================================================================
+def _parse_ornithine_dec(text: str, parsed: Dict[str,str]):
+    # Correct spelling
+    _parse_pnv_after_anchor(text, parsed, "Ornithine Decarboxylase", "Ornithine Decarboxylase")
+    # Common typo spelling (legacy)
+    _parse_pnv_after_anchor(text, parsed, "Ornitihine Decarboxylase", "Ornitihine Decarboxylase")
+    # Sync: if typo parsed, also fill correct field
+    if "Ornitihine Decarboxylase" in parsed and "Ornithine Decarboxylase" not in parsed:
+        _set_if_stronger(parsed, "Ornithine Decarboxylase", parsed["Ornitihine Decarboxylase"])
 # ======================================================================
 # MAIN
     _parse_tsi(cleaned, parsed)
     _parse_nacl(cleaned, parsed)
     _parse_haemolysis_type(cleaned, parsed)
+    _parse_ornithine_dec(cleaned, parsed)
+    # NOTE:
+    # A "Motility" = "Negative" result (from any parser) is deliberately NOT propagated to "Motility Type" here.
+    # parser_ext only sees the raw text, and parser_rules is intentionally left unchanged.
+    # "Motility Type" is therefore set to Negative/None *only* when the raw text explicitly says
+    # "Motility Type Negative" or "Motility Type None" (handled in _parse_motility_type).
+    # (No cross-parser inference here, to stay ML-safe.)
     return {
         "parsed_fields": parsed,