Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Dec 16, 2025

Commit

e11bc4c

verified ·

1 Parent(s): 28b5c1c

Update engine/parser_ext.py

Browse files

Files changed (1) hide show

engine/parser_ext.py +65 -115

engine/parser_ext.py CHANGED Viewed

@@ -1,47 +1,14 @@
 # engine/parser_ext.py
 # ======================================================================
-# Extended test parser — Stage 12C-fix1
-#
-# Fixes added after eval_parsers (0.882 accuracy):
-#   ✔ Odor: parse value following odor/smell anchor
-#   ✔ Ornithine Decarboxylase spelling corrected
-#   ✔ Motility Type: parse value following anchor
-#   ✔ NaCl Tolerant (>=6%) explicit rule
-#   ✔ Haemolysis Type explicit override rule
-#
-# GOAL:
-#   • Explicit-only parsing
-#   • ML-safe
-#   • Deterministic
-#   • No inference
 # ======================================================================
 from __future__ import annotations
-import os, re, json
-from typing import Dict, Any, List
-EXTENDED_SCHEMA_PATH = os.path.join("data", "extended_schema.json")
 UNKNOWN = "Unknown"
-# ======================================================================
-# Fields NOT parsed here
-# ======================================================================
-CORE_FIELDS = {
-    "Genus","Species",
-    "Gram Stain","Shape","Colony Morphology",
-    "Haemolysis","Motility","Capsule","Spore Formation",
-    "Growth Temperature","Oxygen Requirement","Media Grown On",
-    "Catalase","Oxidase","Indole","Urease","Citrate","Methyl Red","VP",
-    "H2S","DNase","ONPG","Coagulase","Lipase Test","Nitrate Reduction",
-    "Lysine Decarboxylase","Arginine dihydrolase",
-    "Gelatin Hydrolysis","Esculin Hydrolysis",
-    "Glucose Fermentation","Lactose Fermentation","Sucrose Fermentation",
-    "Mannitol Fermentation","Sorbitol Fermentation","Maltose Fermentation",
-    "Xylose Fermentation","Rhamnose Fermentation","Arabinose Fermentation",
-    "Raffinose Fermentation","Trehalose Fermentation","Inositol Fermentation",
-}
 # ======================================================================
 # Helpers
 # ======================================================================
@@ -58,16 +25,10 @@ def _set_if_stronger(parsed: Dict[str,str], field: str, value: str):
     if field not in parsed or parsed[field] == UNKNOWN:
         parsed[field] = value
-def _parse_pnv_after_anchor(
-    text: str,
-    parsed: Dict[str,str],
-    field: str,
-    anchor: str
-):
     m = re.search(
-        rf"\b{re.escape(anchor)}\b\s*(positive|negative|variable|unknown)",
-        text,
-        re.IGNORECASE,
     )
     if m:
         _set_if_stronger(parsed, field, m.group(1).capitalize())
@@ -78,118 +39,99 @@ def _parse_pnv_after_anchor(
 def _parse_gas_production(text: str, parsed: Dict[str,str]):
     t = text.lower()
-    POS = [
         "produces gas","gas produced","with gas",
         "gas production positive","gas producer",
-        "production of gas","ferments glucose with gas",
-    ]
-    NEG = [
-        "does not produce gas","no gas",
-        "absence of gas","gas production negative",
-    ]
-    if any(p in t for p in POS):
         _set_if_stronger(parsed,"Gas Production","Positive")
-    elif any(n in t for n in NEG):
         _set_if_stronger(parsed,"Gas Production","Negative")
 # ======================================================================
-# 2. Motility Type (explicit)
 # ======================================================================
-MOTILITY_TYPES = [
     "Peritrichous","Monotrichous","Polytrichous","Polar",
-    "Swarming","Tumbling","Gliding","Corkscrew","Axial",
-]
 def _parse_motility_type(text: str, parsed: Dict[str,str]):
     t = text.lower()
-    # Anchor-based: "Motility Type Swarming"
-    m = re.search(
-        r"\bmotility type\b\s*[:\-]?\s*([a-z]+)",
-        t
-    )
     if m:
         val = m.group(1).capitalize()
         if val in MOTILITY_TYPES:
             _set_if_stronger(parsed,"Motility Type",val)
             return
-    # Free explicit words (only these)
     for mt in MOTILITY_TYPES:
         if re.search(rf"\b{mt.lower()}\b", t):
             _set_if_stronger(parsed,"Motility Type",mt)
             return
 # ======================================================================
-# 3. Pigment (explicit only)
 # ======================================================================
-PIGMENT_TERMS = [
-    "pyocyanin","pyoverdine","pyovacin",
-    "green","yellow","pink","red","orange",
-    "brown","black","violet","cream",
-]
 def _parse_pigment(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\b(pigment|pigmentation)\b", t):
         return
-    found = []
-    for p in PIGMENT_TERMS:
-        if re.search(rf"\b{p}\b", t):
-            found.append(p.capitalize())
     if "no pigmentation" in t or "pigment none" in t:
         _set_if_stronger(parsed,"Pigment","None")
-    elif found:
-        _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(found))))
 # ======================================================================
 # 4. Colony Pattern (explicit only)
 # ======================================================================
-COLONY_PATTERNS = [
-    "Mucoid","Smooth","Rough","Filamentous",
-    "Spreading","Swarming","Sticky",
-    "Ground-glass","Molar-tooth","Chalky","Corroding",
-]
 def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\bcolony pattern\b", t):
         return
-    m = re.search(
-        r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)",
-        t
-    )
     if m:
-        val = m.group(1).capitalize()
-        if val in COLONY_PATTERNS:
-            _set_if_stronger(parsed,"Colony Pattern",val)
 # ======================================================================
 # 5. Odor (anchor-based)
 # ======================================================================
 def _parse_odor(text: str, parsed: Dict[str,str]):
-    t = text.lower()
     m = re.search(
         r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
-        t
     )
     if not m:
         return
-    raw = m.group(2)
-    vals = [v.strip().capitalize() for v in raw.split(";") if v.strip()]
     if vals:
         _set_if_stronger(parsed,"Odor","; ".join(vals))
@@ -199,7 +141,6 @@ def _parse_odor(text: str, parsed: Dict[str,str]):
 def _parse_tsi(text: str, parsed: Dict[str,str]):
     t = text.upper()
     if "TSI" in t and "UNKNOWN" in t:
         _set_if_stronger(parsed,"TSI Pattern","Unknown")
         return
@@ -207,35 +148,39 @@ def _parse_tsi(text: str, parsed: Dict[str,str]):
     m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
     if m:
         base = m.group(1)
-        if m.group(2):
-            _set_if_stronger(parsed,"TSI Pattern",f"{base}+H2S")
-        else:
-            _set_if_stronger(parsed,"TSI Pattern",base)
 # ======================================================================
 # 7. NaCl Tolerant (>=6%)
 # ======================================================================
 def _parse_nacl(text: str, parsed: Dict[str,str]):
-    _parse_pnv_after_anchor(
-        text, parsed,
-        "NaCl Tolerant (>=6%)",
-        "NaCl Tolerant (>=6%)"
-    )
 # ======================================================================
-# 8. Haemolysis Type override
 # ======================================================================
 def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
     m = re.search(
-        r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma)",
-        text,
-        re.IGNORECASE,
     )
     if m:
         _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
 # ======================================================================
 # MAIN
 # ======================================================================
@@ -256,6 +201,11 @@ def parse_text_extended(text: str) -> Dict[str,Any]:
     _parse_tsi(cleaned, parsed)
     _parse_nacl(cleaned, parsed)
     _parse_haemolysis_type(cleaned, parsed)
     return {
         "parsed_fields": parsed,

 # engine/parser_ext.py
 # ======================================================================
+# Extended test parser — Stage 12C-fix2
 # ======================================================================
 from __future__ import annotations
+import os, re
+from typing import Dict, Any
 UNKNOWN = "Unknown"
 # ======================================================================
 # Helpers
 # ======================================================================
     if field not in parsed or parsed[field] == UNKNOWN:
         parsed[field] = value
def _parse_pnv_after_anchor(text: str, parsed: Dict[str,str], field: str):
    """Record an explicit positive/negative/variable/unknown result for *field*.

    The lower-cased field name is used as the anchor: the text must contain
    the field name followed (optionally via ``:`` or ``-``) by one of the
    four result words.  The capitalised result is stored through
    ``_set_if_stronger``.

    Args:
        text: Free text to search (matched case-insensitively).
        parsed: Mapping of field name -> value, updated in place.
        field: Field name; doubles as the anchor phrase and the key written.
    """
    anchor = field.lower()
    # ``\b`` only works next to a word character.  An anchor such as
    # "nacl tolerant (>=6%)" ends in ")", so a trailing ``\b`` would demand a
    # word character right after it and reject "...(>=6%) positive".
    # Apply the "not glued to a longer word" guard only on sides where the
    # anchor actually starts/ends with a word character.
    lead = r"(?<!\w)" if re.match(r"\w", anchor) else ""
    trail = r"(?!\w)" if re.search(r"\w\Z", anchor) else ""
    m = re.search(
        rf"{lead}{re.escape(anchor)}{trail}"
        r"\s*[:\-]?\s*(positive|negative|variable|unknown)",
        text.lower(),
    )
    if m:
        _set_if_stronger(parsed, field, m.group(1).capitalize())
 def _parse_gas_production(text: str, parsed: Dict[str,str]):
     t = text.lower()
+    if any(x in t for x in [
         "produces gas","gas produced","with gas",
         "gas production positive","gas producer",
+        "production of gas","ferments glucose with gas"
+    ]):
         _set_if_stronger(parsed,"Gas Production","Positive")
+    elif any(x in t for x in [
+        "does not produce gas","no gas",
+        "absence of gas","gas production negative"
+    ]):
         _set_if_stronger(parsed,"Gas Production","Negative")
 # ======================================================================
+# 2. Motility Type (fixed)
 # ======================================================================
# Recognised motility type labels (used for membership tests and as the
# vocabulary of the free-word fallback below).
MOTILITY_TYPES = {
    "Peritrichous", "Monotrichous", "Polytrichous", "Polar",
    "Swarming", "Tumbling", "Gliding", "Corkscrew", "Axial",
}


def _parse_motility_type(text: str, parsed: Dict[str,str]):
    """Set "Motility Type" from explicit mentions in *text*.

    Rules are tried in order and the first that fires wins:
      1. "motility type" followed by "negative"/"none" -> "Negative".
      2. "motility type" followed by a word in ``MOTILITY_TYPES``.
      3. Any ``MOTILITY_TYPES`` word appearing on its own; when several
         occur, the one that appears earliest in the text is used.

    Args:
        text: Free text to search (matched case-insensitively).
        parsed: Mapping of field name -> value, updated in place.
    """
    t = text.lower()

    # Explicit negative (optional ":" / "-" separator, like the anchor rule).
    if re.search(r"\bmotility type\b\s*[:\-]?\s*(negative|none)", t):
        _set_if_stronger(parsed, "Motility Type", "Negative")
        return

    # Anchor-based
    m = re.search(r"\bmotility type\b\s*[:\-]?\s*([a-z]+)", t)
    if m:
        val = m.group(1).capitalize()
        if val in MOTILITY_TYPES:
            _set_if_stronger(parsed, "Motility Type", val)
            return

    # Free word.  MOTILITY_TYPES is a set, so its iteration order is not
    # stable between runs; choose by position in the text instead so the
    # result does not depend on that order.
    hits = []
    for mt in MOTILITY_TYPES:
        found = re.search(rf"\b{mt.lower()}\b", t)
        if found:
            hits.append((found.start(), mt))
    if hits:
        _set_if_stronger(parsed, "Motility Type", min(hits)[1])
 # ======================================================================
+# 3. Pigment (unchanged)
 # ======================================================================
 def _parse_pigment(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\b(pigment|pigmentation)\b", t):
         return
     if "no pigmentation" in t or "pigment none" in t:
         _set_if_stronger(parsed,"Pigment","None")
+        return
+    pigments = []
+    for p in [
+        "pyocyanin","pyoverdine","pyovacin",
+        "green","yellow","pink","red","orange",
+        "brown","black","violet","cream"
+    ]:
+        if re.search(rf"\b{p}\b", t):
+            pigments.append(p.capitalize())
+    if pigments:
+        _set_if_stronger(parsed,"Pigment","; ".join(sorted(set(pigments))))
 # ======================================================================
 # 4. Colony Pattern (explicit only)
 # ======================================================================
 def _parse_colony_pattern(text: str, parsed: Dict[str,str]):
     t = text.lower()
     if not re.search(r"\bcolony pattern\b", t):
         return
+    m = re.search(r"\bcolony pattern\b\s*[:\-]?\s*([a-z\-]+)", t)
     if m:
+        _set_if_stronger(parsed,"Colony Pattern",m.group(1).capitalize())
 # ======================================================================
 # 5. Odor (anchor-based)
 # ======================================================================
 def _parse_odor(text: str, parsed: Dict[str,str]):
     m = re.search(
         r"\b(odor|odour|smell)\b\s*[:\-]?\s*([a-z; ]+)",
+        text.lower()
     )
     if not m:
         return
+    vals = [v.strip().capitalize() for v in m.group(2).split(";") if v.strip()]
     if vals:
         _set_if_stronger(parsed,"Odor","; ".join(vals))
 def _parse_tsi(text: str, parsed: Dict[str,str]):
     t = text.upper()
     if "TSI" in t and "UNKNOWN" in t:
         _set_if_stronger(parsed,"TSI Pattern","Unknown")
         return
     m = re.search(r"\b([KA]/[KA])(\s*\+\s*H2S)?\b", t)
     if m:
         base = m.group(1)
+        _set_if_stronger(parsed,"TSI Pattern", base + ("+H2S" if m.group(2) else ""))
 # ======================================================================
 # 7. NaCl Tolerant (>=6%)
 # ======================================================================
 def _parse_nacl(text: str, parsed: Dict[str,str]):
+    _parse_pnv_after_anchor(text, parsed, "NaCl Tolerant (>=6%)")
 # ======================================================================
+# 8. Haemolysis Type (fixed)
 # ======================================================================
 def _parse_haemolysis_type(text: str, parsed: Dict[str,str]):
     m = re.search(
+        r"\bhaemolysis type\b\s*[:\-]?\s*(alpha|beta|gamma|none)",
+        text.lower()
     )
     if m:
         _set_if_stronger(parsed,"Haemolysis Type",m.group(1).capitalize())
+# ======================================================================
+# 9. Ornithine / Ornitihine Decarboxylase alias sync
+# ======================================================================
def _sync_ornithine(parsed: Dict[str,str]):
    """Copy the misspelled "Ornitihine Decarboxylase" value to the correct key.

    If *parsed* contains the "Ornitihine Decarboxylase" key, its value is
    offered to "Ornithine Decarboxylase" through ``_set_if_stronger``.  The
    copy is one-way; the misspelled key is left in place.

    Args:
        parsed: Mapping of field name -> value, updated in place.
    """
    misspelled_key = "Ornitihine Decarboxylase"
    if misspelled_key in parsed:
        _set_if_stronger(
            parsed,
            "Ornithine Decarboxylase",
            parsed[misspelled_key],
        )
 # ======================================================================
 # MAIN
 # ======================================================================
     _parse_tsi(cleaned, parsed)
     _parse_nacl(cleaned, parsed)
     _parse_haemolysis_type(cleaned, parsed)
+    _sync_ornithine(parsed)
+    # If Motility is explicitly negative anywhere → Motility Type negative
+    if parsed.get("Motility") == "Negative":
+        _set_if_stronger(parsed,"Motility Type","Negative")
     return {
         "parsed_fields": parsed,