Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Dec 23, 2025

Commit

83b046c

verified ·

1 Parent(s): 7132230

Update engine/parser_fusion.py

Browse files

Files changed (1) hide show

engine/parser_fusion.py +263 -367

engine/parser_fusion.py CHANGED Viewed

@@ -1,436 +1,332 @@
-# engine/parser_llm.py
 # ------------------------------------------------------------
-# Local LLM parser for BactAI-D (Flan-T5, CPU-friendly)
-# Third parser head: repair & recovery
 #
-# Drop-in patched version:
-# - Few-shot example count configurable via BACTAI_LLM_FEWSHOT (default 25)
-# - Field alias mapping (prevents silent field drops)
-# - Non-greedy JSON extraction (prevents regex over-capture)
-# - Improved P/N/V normalization (Flan phrasing coverage)
-# - Prompt refined for "extract/clarify" (reduces Unknown collapse)
-# - Debug prints (toggle via env var)
-# - Sugar logic scaffold preserved
 # ------------------------------------------------------------
 from __future__ import annotations
 import json
 import os
-import random
-import re
-from typing import Dict, Any, List, Optional
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-# ------------------------------------------------------------
-# Model configuration
-# ------------------------------------------------------------
-DEFAULT_MODEL = os.getenv(
-    "BACTAI_LLM_PARSER_MODEL",
-    "google/flan-t5-base",
-)
-# Maximum number of few-shot examples sampled into the prompt (default 25)
-MAX_FEWSHOT_EXAMPLES = int(os.getenv("BACTAI_LLM_FEWSHOT", "25"))
-MAX_NEW_TOKENS = int(os.getenv("BACTAI_LLM_MAX_NEW_TOKENS", "128"))
-# Debug visibility (prints raw model output + parsed dict)
-DEBUG_LLM = os.getenv("BACTAI_LLM_DEBUG", "1").strip().lower() in {"1", "true", "yes", "y", "on"}
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-_tokenizer: Optional[AutoTokenizer] = None
-_model: Optional[AutoModelForSeq2SeqLM] = None
-_GOLD_EXAMPLES: Optional[List[Dict[str, Any]]] = None
-# ------------------------------------------------------------
-# Allowed fields
-# ------------------------------------------------------------
-ALL_FIELDS: List[str] = [
-    "Gram Stain",
-    "Shape",
-    "Motility",
-    "Capsule",
-    "Spore Formation",
-    "Haemolysis",
-    "Haemolysis Type",
-    "Media Grown On",
-    "Colony Morphology",
-    "Oxygen Requirement",
-    "Growth Temperature",
-    "Catalase",
-    "Oxidase",
-    "Indole",
-    "Urease",
-    "Citrate",
-    "Methyl Red",
-    "VP",
-    "H2S",
-    "DNase",
-    "ONPG",
-    "Coagulase",
-    "Gelatin Hydrolysis",
-    "Esculin Hydrolysis",
-    "Nitrate Reduction",
-    "NaCl Tolerant (>=6%)",
-    "Lipase Test",
-    "Lysine Decarboxylase",
-    "Ornithine Decarboxylase",
-    "Ornitihine Decarboxylase",
-    "Arginine dihydrolase",
-    "Glucose Fermentation",
-    "Lactose Fermentation",
-    "Sucrose Fermentation",
-    "Maltose Fermentation",
-    "Mannitol Fermentation",
-    "Sorbitol Fermentation",
-    "Xylose Fermentation",
-    "Rhamnose Fermentation",
-    "Arabinose Fermentation",
-    "Raffinose Fermentation",
-    "Trehalose Fermentation",
-    "Inositol Fermentation",
-    "Gas Production",
-    "TSI Pattern",
-    "Colony Pattern",
-    "Pigment",
-    "Motility Type",
-    "Odor",
-]
-SUGAR_FIELDS = [
-    "Glucose Fermentation",
-    "Lactose Fermentation",
-    "Sucrose Fermentation",
-    "Maltose Fermentation",
-    "Mannitol Fermentation",
-    "Sorbitol Fermentation",
-    "Xylose Fermentation",
-    "Rhamnose Fermentation",
-    "Arabinose Fermentation",
-    "Raffinose Fermentation",
-    "Trehalose Fermentation",
-    "Inositol Fermentation",
-]
-PNV_FIELDS = set(
-    f for f in ALL_FIELDS
-    if f not in {
-        "Media Grown On",
-        "Colony Morphology",
-        "Growth Temperature",
-        "Gram Stain",
-        "Shape",
-        "Oxygen Requirement",
-        "Haemolysis Type",
-    }
-)
-# ------------------------------------------------------------
-# Field alias mapping (CRITICAL)
-# ------------------------------------------------------------
-FIELD_ALIASES: Dict[str, str] = {
-    # Gram
-    "Gram": "Gram Stain",
-    "Gram stain": "Gram Stain",
-    "Gram Stain Result": "Gram Stain",
-    # Salt tolerance
-    "NaCl tolerance": "NaCl Tolerant (>=6%)",
-    "NaCl Tolerant": "NaCl Tolerant (>=6%)",
-    "Salt tolerance": "NaCl Tolerant (>=6%)",
-    "Salt tolerant": "NaCl Tolerant (>=6%)",
-    "6.5% NaCl": "NaCl Tolerant (>=6%)",
-    "6% NaCl": "NaCl Tolerant (>=6%)",
-    # Temperature
-    "Growth temp": "Growth Temperature",
-    "Growth temperature": "Growth Temperature",
-    "Temperature growth": "Growth Temperature",
-    # Tests
-    "Catalase test": "Catalase",
-    "Oxidase test": "Oxidase",
-    "Indole test": "Indole",
-    "Urease test": "Urease",
-    "Citrate test": "Citrate",
-    # Sugars (common lowercase variants)
-    "Glucose fermentation": "Glucose Fermentation",
-    "Lactose fermentation": "Lactose Fermentation",
-    "Sucrose fermentation": "Sucrose Fermentation",
-    "Maltose fermentation": "Maltose Fermentation",
-    "Mannitol fermentation": "Mannitol Fermentation",
-    "Sorbitol fermentation": "Sorbitol Fermentation",
-    "Xylose fermentation": "Xylose Fermentation",
-    "Rhamnose fermentation": "Rhamnose Fermentation",
-    "Arabinose fermentation": "Arabinose Fermentation",
-    "Raffinose fermentation": "Raffinose Fermentation",
-    "Trehalose fermentation": "Trehalose Fermentation",
-    "Inositol fermentation": "Inositol Fermentation",
-}
 # ------------------------------------------------------------
-# Normalisation helpers
 # ------------------------------------------------------------
-def _norm_str(s: Any) -> str:
-    return str(s).strip() if s is not None else ""
-def _normalise_pnv_value(raw: Any) -> str:
     """
-    Expanded Flan-friendly normalization.
     """
-    s = _norm_str(raw).lower()
-    if not s:
-        return "Unknown"
-    if any(x in s for x in {"positive", "pos", "+", "yes", "present", "detected", "reactive"}):
-        return "Positive"
-    if any(x in s for x in {"negative", "neg", "-", "no", "none", "absent", "not detected", "no growth"}):
-        return "Negative"
-    if "variable" in s or "mixed" in s or "inconsistent" in s:
-        return "Variable"
-    return "Unknown"
-def _normalise_gram(raw: Any) -> str:
-    s = _norm_str(raw).lower()
-    if "positive" in s:
-        return "Positive"
-    if "negative" in s:
-        return "Negative"
-    if "variable" in s:
-        return "Variable"
-    return "Unknown"
-def _merge_ornithine_variants(fields: Dict[str, str]) -> Dict[str, str]:
-    v = fields.get("Ornithine Decarboxylase") or fields.get("Ornitihine Decarboxylase")
-    if v and v != "Unknown":
-        fields["Ornithine Decarboxylase"] = v
-        fields["Ornitihine Decarboxylase"] = v
-    return fields
-# ------------------------------------------------------------
-# Sugar logic (RESTORED)
-# ------------------------------------------------------------
-_NON_FERMENTER_PATTERNS = re.compile(
-    r"\b(non[-\s]?fermenter|non[-\s]?fermentative|asaccharolytic|"
-    r"does not ferment (sugars|carbohydrates)|no carbohydrate fermentation)\b",
-    re.IGNORECASE,
-)
-def _apply_global_sugar_logic(fields: Dict[str, str], original_text: str) -> Dict[str, str]:
     """
-    If phenotype text indicates global non-fermenter behaviour,
-    mark all sugar fields Negative unless explicitly overridden.
     """
-    if not _NON_FERMENTER_PATTERNS.search(original_text):
-        return fields
-    for sugar in SUGAR_FIELDS:
-        current = fields.get(sugar)
-        if current in {"Positive", "Variable"}:
-            continue
-        fields[sugar] = "Negative"
-    return fields
-# ------------------------------------------------------------
-# Gold examples
-# ------------------------------------------------------------
-def _get_project_root() -> str:
-    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-def _load_gold_examples() -> List[Dict[str, Any]]:
-    global _GOLD_EXAMPLES
-    if _GOLD_EXAMPLES is not None:
-        return _GOLD_EXAMPLES
-    path = os.path.join(_get_project_root(), "data", "llm_gold_examples.json")
-    try:
-        with open(path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-            _GOLD_EXAMPLES = data if isinstance(data, list) else []
-    except Exception:
-        _GOLD_EXAMPLES = []
-    return _GOLD_EXAMPLES
-# ------------------------------------------------------------
-# Prompt
-# ------------------------------------------------------------
-PROMPT_HEADER = """
-You are a microbiology expert assisting an automated phenotype parser.
-Your task is to EXTRACT OR CLARIFY phenotypic and biochemical test results
-from the input text.
-Rules:
-- Return ONLY valid JSON
-- Do NOT invent results
-- If a result is unclear or not stated, use "Unknown"
-- Prefer explicit statements over assumptions
-Output format:
-{
-  "parsed_fields": {
-    "Field Name": "Value",
-    ...
-  }
-}
-"""
-PROMPT_FOOTER = """
-Now process the following phenotype description.
-Input:
-\"\"\"<<PHENOTYPE>>\"\"\"
-Return ONLY the JSON object.
-"""
-def _build_prompt(text: str) -> str:
-    examples = _load_gold_examples()
-    n = min(MAX_FEWSHOT_EXAMPLES, len(examples))
-    sampled = random.sample(examples, n) if n > 0 else []
-    blocks: List[str] = [PROMPT_HEADER]
-    for ex in sampled:
-        inp = _norm_str(ex.get("input", ""))
-        exp = ex.get("expected", {})
-        if not isinstance(exp, dict):
-            exp = {}
-        blocks.append(
-            f'Input:\n"""{inp}"""\nOutput:\n'
-            f'{json.dumps({"parsed_fields": exp}, ensure_ascii=False)}\n'
-        )
-    blocks.append(PROMPT_FOOTER.replace("<<PHENOTYPE>>", text))
-    return "\n".join(blocks)
-# ------------------------------------------------------------
-# Model loader
-# ------------------------------------------------------------
-def _load_model() -> None:
-    global _model, _tokenizer
-    if _model is not None and _tokenizer is not None:
-        return
-    _tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
-    _model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).to(DEVICE)
-    _model.eval()
 # ------------------------------------------------------------
-# JSON extraction (non-greedy)
 # ------------------------------------------------------------
-_JSON_OBJECT_RE = re.compile(r"\{[\s\S]*?\}")
-def _extract_first_json_object(text: str) -> Dict[str, Any]:
     """
-    Extract the first JSON object from model output (non-greedy).
     """
-    m = _JSON_OBJECT_RE.search(text)
-    if not m:
-        return {}
-    try:
-        return json.loads(m.group(0))
-    except Exception:
-        return {}
-def _apply_field_aliases(fields_raw: Dict[str, Any]) -> Dict[str, Any]:
     """
-    Normalize keys via FIELD_ALIASES, preserving original values.
     """
-    out: Dict[str, Any] = {}
-    for k, v in fields_raw.items():
-        key = _norm_str(k)
-        if not key:
-            continue
-        mapped = FIELD_ALIASES.get(key, key)
-        out[mapped] = v
-    return out
-# ------------------------------------------------------------
-# PUBLIC API
-# ------------------------------------------------------------
-def parse_llm(text: str) -> Dict[str, Any]:
-    original = text or ""
-    if not original.strip():
-        return {"parsed_fields": {}, "source": "llm_parser", "raw": original}
-    _load_model()
-    assert _tokenizer is not None and _model is not None
-    prompt = _build_prompt(original)
-    # NOTE: Flan-T5 has a relatively small input length; truncation may occur.
-    inputs = _tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
-    with torch.no_grad():
-        output = _model.generate(
-            **inputs,
-            max_new_tokens=MAX_NEW_TOKENS,
-            do_sample=False,
-            temperature=0.0,
-        )
-    decoded = _tokenizer.decode(output[0], skip_special_tokens=True)
-    if DEBUG_LLM:
-        print("=== LLM RAW OUTPUT ===")
-        print(decoded)
-        print("======================")
-    parsed_obj = _extract_first_json_object(decoded)
-    fields_raw = parsed_obj.get("parsed_fields", {}) if isinstance(parsed_obj, dict) else {}
-    if not isinstance(fields_raw, dict):
-        fields_raw = {}
-    # Apply alias mapping so we don't silently drop values
-    fields_raw = _apply_field_aliases(fields_raw)
-    if DEBUG_LLM:
-        print("=== LLM PARSED_FIELDS (RAW) ===")
-        try:
-            print(json.dumps(fields_raw, indent=2, ensure_ascii=False))
-        except Exception:
-            print(fields_raw)
-        print("===============================")
-    cleaned: Dict[str, str] = {}
-    for field in ALL_FIELDS:
-        if field not in fields_raw:
-            continue
-        raw_val = fields_raw[field]
-        if field == "Gram Stain":
-            cleaned[field] = _normalise_gram(raw_val)
-        elif field in PNV_FIELDS:
-            cleaned[field] = _normalise_pnv_value(raw_val)
-        else:
-            cleaned[field] = _norm_str(raw_val) or "Unknown"
-    cleaned = _merge_ornithine_variants(cleaned)
-    cleaned = _apply_global_sugar_logic(cleaned, original)
     return {
-        "parsed_fields": cleaned,
-        "source": "llm_parser",
-        "raw": original,
-    }

+# engine/parser_fusion.py
 # ------------------------------------------------------------
+# Tri-Parser Fusion — Stage 12B (Weighted, SOTA-style)
 #
+# This module combines:
+#   - Rule parser (parser_rules.parse_text_rules)
+#   - Extended parser (parser_ext.parse_text_extended)
+#   - LLM parser (parser_llm.parse_llm)    [optional]
+#
+# using per-field reliability weights learned in Stage 12A
+# and stored in:
+#   data/field_weights.json
+#
+# Behaviour:
+#   - For each field, gather predictions from available parsers.
+#   - For that field, load weights:
+#          weights["fields"][field]  (if present)
+#          else weights["global"]
+#          else equal weights across available parsers
+#   - Discard parsers that:
+#          * did not predict the field
+#          * or only predicted "Unknown"
+#   - Group by predicted value and sum the weights of parsers
+#     that voted for each value.
+#   - Choose the value with highest total weight.
+#     Tie-break: prefer rules > extended > llm if needed.
+#
+# Output format:
+#   {
+#     "fused_fields": { field: value, ... },   # used by DB identifier AND genus ML
+#     "by_parser": {
+#       "rules": { ... },
+#       "extended": { ... },
+#       "llm": { ... }   # may be empty
+#     },
+#     "votes": {
+#       field_name: {
+#         "per_parser": {
+#           "rules": {"value": "Positive", "weight": 0.95},
+#           "extended": {"value": "Unknown", "weight": 0.03},
+#           ...
+#         },
+#         "summed": {
+#           "Positive": 0.95,
+#           "Negative": 0.02
+#         },
+#         "chosen": "Positive"
+#       },
+#       ...
+#     },
+#     "weights_meta": {
+#       "has_weights_file": True/False,
+#       "weights_path": "data/field_weights.json",
+#       "meta": { ... }  # from file if present
+#     }
+#   }
 # ------------------------------------------------------------
 from __future__ import annotations
 import json
 import os
+from typing import Any, Dict, Optional
+from engine.parser_rules import parse_text_rules
+from engine.parser_ext import parse_text_extended
+# Optional LLM parser
+try:
+    from engine.parser_llm import parse_llm as parse_text_llm  # type: ignore
+    HAS_LLM = True
+except Exception:
+    parse_text_llm = None  # type: ignore
+    HAS_LLM = False
+# Path to learned weights.
+# The path is relative, so it is resolved against the process's current
+# working directory at the moment the file is opened.
+FIELD_WEIGHTS_PATH = os.path.join("data", "field_weights.json")
+# Marker for "no usable prediction"; also the fused value when no parser votes.
+UNKNOWN = "Unknown"
+# Parser names in priority order: earlier entries win when summed weights tie.
+PARSER_ORDER = ["rules", "extended", "llm"]  # used for tie-breaking
 # ------------------------------------------------------------
+# Weights loading and helpers
 # ------------------------------------------------------------
+def _load_field_weights(path: str = FIELD_WEIGHTS_PATH) -> Dict[str, Any]:
     """
+    Load the JSON weights file produced by Stage 12A.
+
+    Expected structure:
+      {
+        "global": { "rules": 0.7, "extended": 0.2, "llm": 0.1 },
+        "fields": {
+          "DNase": {
+            "rules": 0.95,
+            "extended": 0.03,
+            "llm": 0.02,
+            "support": 123
+          },
+          ...
+        },
+        "meta": { ... }
+      }
+
+    Parameters:
+      path: location of the weights file (defaults to FIELD_WEIGHTS_PATH).
+
+    Returns:
+      The parsed top-level JSON object, or an empty dict when the file is
+      missing, cannot be read or parsed, or its top-level value is not a
+      JSON object. An empty dict triggers equal-weight behaviour later.
+      The structure above is not validated here.
     """
+    if not os.path.exists(path):
+        return {}
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            obj = json.load(f)
+        if isinstance(obj, dict):
+            return obj
+        return {}
+    # Deliberate best-effort: any read/parse failure degrades to "no weights".
+    except Exception:
+        return {}
+# Loaded once, when this module is imported; later changes to the file on
+# disk are not picked up by this module-level value.
+FIELD_WEIGHTS_RAW: Dict[str, Any] = _load_field_weights()
+# False when the file is missing or unreadable, and also when it parsed to an
+# empty object, because the loader returns {} in all of those cases.
+HAS_WEIGHTS_FILE: bool = bool(FIELD_WEIGHTS_RAW)
+def _normalise_scores(scores: Dict[str, float]) -> Dict[str, float]:
     """
+    Normalise a dict of parser -> score into weights summing to 1.
+
+    Negative scores are clamped to 0 before normalising.
+    If every clamped score is zero, each parser gets an equal share.
+    An empty input dict yields an empty dict.
+
+    NOTE(review): float(v) raises for values that cannot be converted to
+    float; callers are expected to pass numeric scores.
     """
+    cleaned = {k: max(0.0, float(v)) for k, v in scores.items()}
+    total = sum(cleaned.values())
+    if total <= 0:
+        # "or 1" guards the division when cleaned is empty
+        n = len(cleaned) or 1
+        return {k: 1.0 / n for k in cleaned}
+    return {k: v / total for k, v in cleaned.items()}
+def _get_base_weights_for_parsers(include_llm: bool) -> Dict[str, float]:
+    """
+    Build an equal-weight distribution across the available parsers.
+    Used when no learned weights are available.
+
+    Parameters:
+      include_llm: when True, "llm" is included alongside "rules" and
+        "extended".
+
+    Returns:
+      A dict mapping each included parser name to 1/n, where n is the
+      number of included parsers (2 or 3).
+    """
+    parsers = ["rules", "extended"]
+    if include_llm:
+        parsers.append("llm")
+    n = len(parsers) or 1
+    return {p: 1.0 / n for p in parsers}
+def _get_weights_for_field(field_name: str, include_llm: bool) -> Dict[str, float]:
+    """
+    Get normalised parser weights for a specific field.
+
+    Priority:
+      1) If FIELD_WEIGHTS_RAW["fields"][field_name] is a dict containing at
+         least one of the keys "rules", "extended", "llm", use those values.
+      2) Else if FIELD_WEIGHTS_RAW["global"] provides any of those keys,
+         use that.
+      3) Else equal weights.
+    In all cases:
+      - Keys other than the three parser names (e.g. "support") are ignored
+      - Drop 'llm' if include_llm == False
+      - Normalise via _normalise_scores
+
+    Returns:
+      A dict of parser name -> weight. A parser that is missing from the
+      selected weights entry is not added to the result.
+
+    NOTE(review): float(v) below raises ValueError/TypeError if the weights
+    file holds a non-numeric value (e.g. null) for a parser key; that error
+    is not caught here.
+    """
+    if not FIELD_WEIGHTS_RAW:
+        base = _get_base_weights_for_parsers(include_llm)
+        return _normalise_scores(base)
+    # "or {}" turns a null/empty block in the file into an empty dict
+    fields_block = FIELD_WEIGHTS_RAW.get("fields", {}) or {}
+    global_block = FIELD_WEIGHTS_RAW.get("global", {}) or {}
+    raw: Dict[str, float] = {}
+    field_entry = fields_block.get(field_name)
+    if isinstance(field_entry, dict):
+        for k, v in field_entry.items():
+            if k in ("rules", "extended", "llm"):
+                raw[k] = float(v)
+    # Fall back to the global block only when the field entry gave nothing
+    if not raw and isinstance(global_block, dict):
+        for k, v in global_block.items():
+            if k in ("rules", "extended", "llm"):
+                raw[k] = float(v)
+    if not raw:
+        raw = _get_base_weights_for_parsers(include_llm)
+    if not include_llm and "llm" in raw:
+        raw.pop("llm", None)
+    # Removing "llm" can empty the dict if the entry listed only that parser
+    if not raw:
+        raw = _get_base_weights_for_parsers(include_llm=False)
+    return _normalise_scores(raw)
 # ------------------------------------------------------------
+# Fusion logic
 # ------------------------------------------------------------
+def _clean_pred_value(val: Optional[str]) -> Optional[str]:
     """
+    Treat None, "", or explicit "Unknown" as missing for fusion.
+
+    The value is converted with str() and stripped of surrounding
+    whitespace; the comparison with "Unknown" is case-insensitive.
+
+    Returns:
+      None when the value counts as missing, otherwise the stripped string.
     """
+    if val is None:
+        return None
+    s = str(val).strip()
+    if not s:
+        return None
+    if s.lower() == UNKNOWN.lower():
+        return None
+    return s
+def parse_text_fused(text: str, use_llm: Optional[bool] = None) -> Dict[str, Any]:
     """
+    Main tri-fusion entrypoint.
+
+    Runs the rule and extended parsers (and optionally the LLM parser) on
+    the text, then picks one value per field by weighted vote.
+
+    Parameters
+    ----------
+    text : str
+        Input text; None or "" is passed to the parsers as "".
+    use_llm : bool or None
+        If True, include the LLM parser, provided it was imported
+        successfully; otherwise it is skipped.
+        If False, skip the LLM parser.
+        If None, include it if HAS_LLM.
+
+    Returns
+    -------
+    dict with the keys:
+      "fused_fields": field -> chosen value ("Unknown" if no parser voted)
+      "by_parser":    the raw parsed fields of each parser
+      "votes":        per-field per-parser values/weights, summed scores
+                      and the chosen value
+      "weights_meta": whether a weights file was loaded, its path and its
+                      "meta" block
     """
+    original = text or ""
+    include_llm = HAS_LLM if use_llm is None else bool(use_llm)
+    rules_out = parse_text_rules(original) or {}
+    ext_out = parse_text_extended(original) or {}
+    # NOTE(review): dict(...) would raise TypeError if a parser returned
+    # "parsed_fields": None — confirm the parsers never do that.
+    rules_fields = dict(rules_out.get("parsed_fields", {}))
+    ext_fields = dict(ext_out.get("parsed_fields", {}))
+    llm_fields: Dict[str, Any] = {}
+    if include_llm and parse_text_llm is not None:
+        try:
+            llm_out = parse_text_llm(original)
+            if isinstance(llm_out, dict):
+                if "parsed_fields" in llm_out:
+                    llm_fields = dict(llm_out.get("parsed_fields", {}))
+                else:
+                    # No "parsed_fields" key: treat the dict itself as the fields
+                    llm_fields = {str(k): v for k, v in llm_out.items()}
+        # A failing LLM call only empties its predictions; include_llm stays
+        # True, so "llm" still receives its weight share below.
+        except Exception:
+            llm_fields = {}
+    else:
+        # LLM not requested, or requested but its import failed
+        include_llm = False
+    by_parser: Dict[str, Dict[str, Any]] = {
+        "rules": rules_fields,
+        "extended": ext_fields,
+        "llm": llm_fields if include_llm else {},
+    }
+    candidate_fields = set(rules_fields.keys()) | set(ext_fields.keys()) | set(llm_fields.keys())
+    fused_fields: Dict[str, Any] = {}
+    votes_debug: Dict[str, Any] = {}
+    # Sorted so the output dicts have a deterministic field order
+    for field in sorted(candidate_fields):
+        weights = _get_weights_for_field(field, include_llm=include_llm)
+        parser_preds: Dict[str, Optional[str]] = {
+            "rules": _clean_pred_value(rules_fields.get(field)),
+            "extended": _clean_pred_value(ext_fields.get(field)),
+            "llm": _clean_pred_value(llm_fields.get(field)) if include_llm else None,
+        }
+        per_parser_info: Dict[str, Any] = {}
+        value_scores: Dict[str, float] = {}
+        for parser_name in PARSER_ORDER:
+            if parser_name == "llm" and not include_llm:
+                continue
+            pred = parser_preds.get(parser_name)
+            # A parser without a weight entry for this field votes with 0.0
+            w = float(weights.get(parser_name, 0.0))
+            per_parser_info[parser_name] = {
+                "value": pred if pred is not None else UNKNOWN,
+                "weight": w,
+            }
+            # Missing/"Unknown" predictions are recorded above but do not vote
+            if pred is None:
+                continue
+            value_scores[pred] = value_scores.get(pred, 0.0) + w
+        if not value_scores:
+            fused_value = UNKNOWN
+        else:
+            max_score = max(value_scores.values())
+            # Ties are detected by exact float equality of the summed weights
+            best_values = [v for v, s in value_scores.items() if s == max_score]
+            if len(best_values) == 1:
+                fused_value = best_values[0]
+            else:
+                fused_value = best_values[0]
+                # Tie-break: take the tied value voted by the earliest
+                # parser in PARSER_ORDER
+                for parser_name in PARSER_ORDER:
+                    if parser_name == "llm" and not include_llm:
+                        continue
+                    pred = parser_preds.get(parser_name)
+                    if pred in best_values:
+                        fused_value = pred
+                        break
+        fused_fields[field] = fused_value
+        votes_debug[field] = {
+            "per_parser": per_parser_info,
+            "summed": value_scores,
+            "chosen": fused_value,
+        }
+    weights_meta = {
+        "has_weights_file": HAS_WEIGHTS_FILE,
+        "weights_path": FIELD_WEIGHTS_PATH,
+        "meta": FIELD_WEIGHTS_RAW.get("meta", {}) if HAS_WEIGHTS_FILE else {},
+    }
     return {
+        "fused_fields": fused_fields,
+        "by_parser": by_parser,
+        "votes": votes_debug,
+        "weights_meta": weights_meta,
+    }