Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Dec 28, 2025

Commit

1c100b5

verified ·

1 Parent(s): d358339

Update engine/parser_fusion.py

Browse files

Files changed (1) hide show

engine/parser_fusion.py +480 -258

engine/parser_fusion.py CHANGED Viewed

@@ -1,334 +1,556 @@
-# engine/parser_fusion.py
 # ------------------------------------------------------------
-# Tri-Parser Fusion — Stage 12B (Weighted, SOTA-style)
 #
-# This module combines:
-#   - Rule parser      (parser_rules.parse_text_rules)
-#   - Extended parser  (parser_ext.parse_text_extended)
-#   - LLM parser       (parser_llm.parse_llm)    [optional]
-#
-# using per-field reliability weights learned in Stage 12A
-# and stored in:
-#   data/field_weights.json
-#
-# Behaviour:
-#   - For each field, gather predictions from available parsers.
-#   - For that field, load weights:
-#         field_weights[field]  (if present)
-#         else global weights
-#         else equal weights across available parsers
-#   - Discard parsers that:
-#         * did not predict the field
-#         * or only predicted "Unknown"
-#   - Group by predicted value and sum the weights of parsers
-#     that voted for each value.
-#   - Choose the value with highest total weight.
-#     Tie-break: prefer rules > extended > llm if needed.
-#
-# Output format:
-#   {
-#     "fused_fields": { field: value, ... },   # used by DB identifier AND genus ML
-#     "by_parser": {
-#       "rules": { ... },
-#       "extended": { ... },
-#       "llm": { ... }   # may be empty
-#     },
-#     "votes": {
-#       field_name: {
-#         "per_parser": {
-#           "rules": {"value": "Positive", "weight": 0.95},
-#           "extended": {"value": "Unknown", "weight": 0.03},
-#           ...
-#         },
-#         "summed": {
-#           "Positive": 0.97,
-#           "Negative": 0.02
-#         },
-#         "chosen": "Positive"
-#       },
-#       ...
-#     },
-#     "weights_meta": {
-#       "has_weights_file": True/False,
-#       "weights_path": "data/field_weights.json",
-#       "meta": { ... }  # from file if present
-#     }
-#   }
 # ------------------------------------------------------------
 from __future__ import annotations
 import json
 import os
-from typing import Any, Dict, Optional
-from engine.parser_rules import parse_text_rules
-from engine.parser_ext import parse_text_extended
-# Optional LLM parser
-try:
-    from engine.parser_llm import parse_llm as parse_text_llm  # type: ignore
-    HAS_LLM = True
-except Exception:
-    parse_text_llm = None  # type: ignore
-    HAS_LLM = False
-# Path to learned weights
-FIELD_WEIGHTS_PATH = os.path.join("data", "field_weights.json")
-UNKNOWN = "Unknown"
-PARSER_ORDER = ["rules", "extended", "llm"]  # tie-breaking priority
 # ------------------------------------------------------------
-# Weights loading and helpers
 # ------------------------------------------------------------
-def _load_field_weights(path: str = FIELD_WEIGHTS_PATH) -> Dict[str, Any]:
-    """
-    Load the JSON weights file produced by Stage 12A.
-    Expected structure:
-      {
-        "global": { "rules": 0.7, "extended": 0.2, "llm": 0.1 },
-        "fields": {
-          "DNase": {
-            "rules": 0.95,
-            "extended": 0.03,
-            "llm": 0.02,
-            "support": 123
-          },
-          ...
-        },
-        "meta": { ... }
-      }
-    If the file is missing or broken, fall back to empty dict,
-    triggering equal-weight behaviour later.
-    """
-    if not os.path.exists(path):
-        return {}
     try:
         with open(path, "r", encoding="utf-8") as f:
-            obj = json.load(f)
-        return obj if isinstance(obj, dict) else {}
     except Exception:
-        return {}
-FIELD_WEIGHTS_RAW: Dict[str, Any] = _load_field_weights()
-HAS_WEIGHTS_FILE: bool = bool(FIELD_WEIGHTS_RAW)
-def _normalise_scores(scores: Dict[str, float]) -> Dict[str, float]:
-    """
-    Normalise parser -> score into weights summing to 1.
-    If all scores are zero or dict is empty, return equal weights.
-    """
-    cleaned = {k: max(0.0, float(v)) for k, v in scores.items()}
-    total = sum(cleaned.values())
-    if total <= 0:
-        n = len(cleaned) or 1
-        return {k: 1.0 / n for k in cleaned}
-    return {k: v / total for k, v in cleaned.items()}
-def _get_base_weights_for_parsers(include_llm: bool) -> Dict[str, float]:
-    """
-    Equal-weight distribution across available parsers.
-    Used when no learned weights are available.
-    """
-    parsers = ["rules", "extended"]
-    if include_llm:
-        parsers.append("llm")
-    n = len(parsers) or 1
-    return {p: 1.0 / n for p in parsers}
-def _get_weights_for_field(field_name: str, include_llm: bool) -> Dict[str, float]:
-    """
-    Get weights for a specific field.
-    Priority:
-      1) FIELD_WEIGHTS_RAW["fields"][field_name]
-      2) FIELD_WEIGHTS_RAW["global"]
-      3) Equal weights
-    Always:
-      - Drop 'llm' if include_llm == False
-      - Normalise
-    """
-    if not FIELD_WEIGHTS_RAW:
-        return _normalise_scores(_get_base_weights_for_parsers(include_llm))
-    fields_block = FIELD_WEIGHTS_RAW.get("fields", {}) or {}
-    global_block = FIELD_WEIGHTS_RAW.get("global", {}) or {}
-    raw: Dict[str, float] = {}
-    field_entry = fields_block.get(field_name)
-    if isinstance(field_entry, dict):
-        for k, v in field_entry.items():
-            if k in ("rules", "extended", "llm"):
-                raw[k] = float(v)
-    if not raw and isinstance(global_block, dict):
-        for k, v in global_block.items():
-            if k in ("rules", "extended", "llm"):
-                raw[k] = float(v)
-    if not raw:
-        raw = _get_base_weights_for_parsers(include_llm)
-    if not include_llm:
-        raw.pop("llm", None)
-    if not raw:
-        raw = _get_base_weights_for_parsers(include_llm=False)
-    return _normalise_scores(raw)
-# ------------------------------------------------------------
-# Fusion logic
-# ------------------------------------------------------------
-def _clean_pred_value(val: Optional[str]) -> Optional[str]:
     """
-    Treat None, empty string, or explicit "Unknown" as missing.
     """
-    if val is None:
-        return None
-    s = str(val).strip()
-    if not s:
-        return None
-    if s.lower() == UNKNOWN.lower():
-        return None
-    return s
-def parse_text_fused(text: str, use_llm: Optional[bool] = None) -> Dict[str, Any]:
     """
-    Main tri-parser fusion entrypoint.
     Parameters
     ----------
     text : str
-    use_llm : bool or None
-        True  -> include LLM
-        False -> exclude LLM
-        None  -> include if available
     Returns
     -------
-    Dict[str, Any]
-        Full fusion output including votes and per-parser breakdowns.
     """
     original = text or ""
-    include_llm = HAS_LLM if use_llm is None else bool(use_llm)
-    rules_out = parse_text_rules(original) or {}
-    ext_out = parse_text_extended(original) or {}
-    rules_fields = dict(rules_out.get("parsed_fields", {}))
-    ext_fields = dict(ext_out.get("parsed_fields", {}))
-    llm_fields: Dict[str, Any] = {}
-    if include_llm and parse_text_llm is not None:
-        try:
-            llm_out = parse_text_llm(original)
-            if isinstance(llm_out, dict):
-                if "parsed_fields" in llm_out:
-                    llm_fields = dict(llm_out.get("parsed_fields", {}))
-                else:
-                    llm_fields = {str(k): v for k, v in llm_out.items()}
-        except Exception:
-            llm_fields = {}
-    else:
-        include_llm = False
-    by_parser: Dict[str, Dict[str, Any]] = {
-        "rules": rules_fields,
-        "extended": ext_fields,
-        "llm": llm_fields if include_llm else {},
-    }
-    candidate_fields = (
-        set(rules_fields.keys())
-        | set(ext_fields.keys())
-        | set(llm_fields.keys())
-    )
-    fused_fields: Dict[str, Any] = {}
-    votes_debug: Dict[str, Any] = {}
-    for field in sorted(candidate_fields):
-        weights = _get_weights_for_field(field, include_llm)
-        parser_preds: Dict[str, Optional[str]] = {
-            "rules": _clean_pred_value(rules_fields.get(field)),
-            "extended": _clean_pred_value(ext_fields.get(field)),
-            "llm": _clean_pred_value(llm_fields.get(field)) if include_llm else None,
-        }
-        per_parser_info: Dict[str, Any] = {}
-        value_scores: Dict[str, float] = {}
-        for parser_name in PARSER_ORDER:
-            if parser_name == "llm" and not include_llm:
-                continue
-            pred = parser_preds.get(parser_name)
-            w = float(weights.get(parser_name, 0.0))
-            per_parser_info[parser_name] = {
-                "value": pred if pred is not None else UNKNOWN,
-                "weight": w,
-            }
-            if pred is not None:
-                value_scores[pred] = value_scores.get(pred, 0.0) + w
-        if not value_scores:
-            fused_value = UNKNOWN
-        else:
-            max_score = max(value_scores.values())
-            best_values = [v for v, s in value_scores.items() if s == max_score]
-            if len(best_values) == 1:
-                fused_value = best_values[0]
-            else:
-                fused_value = best_values[0]
-                for parser_name in PARSER_ORDER:
-                    if parser_name == "llm" and not include_llm:
-                        continue
-                    if parser_preds.get(parser_name) in best_values:
-                        fused_value = parser_preds[parser_name]  # type: ignore
-                        break
-        fused_fields[field] = fused_value
-        votes_debug[field] = {
-            "per_parser": per_parser_info,
-            "summed": value_scores,
-            "chosen": fused_value,
-        }
-    weights_meta = {
-        "has_weights_file": HAS_WEIGHTS_FILE,
-        "weights_path": FIELD_WEIGHTS_PATH,
-        "meta": FIELD_WEIGHTS_RAW.get("meta", {}) if HAS_WEIGHTS_FILE else {},
     }
-    return {
-        "fused_fields": fused_fields,
-        "by_parser": by_parser,
-        "votes": votes_debug,
-        "weights_meta": weights_meta,
-    }

+# engine/parser_llm.py
 # ------------------------------------------------------------
+# Local LLM parser for BactAI-D (T5 fine-tune, CPU-friendly)
 #
+# UPDATED (EphBactAID integration):
+# - Default model now points to your HF fine-tune: EphAsad/EphBactAID
+# - Few-shot disabled by default (your fine-tune no longer needs it)
+# - Robust output parsing:
+#     * Supports JSON output (legacy)
+#     * Supports "Key: Value" pairs output (your fine-tune style)
+# - Merge guard (optional): LLM fills ONLY missing/Unknown fields
+# - Validation/normalisation kept (PNV/Gram, sugar logic, aliases, ornithine sync)
 # ------------------------------------------------------------
 from __future__ import annotations
 import json
 import os
+import random
+import re
+from typing import Dict, Any, List, Optional
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# ------------------------------------------------------------
+# Model configuration
+# ------------------------------------------------------------
+# ✅ Your fine-tuned model (can be overridden via env var)
+# Model id handed to ``from_pretrained`` in ``_load_model``; override it with
+# the BACTAI_LLM_PARSER_MODEL environment variable.
+DEFAULT_MODEL = os.getenv(
+    "BACTAI_LLM_PARSER_MODEL",
+    "EphAsad/EphBactAID",
+)
+# ✅ Few-shot OFF by default now (fine-tune doesn't need it)
+# Upper bound on the gold examples ``_build_prompt`` samples into the prompt.
+# NOTE(review): ``int(...)`` raises ValueError at import time when the
+# environment variable is set to a non-integer string.
+MAX_FEWSHOT_EXAMPLES = int(os.getenv("BACTAI_LLM_FEWSHOT", "0"))
+# Passed as ``max_new_tokens`` to ``generate`` in ``parse_llm``.
+MAX_NEW_TOKENS = int(os.getenv("BACTAI_LLM_MAX_NEW_TOKENS", "256"))
+# When enabled, ``parse_llm`` prints the prompt and raw model output and adds
+# a "decoded" entry to its result.
+DEBUG_LLM = os.getenv("BACTAI_LLM_DEBUG", "0").strip().lower() in {
+    "1", "true", "yes", "y", "on"
+}
+# Use the GPU when torch reports CUDA as available, otherwise the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Module-level caches, filled on first use by ``_load_model`` and
+# ``_load_gold_examples`` respectively.
+_tokenizer: Optional[AutoTokenizer] = None
+_model: Optional[AutoModelForSeq2SeqLM] = None
+_GOLD_EXAMPLES: Optional[List[Dict[str, Any]]] = None
 # ------------------------------------------------------------
+# Allowed fields
 # ------------------------------------------------------------
+# Canonical schema keys. ``_clean_and_normalise`` keeps only these keys and
+# emits them in this order.
+ALL_FIELDS: List[str] = [
+    "Gram Stain",
+    "Shape",
+    "Motility",
+    "Capsule",
+    "Spore Formation",
+    "Haemolysis",
+    "Haemolysis Type",
+    "Media Grown On",
+    "Colony Morphology",
+    "Oxygen Requirement",
+    "Growth Temperature",
+    "Catalase",
+    "Oxidase",
+    "Indole",
+    "Urease",
+    "Citrate",
+    "Methyl Red",
+    "VP",
+    "H2S",
+    "DNase",
+    "ONPG",
+    "Coagulase",
+    "Gelatin Hydrolysis",
+    "Esculin Hydrolysis",
+    "Nitrate Reduction",
+    "NaCl Tolerant (>=6%)",
+    "Lipase Test",
+    "Lysine Decarboxylase",
+    "Ornithine Decarboxylase",
+    # Misspelt twin of the key above, kept as a schema key on purpose;
+    # ``_merge_ornithine_variants`` copies a known value onto both spellings.
+    "Ornitihine Decarboxylase",
+    "Arginine dihydrolase",
+    "Glucose Fermentation",
+    "Lactose Fermentation",
+    "Sucrose Fermentation",
+    "Maltose Fermentation",
+    "Mannitol Fermentation",
+    "Sorbitol Fermentation",
+    "Xylose Fermentation",
+    "Rhamnose Fermentation",
+    "Arabinose Fermentation",
+    "Raffinose Fermentation",
+    "Trehalose Fermentation",
+    "Inositol Fermentation",
+    "Gas Production",
+    "TSI Pattern",
+    "Colony Pattern",
+    "Pigment",
+    "Motility Type",
+    "Odor",
+]
+# Sugar fermentation fields that ``_apply_global_sugar_logic`` sets to
+# "Negative" when the input text describes a non-fermenter, unless the field
+# is already "Positive" or "Variable".
+SUGAR_FIELDS = [
+    "Glucose Fermentation",
+    "Lactose Fermentation",
+    "Sucrose Fermentation",
+    "Maltose Fermentation",
+    "Mannitol Fermentation",
+    "Sorbitol Fermentation",
+    "Xylose Fermentation",
+    "Rhamnose Fermentation",
+    "Arabinose Fermentation",
+    "Raffinose Fermentation",
+    "Trehalose Fermentation",
+    "Inositol Fermentation",
+]
+# Fields whose values are normalised with ``_normalise_pnv_value``: every
+# schema key except the ones excluded below. "Gram Stain" is excluded because
+# ``_clean_and_normalise`` routes it through ``_normalise_gram`` instead; the
+# other excluded keys are passed through as stripped text.
+PNV_FIELDS = {
+    f for f in ALL_FIELDS
+    if f not in {
+        "Media Grown On",
+        "Colony Morphology",
+        "Growth Temperature",
+        "Gram Stain",
+        "Shape",
+        "Oxygen Requirement",
+        "Haemolysis Type",
+        "TSI Pattern",
+        "Colony Pattern",
+        "Motility Type",
+        "Odor",
+        "Pigment",
+        "Gas Production",
+    }
+}
+# ------------------------------------------------------------
+# Field alias mapping (CRITICAL)
+# ------------------------------------------------------------
+# Alternative field names (left) mapped to the canonical schema key (right).
+# ``_apply_field_aliases`` applies this table to the keys extracted from the
+# model output before ``_clean_and_normalise`` filters on ``ALL_FIELDS``.
+FIELD_ALIASES: Dict[str, str] = {
+    "Gram": "Gram Stain",
+    "Gram stain": "Gram Stain",
+    "Gram Stain Result": "Gram Stain",
+    "NaCl tolerance": "NaCl Tolerant (>=6%)",
+    "NaCl Tolerant": "NaCl Tolerant (>=6%)",
+    "Salt tolerance": "NaCl Tolerant (>=6%)",
+    "Salt tolerant": "NaCl Tolerant (>=6%)",
+    "6.5% NaCl": "NaCl Tolerant (>=6%)",
+    "6% NaCl": "NaCl Tolerant (>=6%)",
+    "Growth temp": "Growth Temperature",
+    "Growth temperature": "Growth Temperature",
+    "Temperature growth": "Growth Temperature",
+    "Catalase test": "Catalase",
+    "Oxidase test": "Oxidase",
+    "Indole test": "Indole",
+    "Urease test": "Urease",
+    "Citrate test": "Citrate",
+    "Glucose fermentation": "Glucose Fermentation",
+    "Lactose fermentation": "Lactose Fermentation",
+    "Sucrose fermentation": "Sucrose Fermentation",
+    "Maltose fermentation": "Maltose Fermentation",
+    "Mannitol fermentation": "Mannitol Fermentation",
+    "Sorbitol fermentation": "Sorbitol Fermentation",
+    "Xylose fermentation": "Xylose Fermentation",
+    "Rhamnose fermentation": "Rhamnose Fermentation",
+    "Arabinose fermentation": "Arabinose Fermentation",
+    "Raffinose fermentation": "Raffinose Fermentation",
+    "Trehalose fermentation": "Trehalose Fermentation",
+    "Inositol fermentation": "Inositol Fermentation",
+    # common variants from outputs
+    # (both the en-dash and the ASCII-hyphen spellings are listed)
+    "Voges–Proskauer Test": "VP",
+    "Voges-Proskauer Test": "VP",
+    "Voges–Proskauer": "VP",
+    "Voges-Proskauer": "VP",
+}
+# ------------------------------------------------------------
+# Normalisation helpers
+# ------------------------------------------------------------
+def _norm_str(s: Any) -> str:
+    """Return ``str(s)`` without surrounding whitespace; ``None`` becomes ``""``."""
+    if s is None:
+        return ""
+    return str(s).strip()
+# Negated phrases are tested first because they contain positive keywords
+# ("not detected" contains "detected", "non-reactive" contains "reactive").
+_PNV_NEGATED_RE = re.compile(
+    r"\b(?:(?:not|none)\s+(?:detected|present|reactive|observed)"
+    r"|non[-\s]?reactive|undetected)\b"
+)
+# A literal "+" anywhere counts as positive; keywords must be whole words.
+_PNV_POSITIVE_RE = re.compile(
+    r"\+|\b(?:pos(?:itive(?:ly)?)?|yes|present|detected|reactive)\b"
+)
+# "-" only counts as a minus sign when it is not glued to the end of a word
+# (so "beta-haemolytic" is not read as negative); "-ve" is accepted.
+_PNV_NEGATIVE_RE = re.compile(
+    r"\b(?:neg(?:ative(?:ly)?)?|no|none|absent)\b"
+    r"|(?<![a-z0-9])-(?:ve\b|(?![a-z0-9]))"
+)
+_PNV_VARIABLE_RE = re.compile(r"\b(?:variable|mixed|inconsistent)\b")
+def _normalise_pnv_value(raw: Any) -> str:
+    """
+    Map a free-text test result onto Positive / Negative / Variable / Unknown.
+    Matching is done on whole words rather than raw substrings, so that
+    "unknown" is no longer read as "no" (Negative) and "not detected" /
+    "non-reactive" are no longer read as "detected" / "reactive" (Positive).
+    Precedence is: negated phrase, positive, negative, variable.
+    Parameters
+    ----------
+    raw : Any
+        Value to normalise; ``None`` and blank strings give "Unknown".
+    Returns
+    -------
+    str
+        One of "Positive", "Negative", "Variable", "Unknown".
+    """
+    s = "" if raw is None else str(raw).strip().lower()
+    if not s:
+        return "Unknown"
+    if _PNV_NEGATED_RE.search(s):
+        return "Negative"
+    if _PNV_POSITIVE_RE.search(s):
+        return "Positive"
+    if _PNV_NEGATIVE_RE.search(s):
+        return "Negative"
+    if _PNV_VARIABLE_RE.search(s):
+        return "Variable"
+    return "Unknown"
+def _normalise_gram(raw: Any) -> str:
+    """
+    Normalise a Gram stain value to Positive / Negative / Variable / Unknown.
+    The first keyword found as a substring of the lower-cased value wins,
+    checked in the order positive, negative, variable.
+    """
+    lowered = _norm_str(raw).lower()
+    keyword_to_label = (
+        ("positive", "Positive"),
+        ("negative", "Negative"),
+        ("variable", "Variable"),
+    )
+    for keyword, label in keyword_to_label:
+        if keyword in lowered:
+            return label
+    return "Unknown"
+def _merge_ornithine_variants(fields: Dict[str, str]) -> Dict[str, str]:
+    """
+    Keep the correctly spelt and the misspelt ornithine keys in sync.
+    The first of the two keys holding a known value (non-empty and not
+    "Unknown") supplies the value written to both. Previously an "Unknown"
+    on the correctly spelt key hid a real value stored under the misspelt
+    key, so nothing was synced.
+    Parameters
+    ----------
+    fields : dict
+        Normalised field values; modified in place.
+    Returns
+    -------
+    dict
+        The same ``fields`` object.
+    """
+    canonical = "Ornithine Decarboxylase"
+    misspelt = "Ornitihine Decarboxylase"
+    for key in (canonical, misspelt):
+        v = fields.get(key)
+        if v and v != "Unknown":
+            fields[canonical] = v
+            fields[misspelt] = v
+            break
+    return fields
+# ------------------------------------------------------------
+# Sugar logic
+# ------------------------------------------------------------
+# Case-insensitive, word-bounded phrases that mark the organism as a
+# non-fermenter. ``_apply_global_sugar_logic`` searches the original input
+# text (not the model output) with this pattern.
+_NON_FERMENTER_PATTERNS = re.compile(
+    r"\b("
+    r"non[-\s]?fermenter|"
+    r"non[-\s]?fermentative|"
+    r"asaccharolytic|"
+    r"does not ferment (sugars|carbohydrates)|"
+    r"no carbohydrate fermentation"
+    r")\b",
+    re.IGNORECASE,
+)
+def _apply_global_sugar_logic(fields: Dict[str, str], original_text: str) -> Dict[str, str]:
+    """
+    Force sugar fields to "Negative" when the text describes a non-fermenter.
+    ``fields`` is returned untouched unless ``_NON_FERMENTER_PATTERNS``
+    matches ``original_text``. On a match, every key in ``SUGAR_FIELDS`` is
+    set to "Negative" except those already "Positive" or "Variable"; sugar
+    keys missing from ``fields`` are added. The dict is modified in place
+    and returned.
+    """
+    if _NON_FERMENTER_PATTERNS.search(original_text) is None:
+        return fields
+    explicit_results = {"Positive", "Variable"}
+    for sugar_name in SUGAR_FIELDS:
+        if fields.get(sugar_name) not in explicit_results:
+            fields[sugar_name] = "Negative"
+    return fields
+# ------------------------------------------------------------
+# Gold examples (kept for backwards compat; now optional)
+# ------------------------------------------------------------
+def _get_project_root() -> str:
+    """Return the absolute path of the directory two levels above this file."""
+    this_file = os.path.abspath(__file__)
+    containing_dir = os.path.dirname(this_file)
+    return os.path.dirname(containing_dir)
+def _load_gold_examples() -> List[Dict[str, Any]]:
+    """
+    Return the cached gold examples, loading them on the first call.
+    Reads ``data/llm_gold_examples.json`` under the project root. If the
+    file cannot be read or parsed, or its top-level value is not a list,
+    an empty list is cached instead.
+    """
+    global _GOLD_EXAMPLES
+    if _GOLD_EXAMPLES is None:
+        examples_path = os.path.join(_get_project_root(), "data", "llm_gold_examples.json")
+        try:
+            with open(examples_path, "r", encoding="utf-8") as handle:
+                payload = json.load(handle)
+        except Exception:
+            # Best effort: any failure here just means "no examples".
+            payload = None
+        _GOLD_EXAMPLES = payload if isinstance(payload, list) else []
+    return _GOLD_EXAMPLES
+# ------------------------------------------------------------
+# Prompt (supports both JSON + KV outputs; fine-tune usually KV)
+# ------------------------------------------------------------
+# Instruction text placed first in every prompt built by ``_build_prompt``.
+PROMPT_HEADER = """
+You are a microbiology phenotype parser.
+Task:
+- Extract ONLY explicitly stated results from the input text.
+- Do NOT invent results.
+- If not stated, omit the field or use "Unknown".
+Output format:
+- Prefer "Field: Value" lines, one per line.
+- You may also output JSON if instructed.
+Use the exact schema keys where possible.
+"""
+# Closing section of the prompt; ``_build_prompt`` replaces the
+# ``<<PHENOTYPE>>`` placeholder with the input text.
+PROMPT_FOOTER = """
+Input:
+\"\"\"<<PHENOTYPE>>\"\"\"
+Output:
+"""
+def _build_prompt(text: str) -> str:
+    """
+    Assemble the model prompt for ``text``.
+    The prompt is ``PROMPT_HEADER``, then (only when ``MAX_FEWSHOT_EXAMPLES``
+    is positive) up to that many randomly sampled gold examples rendered as
+    "Key: Value" lines, then ``PROMPT_FOOTER`` with ``text`` substituted for
+    the ``<<PHENOTYPE>>`` placeholder. Sections are joined with newlines.
+    """
+    sections: List[str] = [PROMPT_HEADER]
+    if MAX_FEWSHOT_EXAMPLES > 0:
+        pool = _load_gold_examples()
+        sample_size = min(MAX_FEWSHOT_EXAMPLES, len(pool))
+        chosen = random.sample(pool, sample_size) if sample_size > 0 else []
+        for example in chosen:
+            example_input = _norm_str(example.get("input", ""))
+            expected = example.get("expected", {})
+            # A non-dict "expected" is rendered as an empty output block.
+            pairs = expected.items() if isinstance(expected, dict) else ()
+            rendered = "\n".join(f"{key}: {value}" for key, value in pairs)
+            sections.append(
+                f'Example Input:\n"""{example_input}"""\nExample Output:\n{rendered}\n'
+            )
+    sections.append(PROMPT_FOOTER.replace("<<PHENOTYPE>>", text))
+    return "\n".join(sections)
+# ------------------------------------------------------------
+# Model loader
+# ------------------------------------------------------------
+def _load_model() -> None:
+    """
+    Load the tokenizer and seq2seq model into the module-level caches.
+    Does nothing when both are already set. Otherwise both are (re)loaded
+    from ``DEFAULT_MODEL``, the model is moved to ``DEVICE`` and switched to
+    eval mode.
+    """
+    global _model, _tokenizer
+    already_loaded = _model is not None and _tokenizer is not None
+    if not already_loaded:
+        _tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
+        _model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).to(DEVICE)
+        _model.eval()
+# ------------------------------------------------------------
+# Output parsing helpers (JSON + KV)
+# ------------------------------------------------------------
+# Retained so the module-level name stays available; it stops at the first
+# "}" and therefore cannot capture nested objects, so the extractor below
+# no longer relies on it.
+_JSON_OBJECT_RE = re.compile(r"\{[\s\S]*?\}")
+def _extract_first_json_object(text: str) -> Dict[str, Any]:
+    """
+    Return the first JSON object that can be decoded from ``text``.
+    Each "{" is tried in turn with ``json.JSONDecoder.raw_decode``, which
+    parses one complete value and ignores trailing text. Unlike a non-greedy
+    ``{...}`` regex this handles nested objects such as
+    ``{"parsed_fields": {...}}``.
+    Parameters
+    ----------
+    text : str
+        Raw model output; ``None`` or empty text yields ``{}``.
+    Returns
+    -------
+    dict
+        The decoded object, or ``{}`` when no JSON object is found.
+    """
+    haystack = text or ""
+    decoder = json.JSONDecoder()
+    start = haystack.find("{")
+    while start != -1:
+        try:
+            candidate, _end = decoder.raw_decode(haystack, start)
+        except ValueError:
+            # Not valid JSON from this brace; try the next one.
+            candidate = None
+        if isinstance(candidate, dict):
+            return candidate
+        start = haystack.find("{", start + 1)
+    return {}
+# One "Key: Value" pair per line. The key is 2-120 characters without a
+# colon, so symbols such as ">=6%" are allowed in keys.
+_KV_LINE_RE = re.compile(r"^\s*([^:\n]{2,120})\s*:\s*(.*?)\s*$")
+def _extract_kv_pairs(text: str) -> Dict[str, Any]:
+    """
+    Collect "Key: Value" lines from ``text`` into a dict, e.g.
+      Gram Stain: Positive
+      Shape: Cocci
+    Blank lines and lines that do not match ``_KV_LINE_RE`` are skipped.
+    Keys and values are stripped; a repeated key keeps its last value.
+    """
+    pairs: Dict[str, Any] = {}
+    stripped_lines = (raw_line.strip() for raw_line in (text or "").splitlines())
+    for candidate in stripped_lines:
+        match = _KV_LINE_RE.match(candidate) if candidate else None
+        if match is None:
+            continue
+        key = _norm_str(match.group(1))
+        value = _norm_str(match.group(2))
+        if key:
+            pairs[key] = value
+    return pairs
+def _apply_field_aliases(fields_raw: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Rename raw keys to their canonical schema names.
+    Resolution order for each stripped key:
+      1) exact match in ``FIELD_ALIASES``
+      2) case-insensitive match against ``ALL_FIELDS``
+      3) case-insensitive match against ``FIELD_ALIASES``
+      4) otherwise the key is kept unchanged
+    Steps 2 and 3 let outputs such as "gram stain" or "CATALASE" survive the
+    later schema filter instead of being dropped. When several raw keys map
+    to the same canonical name, a blank or "Unknown" value does not replace
+    a value that was already stored.
+    Parameters
+    ----------
+    fields_raw : dict
+        Keys and values as extracted from the model output.
+    Returns
+    -------
+    dict
+        New dict keyed by canonical names where a mapping was found.
+    """
+    alias_by_fold = {alias.casefold(): target for alias, target in FIELD_ALIASES.items()}
+    schema_by_fold = {name.casefold(): name for name in ALL_FIELDS}
+    out: Dict[str, Any] = {}
+    for k, v in fields_raw.items():
+        key = _norm_str(k)
+        if not key:
+            continue
+        folded = key.casefold()
+        mapped = (
+            FIELD_ALIASES.get(key)
+            or schema_by_fold.get(folded)
+            or alias_by_fold.get(folded)
+            or key
+        )
+        # Do not let an uninformative duplicate wipe out a stored value.
+        if mapped in out and _norm_str(v).casefold() in {"", "unknown"}:
+            continue
+        out[mapped] = v
+    return out
+def _clean_and_normalise(fields_raw: Dict[str, Any], original_text: str) -> Dict[str, str]:
     """
+    Reduce ``fields_raw`` to schema keys and normalise every value.
+    "Gram Stain" goes through ``_normalise_gram``, keys in ``PNV_FIELDS``
+    through ``_normalise_pnv_value``, and all other schema keys are stripped
+    text with "Unknown" substituted for blanks. The ornithine spellings are
+    then synced and the non-fermenter sugar rule is applied using
+    ``original_text``.
     """
+    def _normalise_one(name: str, value: Any) -> str:
+        if name == "Gram Stain":
+            return _normalise_gram(value)
+        if name in PNV_FIELDS:
+            return _normalise_pnv_value(value)
+        return _norm_str(value) or "Unknown"
+    # Walking ALL_FIELDS (not fields_raw) drops unknown keys and fixes the order.
+    normalised: Dict[str, str] = {
+        name: _normalise_one(name, fields_raw[name])
+        for name in ALL_FIELDS
+        if name in fields_raw
+    }
+    normalised = _merge_ornithine_variants(normalised)
+    return _apply_global_sugar_logic(normalised, original_text)
+def _merge_guard_fill_only_missing(
+    llm_fields: Dict[str, str],
+    existing_fields: Optional[Dict[str, Any]],
+) -> Dict[str, str]:
+    """
+    Merge LLM values into ``existing_fields`` without overwriting known ones.
+      - ``existing_fields`` empty or not a dict -> ``llm_fields`` is returned as is.
+      - An existing value that is non-blank and not Unknown is kept.
+      - A missing/blank/Unknown existing value is replaced by the LLM value,
+        provided that value is itself non-blank and not "Unknown".
+    For keys in ``PNV_FIELDS`` the existing value also counts as Unknown when
+    ``_normalise_pnv_value`` maps it to "Unknown". The result contains schema
+    keys only, with blank values replaced by "Unknown".
+    """
+    if not existing_fields or not isinstance(existing_fields, dict):
+        return llm_fields
+    merged = dict(existing_fields)  # copy so the caller's dict is not mutated
+    for field, candidate in llm_fields.items():
+        if field not in ALL_FIELDS:
+            continue
+        # Skip LLM values that carry no information.
+        if not _norm_str(candidate) or candidate == "Unknown":
+            continue
+        current = _norm_str(merged.get(field, ""))
+        if current and current != "Unknown":
+            current_norm = _normalise_pnv_value(current) if field in PNV_FIELDS else current
+            if current_norm != "Unknown":
+                continue  # existing value is meaningful; leave it alone
+        merged[field] = candidate
+    # Restrict to schema keys and coerce every value to a non-blank string.
+    return {
+        field: _norm_str(value) or "Unknown"
+        for field, value in merged.items()
+        if field in ALL_FIELDS
+    }
+# ------------------------------------------------------------
+# PUBLIC API
+# ------------------------------------------------------------
+def parse_llm(text: str, existing_fields: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
     """
+    Parse phenotype text using local seq2seq model.
     Parameters
     ----------
     text : str
+        phenotype description
+    existing_fields : dict | None
+        Optional pre-parsed fields (e.g., from rules/ext).
+        If provided, LLM will ONLY fill missing/Unknown fields.
     Returns
     -------
+    dict:
+      {
+        "parsed_fields": { ... },
+        "source": "llm_parser",
+        "raw": <original text>,
+        "decoded": <model output> (only when DEBUG on)
+      }
+    For blank input the model is not run; "parsed_fields" is then a copy of
+    ``existing_fields`` (or an empty dict).
+    Raises
+    ------
+    RuntimeError
+        If the tokenizer or model is still unset after ``_load_model()``.
     """
     original = text or ""
+    if not original.strip():
+        # Return a copy so callers never share a dict with the result.
+        return {
+            "parsed_fields": dict(existing_fields) if isinstance(existing_fields, dict) else {},
+            "source": "llm_parser",
+            "raw": original,
+        }
+    _load_model()
+    # Explicit check rather than ``assert`` so it also runs under ``python -O``.
+    if _tokenizer is None or _model is None:
+        raise RuntimeError("LLM parser model failed to load")
+    prompt = _build_prompt(original)
+    inputs = _tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
+    with torch.no_grad():
+        # Greedy decoding: ``temperature`` only applies when sampling, so it
+        # is not passed alongside ``do_sample=False``.
+        output = _model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            do_sample=False,
+        )
+    decoded = _tokenizer.decode(output[0], skip_special_tokens=True)
+    if DEBUG_LLM:
+        print("=== LLM PROMPT (truncated) ===")
+        print(prompt[:1500] + ("..." if len(prompt) > 1500 else ""))
+        print("=== LLM RAW OUTPUT ===")
+        print(decoded)
+        print("======================")
+    # 1) Try JSON extraction (legacy)
+    parsed_obj = _extract_first_json_object(decoded)
+    fields_raw: Dict[str, Any] = {}
+    if isinstance(parsed_obj, dict) and parsed_obj:
+        if "parsed_fields" in parsed_obj and isinstance(parsed_obj.get("parsed_fields"), dict):
+            fields_raw = dict(parsed_obj["parsed_fields"])
+        else:
+            # in case model returned a flat JSON dict
+            fields_raw = dict(parsed_obj)
+    # 2) Fallback to KV parsing (your fine-tune style)
+    if not fields_raw:
+        fields_raw = _extract_kv_pairs(decoded)
+    # 3) Alias map + normalise
+    fields_raw = _apply_field_aliases(fields_raw)
+    cleaned = _clean_and_normalise(fields_raw, original)
+    # 4) Merge guard (optional) - fill only missing/Unknown
+    if existing_fields is not None:
+        cleaned = _merge_guard_fill_only_missing(cleaned, existing_fields)
+    out = {
+        "parsed_fields": cleaned,
+        "source": "llm_parser",
+        "raw": original,
     }
+    if DEBUG_LLM:
+        out["decoded"] = decoded
+    return out