EphAsad committed on
Commit
584a0a9
·
verified ·
1 Parent(s): e6d653e

Update engine/parser_llm.py

Browse files
Files changed (1) hide show
  1. engine/parser_llm.py +556 -435
engine/parser_llm.py CHANGED
@@ -1,435 +1,556 @@
1
- # engine/parser_llm.py
2
- # ------------------------------------------------------------
3
- # Local LLM parser for BactAI-D (Flan-T5, CPU-friendly)
4
- # Third parser head: repair & recovery
5
- #
6
- # Drop-in patched version:
7
- # - Few-shot examples increased (configurable via env)
8
- # - Field alias mapping (prevents silent field drops)
9
- # - Non-greedy JSON extraction (prevents regex over-capture)
10
- # - Improved P/N/V normalization (Flan phrasing coverage)
11
- # - Prompt refined for "extract/clarify" (reduces Unknown collapse)
12
- # - Debug prints (toggle via env var)
13
- # - Sugar logic scaffold preserved
14
- # ------------------------------------------------------------
15
-
16
- from __future__ import annotations
17
-
18
- import json
19
- import os
20
- import random
21
- import re
22
- from typing import Dict, Any, List, Optional
23
-
24
- import torch
25
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
26
-
27
-
28
- # ------------------------------------------------------------
29
- # Model configuration
30
- # ------------------------------------------------------------
31
-
32
- DEFAULT_MODEL = os.getenv(
33
- "BACTAI_LLM_PARSER_MODEL",
34
- "google/flan-t5-base",
35
- )
36
-
37
- MAX_FEWSHOT_EXAMPLES = int(os.getenv("BACTAI_LLM_FEWSHOT", "25"))
38
- MAX_NEW_TOKENS = int(os.getenv("BACTAI_LLM_MAX_NEW_TOKENS", "128"))
39
-
40
- DEBUG_LLM = os.getenv("BACTAI_LLM_DEBUG", "1").strip().lower() in {
41
- "1", "true", "yes", "y", "on"
42
- }
43
-
44
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
45
-
46
- _tokenizer: Optional[AutoTokenizer] = None
47
- _model: Optional[AutoModelForSeq2SeqLM] = None
48
- _GOLD_EXAMPLES: Optional[List[Dict[str, Any]]] = None
49
-
50
-
51
- # ------------------------------------------------------------
52
- # Allowed fields
53
- # ------------------------------------------------------------
54
-
55
- ALL_FIELDS: List[str] = [
56
- "Gram Stain",
57
- "Shape",
58
- "Motility",
59
- "Capsule",
60
- "Spore Formation",
61
- "Haemolysis",
62
- "Haemolysis Type",
63
- "Media Grown On",
64
- "Colony Morphology",
65
- "Oxygen Requirement",
66
- "Growth Temperature",
67
- "Catalase",
68
- "Oxidase",
69
- "Indole",
70
- "Urease",
71
- "Citrate",
72
- "Methyl Red",
73
- "VP",
74
- "H2S",
75
- "DNase",
76
- "ONPG",
77
- "Coagulase",
78
- "Gelatin Hydrolysis",
79
- "Esculin Hydrolysis",
80
- "Nitrate Reduction",
81
- "NaCl Tolerant (>=6%)",
82
- "Lipase Test",
83
- "Lysine Decarboxylase",
84
- "Ornithine Decarboxylase",
85
- "Ornitihine Decarboxylase",
86
- "Arginine dihydrolase",
87
- "Glucose Fermentation",
88
- "Lactose Fermentation",
89
- "Sucrose Fermentation",
90
- "Maltose Fermentation",
91
- "Mannitol Fermentation",
92
- "Sorbitol Fermentation",
93
- "Xylose Fermentation",
94
- "Rhamnose Fermentation",
95
- "Arabinose Fermentation",
96
- "Raffinose Fermentation",
97
- "Trehalose Fermentation",
98
- "Inositol Fermentation",
99
- "Gas Production",
100
- "TSI Pattern",
101
- "Colony Pattern",
102
- "Pigment",
103
- "Motility Type",
104
- "Odor",
105
- ]
106
-
107
- SUGAR_FIELDS = [
108
- "Glucose Fermentation",
109
- "Lactose Fermentation",
110
- "Sucrose Fermentation",
111
- "Maltose Fermentation",
112
- "Mannitol Fermentation",
113
- "Sorbitol Fermentation",
114
- "Xylose Fermentation",
115
- "Rhamnose Fermentation",
116
- "Arabinose Fermentation",
117
- "Raffinose Fermentation",
118
- "Trehalose Fermentation",
119
- "Inositol Fermentation",
120
- ]
121
-
122
- PNV_FIELDS = {
123
- f for f in ALL_FIELDS
124
- if f not in {
125
- "Media Grown On",
126
- "Colony Morphology",
127
- "Growth Temperature",
128
- "Gram Stain",
129
- "Shape",
130
- "Oxygen Requirement",
131
- "Haemolysis Type",
132
- }
133
- }
134
-
135
-
136
- # ------------------------------------------------------------
137
- # Field alias mapping (CRITICAL)
138
- # ------------------------------------------------------------
139
-
140
- FIELD_ALIASES: Dict[str, str] = {
141
- "Gram": "Gram Stain",
142
- "Gram stain": "Gram Stain",
143
- "Gram Stain Result": "Gram Stain",
144
-
145
- "NaCl tolerance": "NaCl Tolerant (>=6%)",
146
- "NaCl Tolerant": "NaCl Tolerant (>=6%)",
147
- "Salt tolerance": "NaCl Tolerant (>=6%)",
148
- "Salt tolerant": "NaCl Tolerant (>=6%)",
149
- "6.5% NaCl": "NaCl Tolerant (>=6%)",
150
- "6% NaCl": "NaCl Tolerant (>=6%)",
151
-
152
- "Growth temp": "Growth Temperature",
153
- "Growth temperature": "Growth Temperature",
154
- "Temperature growth": "Growth Temperature",
155
-
156
- "Catalase test": "Catalase",
157
- "Oxidase test": "Oxidase",
158
- "Indole test": "Indole",
159
- "Urease test": "Urease",
160
- "Citrate test": "Citrate",
161
-
162
- "Glucose fermentation": "Glucose Fermentation",
163
- "Lactose fermentation": "Lactose Fermentation",
164
- "Sucrose fermentation": "Sucrose Fermentation",
165
- "Maltose fermentation": "Maltose Fermentation",
166
- "Mannitol fermentation": "Mannitol Fermentation",
167
- "Sorbitol fermentation": "Sorbitol Fermentation",
168
- "Xylose fermentation": "Xylose Fermentation",
169
- "Rhamnose fermentation": "Rhamnose Fermentation",
170
- "Arabinose fermentation": "Arabinose Fermentation",
171
- "Raffinose fermentation": "Raffinose Fermentation",
172
- "Trehalose fermentation": "Trehalose Fermentation",
173
- "Inositol fermentation": "Inositol Fermentation",
174
- }
175
-
176
-
177
- # ------------------------------------------------------------
178
- # Normalisation helpers
179
- # ------------------------------------------------------------
180
-
181
- def _norm_str(s: Any) -> str:
182
- return str(s).strip() if s is not None else ""
183
-
184
-
185
- def _normalise_pnv_value(raw: Any) -> str:
186
- s = _norm_str(raw).lower()
187
- if not s:
188
- return "Unknown"
189
-
190
- if any(x in s for x in {"positive", "pos", "+", "yes", "present", "detected", "reactive"}):
191
- return "Positive"
192
-
193
- if any(x in s for x in {"negative", "neg", "-", "no", "none", "absent", "not detected", "no growth"}):
194
- return "Negative"
195
-
196
- if any(x in s for x in {"variable", "mixed", "inconsistent"}):
197
- return "Variable"
198
-
199
- return "Unknown"
200
-
201
-
202
- def _normalise_gram(raw: Any) -> str:
203
- s = _norm_str(raw).lower()
204
- if "positive" in s:
205
- return "Positive"
206
- if "negative" in s:
207
- return "Negative"
208
- if "variable" in s:
209
- return "Variable"
210
- return "Unknown"
211
-
212
-
213
- def _merge_ornithine_variants(fields: Dict[str, str]) -> Dict[str, str]:
214
- v = fields.get("Ornithine Decarboxylase") or fields.get("Ornitihine Decarboxylase")
215
- if v and v != "Unknown":
216
- fields["Ornithine Decarboxylase"] = v
217
- fields["Ornitihine Decarboxylase"] = v
218
- return fields
219
-
220
-
221
- # ------------------------------------------------------------
222
- # Sugar logic
223
- # ------------------------------------------------------------
224
-
225
- _NON_FERMENTER_PATTERNS = re.compile(
226
- r"\b("
227
- r"non[-\s]?fermenter|"
228
- r"non[-\s]?fermentative|"
229
- r"asaccharolytic|"
230
- r"does not ferment (sugars|carbohydrates)|"
231
- r"no carbohydrate fermentation"
232
- r")\b",
233
- re.IGNORECASE,
234
- )
235
-
236
-
237
- def _apply_global_sugar_logic(fields: Dict[str, str], original_text: str) -> Dict[str, str]:
238
- if not _NON_FERMENTER_PATTERNS.search(original_text):
239
- return fields
240
-
241
- for sugar in SUGAR_FIELDS:
242
- if fields.get(sugar) in {"Positive", "Variable"}:
243
- continue
244
- fields[sugar] = "Negative"
245
-
246
- return fields
247
-
248
-
249
- # ------------------------------------------------------------
250
- # Gold examples
251
- # ------------------------------------------------------------
252
-
253
- def _get_project_root() -> str:
254
- return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
255
-
256
-
257
- def _load_gold_examples() -> List[Dict[str, Any]]:
258
- global _GOLD_EXAMPLES
259
- if _GOLD_EXAMPLES is not None:
260
- return _GOLD_EXAMPLES
261
-
262
- path = os.path.join(_get_project_root(), "data", "llm_gold_examples.json")
263
- try:
264
- with open(path, "r", encoding="utf-8") as f:
265
- data = json.load(f)
266
- _GOLD_EXAMPLES = data if isinstance(data, list) else []
267
- except Exception:
268
- _GOLD_EXAMPLES = []
269
-
270
- return _GOLD_EXAMPLES
271
-
272
-
273
- # ------------------------------------------------------------
274
- # Prompt
275
- # ------------------------------------------------------------
276
-
277
- PROMPT_HEADER = """
278
- You are a microbiology expert assisting an automated phenotype parser.
279
-
280
- Your task is to EXTRACT OR CLARIFY phenotypic and biochemical test results
281
- from the input text.
282
-
283
- Rules:
284
- - Return ONLY valid JSON
285
- - Do NOT invent results
286
- - If a result is unclear or not stated, use "Unknown"
287
- - Prefer explicit statements over assumptions
288
-
289
- Output format:
290
- {
291
- "parsed_fields": {
292
- "Field Name": "Value",
293
- ...
294
- }
295
- }
296
- """
297
-
298
- PROMPT_FOOTER = """
299
- Now process the following phenotype description.
300
-
301
- Input:
302
- \"\"\"<<PHENOTYPE>>\"\"\"
303
-
304
- Return ONLY the JSON object.
305
- """
306
-
307
-
308
- def _build_prompt(text: str) -> str:
309
- examples = _load_gold_examples()
310
- n = min(MAX_FEWSHOT_EXAMPLES, len(examples))
311
- sampled = random.sample(examples, n) if n > 0 else []
312
-
313
- blocks: List[str] = [PROMPT_HEADER]
314
-
315
- for ex in sampled:
316
- inp = _norm_str(ex.get("input", ""))
317
- exp = ex.get("expected", {})
318
- if not isinstance(exp, dict):
319
- exp = {}
320
-
321
- blocks.append(
322
- f'Input:\n"""{inp}"""\nOutput:\n'
323
- f'{json.dumps({"parsed_fields": exp}, ensure_ascii=False)}\n'
324
- )
325
-
326
- blocks.append(PROMPT_FOOTER.replace("<<PHENOTYPE>>", text))
327
- return "\n".join(blocks)
328
-
329
-
330
- # ------------------------------------------------------------
331
- # Model loader
332
- # ------------------------------------------------------------
333
-
334
- def _load_model() -> None:
335
- global _model, _tokenizer
336
- if _model is not None and _tokenizer is not None:
337
- return
338
-
339
- _tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
340
- _model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).to(DEVICE)
341
- _model.eval()
342
-
343
-
344
- # ------------------------------------------------------------
345
- # JSON extraction (non-greedy)
346
- # ------------------------------------------------------------
347
-
348
- _JSON_OBJECT_RE = re.compile(r"\{[\s\S]*?\}")
349
-
350
-
351
- def _extract_first_json_object(text: str) -> Dict[str, Any]:
352
- m = _JSON_OBJECT_RE.search(text)
353
- if not m:
354
- return {}
355
- try:
356
- return json.loads(m.group(0))
357
- except Exception:
358
- return {}
359
-
360
-
361
- def _apply_field_aliases(fields_raw: Dict[str, Any]) -> Dict[str, Any]:
362
- out: Dict[str, Any] = {}
363
- for k, v in fields_raw.items():
364
- key = _norm_str(k)
365
- if not key:
366
- continue
367
- mapped = FIELD_ALIASES.get(key, key)
368
- out[mapped] = v
369
- return out
370
-
371
-
372
- # ------------------------------------------------------------
373
- # PUBLIC API
374
- # ------------------------------------------------------------
375
-
376
- def parse_llm(text: str) -> Dict[str, Any]:
377
- original = text or ""
378
- if not original.strip():
379
- return {
380
- "parsed_fields": {},
381
- "source": "llm_parser",
382
- "raw": original,
383
- }
384
-
385
- _load_model()
386
- assert _tokenizer is not None and _model is not None
387
-
388
- prompt = _build_prompt(original)
389
- inputs = _tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
390
-
391
- with torch.no_grad():
392
- output = _model.generate(
393
- **inputs,
394
- max_new_tokens=MAX_NEW_TOKENS,
395
- do_sample=False,
396
- temperature=0.0,
397
- )
398
-
399
- decoded = _tokenizer.decode(output[0], skip_special_tokens=True)
400
-
401
- if DEBUG_LLM:
402
- print("=== LLM RAW OUTPUT ===")
403
- print(decoded)
404
- print("======================")
405
-
406
- parsed_obj = _extract_first_json_object(decoded)
407
- fields_raw = parsed_obj.get("parsed_fields", {}) if isinstance(parsed_obj, dict) else {}
408
- if not isinstance(fields_raw, dict):
409
- fields_raw = {}
410
-
411
- fields_raw = _apply_field_aliases(fields_raw)
412
-
413
- cleaned: Dict[str, str] = {}
414
-
415
- for field in ALL_FIELDS:
416
- if field not in fields_raw:
417
- continue
418
-
419
- raw_val = fields_raw[field]
420
-
421
- if field == "Gram Stain":
422
- cleaned[field] = _normalise_gram(raw_val)
423
- elif field in PNV_FIELDS:
424
- cleaned[field] = _normalise_pnv_value(raw_val)
425
- else:
426
- cleaned[field] = _norm_str(raw_val) or "Unknown"
427
-
428
- cleaned = _merge_ornithine_variants(cleaned)
429
- cleaned = _apply_global_sugar_logic(cleaned, original)
430
-
431
- return {
432
- "parsed_fields": cleaned,
433
- "source": "llm_parser",
434
- "raw": original,
435
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_llm.py
2
+ # ------------------------------------------------------------
3
+ # Local LLM parser for BactAI-D (T5 fine-tune, CPU-friendly)
4
+ #
5
+ # UPDATED (EphBactAID integration):
6
+ # - Default model now points to your HF fine-tune: EphAsad/EphBactAID
7
+ # - Few-shot disabled by default (your fine-tune no longer needs it)
8
+ # - Robust output parsing:
9
+ # * Supports JSON output (legacy)
10
+ # * Supports "Key: Value" pairs output (your fine-tune style)
11
+ # - Merge guard (optional): LLM fills ONLY missing/Unknown fields
12
+ # - Validation/normalisation kept (PNV/Gram, sugar logic, aliases, ornithine sync)
13
+ # ------------------------------------------------------------
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import os
19
+ import random
20
+ import re
21
+ from typing import Dict, Any, List, Optional
22
+
23
+ import torch
24
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
25
+
26
+
27
+ # ------------------------------------------------------------
28
+ # Model configuration
29
+ # ------------------------------------------------------------
30
+
31
+ # ✅ Your fine-tuned model (can be overridden via env var)
32
+ DEFAULT_MODEL = os.getenv(
33
+ "BACTAI_LLM_PARSER_MODEL",
34
+ "EphAsad/EphBactAID",
35
+ )
36
+
37
+ # Few-shot OFF by default now (fine-tune doesn't need it)
38
+ MAX_FEWSHOT_EXAMPLES = int(os.getenv("BACTAI_LLM_FEWSHOT", "0"))
39
+
40
+ MAX_NEW_TOKENS = int(os.getenv("BACTAI_LLM_MAX_NEW_TOKENS", "256"))
41
+
42
+ DEBUG_LLM = os.getenv("BACTAI_LLM_DEBUG", "0").strip().lower() in {
43
+ "1", "true", "yes", "y", "on"
44
+ }
45
+
46
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
47
+
48
+ _tokenizer: Optional[AutoTokenizer] = None
49
+ _model: Optional[AutoModelForSeq2SeqLM] = None
50
+ _GOLD_EXAMPLES: Optional[List[Dict[str, Any]]] = None
51
+
52
+
53
+ # ------------------------------------------------------------
54
+ # Allowed fields
55
+ # ------------------------------------------------------------
56
+
57
# Sugar panel, used to build both SUGAR_FIELDS and the fermentation aliases.
_SUGARS: List[str] = [
    "Glucose", "Lactose", "Sucrose", "Maltose", "Mannitol", "Sorbitol",
    "Xylose", "Rhamnose", "Arabinose", "Raffinose", "Trehalose", "Inositol",
]

SUGAR_FIELDS = [f"{sugar} Fermentation" for sugar in _SUGARS]

ALL_FIELDS: List[str] = [
    "Gram Stain",
    "Shape",
    "Motility",
    "Capsule",
    "Spore Formation",
    "Haemolysis",
    "Haemolysis Type",
    "Media Grown On",
    "Colony Morphology",
    "Oxygen Requirement",
    "Growth Temperature",
    "Catalase",
    "Oxidase",
    "Indole",
    "Urease",
    "Citrate",
    "Methyl Red",
    "VP",
    "H2S",
    "DNase",
    "ONPG",
    "Coagulase",
    "Gelatin Hydrolysis",
    "Esculin Hydrolysis",
    "Nitrate Reduction",
    "NaCl Tolerant (>=6%)",
    "Lipase Test",
    "Lysine Decarboxylase",
    "Ornithine Decarboxylase",
    # Misspelled twin kept deliberately: legacy data uses this exact key.
    "Ornitihine Decarboxylase",
    "Arginine dihydrolase",
    *SUGAR_FIELDS,
    "Gas Production",
    "TSI Pattern",
    "Colony Pattern",
    "Pigment",
    "Motility Type",
    "Odor",
]

# Free-text fields that must NOT be collapsed to Positive/Negative/Variable.
_NON_PNV_FIELDS = {
    "Media Grown On",
    "Colony Morphology",
    "Growth Temperature",
    "Gram Stain",
    "Shape",
    "Oxygen Requirement",
    "Haemolysis Type",
    "TSI Pattern",
    "Colony Pattern",
    "Motility Type",
    "Odor",
    "Pigment",
    "Gas Production",
}

PNV_FIELDS = {field for field in ALL_FIELDS if field not in _NON_PNV_FIELDS}


# ------------------------------------------------------------
# Field alias mapping (CRITICAL)
# ------------------------------------------------------------

FIELD_ALIASES: Dict[str, str] = {
    "Gram": "Gram Stain",
    "Gram stain": "Gram Stain",
    "Gram Stain Result": "Gram Stain",

    "NaCl tolerance": "NaCl Tolerant (>=6%)",
    "NaCl Tolerant": "NaCl Tolerant (>=6%)",
    "Salt tolerance": "NaCl Tolerant (>=6%)",
    "Salt tolerant": "NaCl Tolerant (>=6%)",
    "6.5% NaCl": "NaCl Tolerant (>=6%)",
    "6% NaCl": "NaCl Tolerant (>=6%)",

    "Growth temp": "Growth Temperature",
    "Growth temperature": "Growth Temperature",
    "Temperature growth": "Growth Temperature",

    "Catalase test": "Catalase",
    "Oxidase test": "Oxidase",
    "Indole test": "Indole",
    "Urease test": "Urease",
    "Citrate test": "Citrate",

    # common variants from outputs
    "Voges–Proskauer Test": "VP",
    "Voges-Proskauer Test": "VP",
    "Voges–Proskauer": "VP",
    "Voges-Proskauer": "VP",
}

# Lower-cased "<sugar> fermentation" -> canonical "<Sugar> Fermentation".
FIELD_ALIASES.update(
    {f"{sugar} fermentation": f"{sugar} Fermentation" for sugar in _SUGARS}
)
189
+
190
+
191
+ # ------------------------------------------------------------
192
+ # Normalisation helpers
193
+ # ------------------------------------------------------------
194
+
195
+ def _norm_str(s: Any) -> str:
196
+ return str(s).strip() if s is not None else ""
197
+
198
+
199
+ def _normalise_pnv_value(raw: Any) -> str:
200
+ s = _norm_str(raw).lower()
201
+ if not s:
202
+ return "Unknown"
203
+
204
+ # positive
205
+ if any(x in s for x in {"positive", "pos", "+", "yes", "present", "detected", "reactive"}):
206
+ return "Positive"
207
+
208
+ # negative
209
+ if any(x in s for x in {"negative", "neg", "-", "no", "none", "absent", "not detected", "no growth"}):
210
+ return "Negative"
211
+
212
+ # variable
213
+ if any(x in s for x in {"variable", "mixed", "inconsistent"}):
214
+ return "Variable"
215
+
216
+ return "Unknown"
217
+
218
+
219
def _normalise_gram(raw: Any) -> str:
    """Map a free-text Gram result onto Positive/Negative/Variable/Unknown."""
    lowered = _norm_str(raw).lower()
    # Check order matters: Positive wins if several labels appear.
    for label in ("Positive", "Negative", "Variable"):
        if label.lower() in lowered:
            return label
    return "Unknown"
228
+
229
+
230
+ def _merge_ornithine_variants(fields: Dict[str, str]) -> Dict[str, str]:
231
+ v = fields.get("Ornithine Decarboxylase") or fields.get("Ornitihine Decarboxylase")
232
+ if v and v != "Unknown":
233
+ fields["Ornithine Decarboxylase"] = v
234
+ fields["Ornitihine Decarboxylase"] = v
235
+ return fields
236
+
237
+
238
+ # ------------------------------------------------------------
239
+ # Sugar logic
240
+ # ------------------------------------------------------------
241
+
242
# Phrases that declare the organism a global non-fermenter.
_NON_FERMENTER_PATTERNS = re.compile(
    r"\b("
    r"non[-\s]?fermenter|"
    r"non[-\s]?fermentative|"
    r"asaccharolytic|"
    r"does not ferment (sugars|carbohydrates)|"
    r"no carbohydrate fermentation"
    r")\b",
    re.IGNORECASE,
)


def _apply_global_sugar_logic(fields: Dict[str, str], original_text: str) -> Dict[str, str]:
    """If the text declares a non-fermenter, force undecided sugars to Negative."""
    if _NON_FERMENTER_PATTERNS.search(original_text):
        for sugar_field in SUGAR_FIELDS:
            # Explicit Positive/Variable results win over the global rule.
            if fields.get(sugar_field) not in {"Positive", "Variable"}:
                fields[sugar_field] = "Negative"
    return fields
264
+
265
+
266
+ # ------------------------------------------------------------
267
+ # Gold examples (kept for backwards compat; now optional)
268
+ # ------------------------------------------------------------
269
+
270
+ def _get_project_root() -> str:
271
+ return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
272
+
273
+
274
def _load_gold_examples() -> List[Dict[str, Any]]:
    """Load data/llm_gold_examples.json once; cache in _GOLD_EXAMPLES.

    Any read/parse failure, or a non-list payload, yields an empty list.
    """
    global _GOLD_EXAMPLES
    if _GOLD_EXAMPLES is None:
        examples_path = os.path.join(_get_project_root(), "data", "llm_gold_examples.json")
        payload = None
        try:
            with open(examples_path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except Exception:
            # Best-effort: missing/corrupt file just disables few-shot.
            payload = None
        _GOLD_EXAMPLES = payload if isinstance(payload, list) else []
    return _GOLD_EXAMPLES
288
+
289
+
290
+ # ------------------------------------------------------------
291
+ # Prompt (supports both JSON + KV outputs; fine-tune usually KV)
292
+ # ------------------------------------------------------------
293
+
294
# Verbatim instruction preamble placed before any few-shot examples.
# NOTE: this is a runtime string sent to the model — do not reword casually.
PROMPT_HEADER = """
You are a microbiology phenotype parser.

Task:
- Extract ONLY explicitly stated results from the input text.
- Do NOT invent results.
- If not stated, omit the field or use "Unknown".

Output format:
- Prefer "Field: Value" lines, one per line.
- You may also output JSON if instructed.

Use the exact schema keys where possible.
"""

# Prompt trailer; the <<PHENOTYPE>> placeholder is substituted with the
# caller's input text (see _build_prompt).
PROMPT_FOOTER = """
Input:
\"\"\"<<PHENOTYPE>>\"\"\"

Output:
"""
315
+
316
+
317
def _build_prompt(text: str) -> str:
    """Assemble header (+ optional few-shot examples) + footer with *text* inlined."""
    blocks: List[str] = [PROMPT_HEADER]

    # Few-shot is disabled by default (MAX_FEWSHOT_EXAMPLES == 0); the
    # capability is retained for experiments.
    if MAX_FEWSHOT_EXAMPLES > 0:
        pool = _load_gold_examples()
        count = min(MAX_FEWSHOT_EXAMPLES, len(pool))
        chosen = random.sample(pool, count) if count > 0 else []
        for example in chosen:
            example_input = _norm_str(example.get("input", ""))
            expected = example.get("expected", {})
            if not isinstance(expected, dict):
                expected = {}
            # Examples are rendered in "Key: Value" style to match the fine-tune.
            kv_lines = "\n".join(f"{k}: {v}" for k, v in expected.items())
            blocks.append(f'Example Input:\n"""{example_input}"""\nExample Output:\n{kv_lines}\n')

    blocks.append(PROMPT_FOOTER.replace("<<PHENOTYPE>>", text))
    return "\n".join(blocks)
336
+
337
+
338
+ # ------------------------------------------------------------
339
+ # Model loader
340
+ # ------------------------------------------------------------
341
+
342
def _load_model() -> None:
    """Lazily load tokenizer and model into module globals (idempotent)."""
    global _model, _tokenizer
    # Both already initialised -> nothing to do.
    if _model is not None and _tokenizer is not None:
        return

    _tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
    # DEVICE is resolved once at import time ("cuda" when available, else "cpu").
    _model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).to(DEVICE)
    # Inference mode (disables dropout etc.).
    _model.eval()
350
+
351
+
352
+ # ------------------------------------------------------------
353
+ # Output parsing helpers (JSON + KV)
354
+ # ------------------------------------------------------------
355
+
356
+ _JSON_OBJECT_RE = re.compile(r"\{[\s\S]*?\}")
357
+
358
+
359
+ def _extract_first_json_object(text: str) -> Dict[str, Any]:
360
+ m = _JSON_OBJECT_RE.search(text)
361
+ if not m:
362
+ return {}
363
+ try:
364
+ return json.loads(m.group(0))
365
+ except Exception:
366
+ return {}
367
+
368
+
369
+ # Match "Key: Value" (including keys with symbols like >=6%)
370
# One "Key: Value" line; keys may contain symbols such as ">=6%" but no colon.
_KV_LINE_RE = re.compile(r"^\s*([^:\n]{2,120})\s*:\s*(.*?)\s*$")


def _extract_kv_pairs(text: str) -> Dict[str, Any]:
    """
    Parse line-oriented model output such as:
        Gram Stain: Positive
        Shape: Cocci
    Non-matching and blank lines are skipped; later duplicates win.
    """
    pairs: Dict[str, Any] = {}
    for raw_line in (text or "").splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        hit = _KV_LINE_RE.match(candidate)
        if hit is None:
            continue
        key = _norm_str(hit.group(1))
        value = _norm_str(hit.group(2))
        if key:
            pairs[key] = value
    return pairs
394
+
395
+
396
def _apply_field_aliases(fields_raw: Dict[str, Any]) -> Dict[str, Any]:
    """Rename known alias keys to canonical schema keys; drop empty keys."""
    remapped: Dict[str, Any] = {}
    for raw_key, value in fields_raw.items():
        key = _norm_str(raw_key)
        if key:
            # Unknown keys pass through unchanged; schema filtering happens later.
            remapped[FIELD_ALIASES.get(key, key)] = value
    return remapped
405
+
406
+
407
def _clean_and_normalise(fields_raw: Dict[str, Any], original_text: str) -> Dict[str, str]:
    """
    Keep only schema fields (ALL_FIELDS) and normalise their values:
    Gram gets Gram-specific handling, PNV fields collapse to
    Positive/Negative/Variable/Unknown, everything else is kept as trimmed
    text (empty -> "Unknown").  Then sync ornithine variants and apply the
    global non-fermenter sugar rule.
    """
    cleaned: Dict[str, str] = {}

    for field in ALL_FIELDS:
        try:
            raw_val = fields_raw[field]
        except KeyError:
            continue

        if field == "Gram Stain":
            value = _normalise_gram(raw_val)
        elif field in PNV_FIELDS:
            value = _normalise_pnv_value(raw_val)
        else:
            value = _norm_str(raw_val) or "Unknown"
        cleaned[field] = value

    return _apply_global_sugar_logic(_merge_ornithine_variants(cleaned), original_text)
430
+
431
+
432
def _merge_guard_fill_only_missing(
    llm_fields: Dict[str, str],
    existing_fields: Optional[Dict[str, Any]],
) -> Dict[str, str]:
    """
    Merge guard: LLM values may only FILL gaps, never overwrite.

    - An existing non-empty, non-Unknown value is kept as-is.
    - A missing/empty/Unknown slot takes the LLM value when that value is
      itself meaningful (non-empty and not "Unknown").
    - The result is restricted to schema keys and coerced to strings.
    """
    if not existing_fields or not isinstance(existing_fields, dict):
        return llm_fields

    merged = dict(existing_fields)
    for field, llm_value in llm_fields.items():
        if field not in ALL_FIELDS:
            continue

        current = _norm_str(merged.get(field, ""))
        # For PNV fields judge "known-ness" on the normalised form too.
        current_norm = _normalise_pnv_value(current) if field in PNV_FIELDS else current

        slot_open = (not current) or current == "Unknown" or current_norm == "Unknown"
        if not slot_open:
            continue

        if _norm_str(llm_value) and llm_value != "Unknown":
            merged[field] = llm_value

    # Final pass: schema keys only, values as non-empty strings.
    return {
        field: (_norm_str(value) or "Unknown")
        for field, value in merged.items()
        if field in ALL_FIELDS
    }
466
+
467
+
468
+ # ------------------------------------------------------------
469
+ # PUBLIC API
470
+ # ------------------------------------------------------------
471
+
472
def parse_llm(text: str, existing_fields: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Parse phenotype text using local seq2seq model.

    Parameters
    ----------
    text : str
        phenotype description

    existing_fields : dict | None
        Optional pre-parsed fields (e.g., from rules/ext).
        If provided, LLM will ONLY fill missing/Unknown fields.

    Returns
    -------
    dict:
        {
          "parsed_fields": { ... },
          "source": "llm_parser",
          "raw": <original text>,
          "decoded": <model output> (only when DEBUG on)
        }
    """
    original = text or ""
    # Empty input: skip the model entirely; echo back existing fields if any.
    if not original.strip():
        return {
            "parsed_fields": (existing_fields or {}) if isinstance(existing_fields, dict) else {},
            "source": "llm_parser",
            "raw": original,
        }

    _load_model()
    assert _tokenizer is not None and _model is not None

    prompt = _build_prompt(original)
    inputs = _tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)

    # Greedy decoding for deterministic output.
    # NOTE(review): temperature=0.0 is ignored when do_sample=False, and some
    # transformers versions warn about it — consider dropping the argument.
    with torch.no_grad():
        output = _model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            temperature=0.0,
        )

    decoded = _tokenizer.decode(output[0], skip_special_tokens=True)

    if DEBUG_LLM:
        print("=== LLM PROMPT (truncated) ===")
        print(prompt[:1500] + ("..." if len(prompt) > 1500 else ""))
        print("=== LLM RAW OUTPUT ===")
        print(decoded)
        print("======================")

    # 1) Try JSON extraction (legacy)
    parsed_obj = _extract_first_json_object(decoded)
    fields_raw: Dict[str, Any] = {}

    if isinstance(parsed_obj, dict) and parsed_obj:
        if "parsed_fields" in parsed_obj and isinstance(parsed_obj.get("parsed_fields"), dict):
            fields_raw = dict(parsed_obj["parsed_fields"])
        else:
            # in case model returned a flat JSON dict
            fields_raw = dict(parsed_obj)

    # 2) Fallback to KV parsing (your fine-tune style)
    if not fields_raw:
        fields_raw = _extract_kv_pairs(decoded)

    # 3) Alias map + normalise
    fields_raw = _apply_field_aliases(fields_raw)
    cleaned = _clean_and_normalise(fields_raw, original)

    # 4) Merge guard (optional) - fill only missing/Unknown
    if existing_fields is not None:
        cleaned = _merge_guard_fill_only_missing(cleaned, existing_fields)

    out = {
        "parsed_fields": cleaned,
        "source": "llm_parser",
        "raw": original,
    }
    # Expose the raw decode only in debug mode, for inspection.
    if DEBUG_LLM:
        out["decoded"] = decoded
    return out