"""
EVAL-02 harness — Bureaucat bake-off evaluator.

Usage:
    python eval/run_eval.py [--model qwen3|qwen25] [--dump PATH]

English-only product (2026-06-07 descope): Bureaucat reads Swedish letters and explains
them in English. The gate runs two passes:
  - Gold-letter accuracy (run_eval): anti-hallucination (D-12/D-13/D-14) + SC1 four-sections
    completeness, each letter run standard + beginner (D-08 invariance).
  - Adversarial refusal (run_adversarial_eval): the three bad-input fixtures must route to
    the refusal path (correct doctype + no analysis rendered).
Both must pass for exit 0. (The earlier 5x5 multilingual matrix and the py3langid prose-
language assertion were retired with the English-only descope.)

A gold letter passes only when:
  - Pass A verdict passes (no-invention AND recall=100% AND all_sections_present AND severity parsed)
  - Pass B evaluate() verdict passes (same gate on beginner output)
  - beginner_invariant(pass_B_result) holds (structural invariance, D-08)

Severity MAE is reported but never fails the gate (D-15, advisory).

Gate exits non-zero when ANY letter fails (either pass), the adversarial pass fails, or the
gold set is empty.

CRITICAL lazy-import contract:
  - Module top imports only the stdlib (json, re, unicodedata, argparse, pathlib, sys) plus the
    local `grounded` matching helpers
  - `from app import ...` lives ONLY inside run_eval()/run_adversarial_eval() so importing
    this module (e.g., for unit tests) never loads the model.
"""

import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path

# Ensure project root is on sys.path so `from app import ...` resolves
# whether this script is run as:
#   python eval/run_eval.py          (cwd = project root)
#   python run_eval.py               (cwd = eval/)
#   python -m pytest eval/           (cwd = project root)
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

# Also ensure eval/ is on sys.path so `import grounded` resolves when
# run_eval.py is imported from the project root (e.g. by test_eval_matching.py).
_EVAL_DIR = str(Path(__file__).resolve().parent)
if _EVAL_DIR not in sys.path:
    sys.path.insert(0, _EVAL_DIR)

# ---------------------------------------------------------------------------
# Import matching primitives from shared module (D2-05 refactor).
# Names re-bound here so existing `import run_eval as e; e.normalize(...)` calls
# in eval/test_eval_matching.py continue to work without modification.
# Use `from grounded import ...` (eval/ is on sys.path when running run_eval.py).
# ---------------------------------------------------------------------------
from grounded import normalize, value_found, extract_values_from_section  # noqa: F401


# ---------------------------------------------------------------------------
# Per-letter verdict (D-12, D-13, D-14, D-15, SC1)
# ---------------------------------------------------------------------------

def evaluate(result, gold: dict) -> dict:
    """
    Build the per-letter verdict for one model output against its gold sidecar.

    *result* is any object exposing the StructuredResult attributes read here
    (deadlines, transcription, severity and the four section fields); *gold* is
    the parsed sidecar dict. Nothing from app is imported.

    Checks performed:
      - No-invention (D-12): each value pulled out of result.deadlines must be
        found in result.transcription.
      - Recall (D-13): each gold "verbatim_swedish" string listed under
        "deadlines", "amounts" or "references" must be found in result.deadlines.
      - Four-sections completeness (SC1): tldr, why, actions, deadlines are all truthy.
      - Severity (D-15): the absolute error against gold["expected_severity"] is
        reported only; a missing severity yields the 5.0 sentinel and fails the
        verdict because the output is considered incomplete.

    Returns a dict with keys: pass, invented_count, invented, missing_count,
    missing, recall_rate, severity_mae, schema_complete, all_sections_present.
    """
    gold_categories = ("deadlines", "amounts", "references")
    section_names = ("tldr", "why", "actions", "deadlines")

    # D-12: anything emitted in the deadlines section that the transcription
    # does not contain counts as invented.
    invented = [
        candidate
        for candidate in extract_values_from_section(result.deadlines)
        if not value_found(candidate, result.transcription)
    ]

    # D-13: compare on the verbatim_swedish string of each gold entry (Pitfall 9).
    missing = [
        entry["verbatim_swedish"]
        for category in gold_categories
        for entry in gold.get(category, [])
        if not value_found(entry["verbatim_swedish"], result.deadlines)
    ]

    # SC1: a section counts as present when its attribute exists and is truthy.
    all_sections_present = not any(
        not getattr(result, name, None) for name in section_names
    )

    # D-15: advisory error; 5.0 is the sentinel for output truncated before
    # the SEVERITY line.
    severity = result.severity
    has_severity = severity is not None
    sev_mae = abs(severity - gold["expected_severity"]) if has_severity else 5.0

    # Denominator for the recall rate (clamped to 1 below to avoid dividing by zero).
    total_gold = sum(len(gold.get(category, [])) for category in gold_categories)

    n_invented = len(invented)
    n_missing = len(missing)
    passed = (
        n_invented == 0
        and n_missing == 0
        and has_severity
        and all_sections_present
    )

    return {
        "pass": passed,
        "invented_count": n_invented,
        "invented": invented,
        "missing_count": n_missing,
        "missing": missing,
        "recall_rate": 1.0 - n_missing / max(total_gold, 1),
        "severity_mae": sev_mae,
        "schema_complete": has_severity,
        "all_sections_present": all_sections_present,
    }


# ---------------------------------------------------------------------------
# D-08 beginner-mode structural invariance checker
# ---------------------------------------------------------------------------

def beginner_invariant(result) -> tuple:
    """
    Check D-08 structural invariance of a beginner-mode result.

    The invariants are:
      - tldr, why, actions and deadlines are each non-empty
      - severity is not None (the SEVERITY line was present and parseable)
      - transcription is non-empty

    Returns (ok, reasons): ok is True when nothing was violated, and reasons
    holds one message per violated invariant, in the order listed above.
    """
    # One message per empty (or absent) section field.
    problems = [
        f"section '{name}' is empty in beginner-mode output (D-08 violation)"
        for name in ("tldr", "why", "actions", "deadlines")
        if not getattr(result, name, None)
    ]

    # severity is read directly: the attribute is required on the result object.
    if result.severity is None:
        problems.append(
            "severity is None in beginner-mode output — SEVERITY line dropped or truncated (D-08 violation)"
        )

    if not getattr(result, "transcription", None):
        problems.append(
            "transcription is empty in beginner-mode output — transcription block dropped (D-08 violation)"
        )

    return (not problems, problems)


# ---------------------------------------------------------------------------
# Refusal scorer (Phase 3 TRUST-02/03/04) — called by slice 3 for adversarial fixtures
# ---------------------------------------------------------------------------

def _render_value(pane) -> str:
    """Extract the text value from a render_result pane (gr.update dict or str)."""
    if isinstance(pane, dict):
        return str(pane.get("value", "") or "")
    return str(pane or "")


def evaluate_refusal(result, gold: dict, language: str = "English") -> dict:
    """
    Score one adversarial-fixture result against its sidecar.

    Adversarial fixtures carry no gold values to recall, so evaluate() is not
    used. The fixture passes when both of these hold:
      - result.doctype is one of the accepted refusal doctypes
        (gold["accepted_doctypes"] if present and non-empty, otherwise just
        gold["expected_doctype"]);
      - render_result(result, language) shows no analysis: element 0 (the Panic
        Meter HTML) equals "" and elements 4, 5, 6 (why / actions / deadlines)
        contain only whitespace once reduced to text.

    The raw result.tldr is reported as "tldr_empty" but is ADVISORY only: the
    model may analyse a readable non-Swedish letter before the render layer
    suppresses that analysis, so a non-empty tldr does not mean the user saw it.

    render_result is imported inside the function so that importing this module
    does not import app.

    Returns a dict with keys: pass, verdict ("refusal_correct" or
    "refusal_wrong"), doctype, expected_doctype, doctype_correct,
    no_analysis_rendered, tldr_empty.
    """
    expected = gold.get("expected_doctype", "")

    # Several doctypes may be equally valid refusals for one fixture (e.g. an
    # English letter labelled "non_swedish" or "not_letter"); an absent or
    # empty list falls back to the single expected doctype.
    accepted = gold.get("accepted_doctypes") or [expected]

    actual_doctype = getattr(result, "doctype", "letter")
    doctype_correct = actual_doctype in accepted

    from app import render_result  # lazy; pure fn, safe under BUREAUCAT_NO_MODEL=1
    rendered = render_result(result, language)

    # Index 0 is the panic HTML; 4/5/6 are the why / actions / deadlines panes.
    # The panes are only inspected when the Panic Meter is already blank.
    no_analysis_rendered = rendered[0] == "" and not any(
        _render_value(rendered[index]).strip() for index in (4, 5, 6)
    )

    passed = doctype_correct and no_analysis_rendered

    return {
        "pass": passed,
        "verdict": "refusal_correct" if passed else "refusal_wrong",
        "doctype": actual_doctype,
        "expected_doctype": expected,
        "doctype_correct": doctype_correct,
        "no_analysis_rendered": no_analysis_rendered,
        # advisory — model may legitimately analyse non-Swedish
        "tldr_empty": not getattr(result, "tldr", None),
    }


# ---------------------------------------------------------------------------
# Bake-off runner — app contracts imported LAZILY inside this function only
# ---------------------------------------------------------------------------

def run_eval(
    model_variant: str = "qwen3",
    dump_path: str = None,
) -> bool:
    """
    Run the gold-letter accuracy pass for one model variant.

    Iterates the sorted data/letters/public/*.json sidecars (path relative to
    the current working directory). A sidecar without a sibling .png/.jpg image
    is reported as [SKIP] and does not count towards the totals. For every
    remaining letter the model is run twice, always with "English" output:
      - Pass A (beginner_mode=False): evaluate() verdict
      - Pass B (beginner_mode=True): evaluate() verdict + beginner_invariant()
    A letter passes only when both verdicts pass AND the invariant holds.

    Args:
        model_variant: key into app.MODEL_VARIANTS selecting the model to load.
        dump_path: optional file path; when given, the raw result fields of both
            passes for every evaluated letter are written there as UTF-8 JSON.

    Returns:
        True when at least one letter was evaluated and all evaluated letters
        passed; False otherwise, including when no sidecars exist.

    The empty-set guard runs BEFORE app is imported, so an unpopulated gold set
    exits with a clear message and without loading any model.
    """
    # ------------------------------------------------------------------
    # Guard: check gold set exists BEFORE importing app (which triggers
    # load_model() at module scope unless BUREAUCAT_NO_MODEL=1 is set).
    # This ensures `python eval/run_eval.py` on an empty gold set exits
    # immediately with a clear message and no 16GB weight download.
    # ------------------------------------------------------------------
    letter_dir = Path("data/letters/public")
    sidecars = sorted(letter_dir.glob("*.json"))

    if not sidecars:
        print(
            f"\n[run_eval] ERROR: Gold set is empty — no .json sidecars in {letter_dir}.\n"
            f"The bake-off gate cannot pass with zero letters (D-09).\n"
            f"Add at least 5 annotated gold letters before running Plan 04 bake-off.\n"
        )
        return False

    # LAZY IMPORTS — only after the empty-set guard, so importing this module
    # (e.g. for unit tests) never loads the model. PIL is imported once here
    # rather than on every loop iteration.
    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    # Load model once for the entire eval run
    print(f"\n[run_eval] Loading model variant: {model_variant}")
    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    dump_fields = (
        "transcription", "quip", "tldr", "why", "actions",
        "deadlines", "severity", "raw", "doctype",
    )

    results = []
    dump = {}  # letter_stem -> {"standard": {...fields}, "beginner": {...fields}}
    for sidecar in sidecars:
        # Find matching image (.png preferred, fallback .jpg)
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f"  [SKIP] {sidecar.stem}: no matching image file")
            continue

        # Sidecars hold verbatim Swedish strings; decode explicitly as UTF-8
        # instead of relying on the platform's locale encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # The context manager releases the image file handle once both
        # inference passes are done, so handles do not pile up across letters.
        with Image.open(image_path) as image:
            # Pass A: standard inference (English-only product — 2026-06-07 descope)
            result_std = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )
            # Pass B: beginner-mode inference (D-08 invariance check)
            result_beg = run_inference(
                image, "English", beginner_mode=True,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )

        verdict_std = evaluate(result_std, gold)
        verdict_beg = evaluate(result_beg, gold)
        inv_ok, inv_reasons = beginner_invariant(result_beg)

        if dump_path:
            dump[sidecar.stem] = {
                "standard": {f: getattr(result_std, f) for f in dump_fields},
                "beginner": {f: getattr(result_beg, f) for f in dump_fields},
            }

        letter_pass = verdict_std["pass"] and verdict_beg["pass"] and inv_ok

        # Per-letter output
        status = "PASS" if letter_pass else "FAIL"
        print(
            f"\n  {status} {sidecar.stem}:\n"
            f"    STANDARD: recall={verdict_std['recall_rate']:.0%}  "
            f"invented={verdict_std['invented_count']}  "
            f"severity_mae={verdict_std['severity_mae']:.1f}  "
            f"all_sections={verdict_std['all_sections_present']}\n"
            f"    BEGINNER: recall={verdict_beg['recall_rate']:.0%}  "
            f"invented={verdict_beg['invented_count']}  "
            f"severity_mae={verdict_beg['severity_mae']:.1f}  "
            f"all_sections={verdict_beg['all_sections_present']}\n"
            f"    BEGINNER_INVARIANT: {'OK' if inv_ok else 'FAIL(' + '; '.join(inv_reasons) + ')'}"
        )
        if not verdict_std["pass"]:
            if verdict_std["invented"]:
                print(f"    [STD] Invented values: {verdict_std['invented']}")
            if verdict_std["missing"]:
                print(f"    [STD] Missing gold values: {verdict_std['missing']}")
        if not verdict_beg["pass"]:
            if verdict_beg["invented"]:
                print(f"    [BEG] Invented values: {verdict_beg['invented']}")
            if verdict_beg["missing"]:
                print(f"    [BEG] Missing gold values: {verdict_beg['missing']}")

        results.append({
            "letter": sidecar.stem,
            "pass": letter_pass,
        })

    # Overall summary. The gate needs at least one evaluated letter, so a run
    # where every sidecar was skipped still fails.
    n_pass = sum(1 for r in results if r["pass"])
    n_total = len(results)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== EVAL RESULTS ({model_variant}) ===")
    print(f"  Overall: {n_pass}/{n_total} letters passed both passes")
    print(f"  GATE: {'PASS' if gate else 'FAIL'}")

    if dump_path:
        Path(dump_path).write_text(
            json.dumps(dump, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f"  Dumped raw model outputs for {len(dump)} letters to {dump_path}")

    return gate


# ---------------------------------------------------------------------------
# Adversarial refusal scoring (Phase 3 TRUST-02/03/04)
# ---------------------------------------------------------------------------

def run_adversarial_eval(
    model_variant: str = "qwen3",
) -> tuple[bool, list[dict]]:
    """
    Score the adversarial fixtures against the refusal gate.

    Globs data/letters/public/adversarial/*.json (relative to the current
    working directory). This directory is separate from the gold-letter loop:
    run_eval's glob("*.json") is non-recursive, so these sidecars never enter it.

    When the directory holds no sidecars a warning is printed and the pass is
    treated as skipped: (True, []) is returned without importing app. Otherwise
    the model is loaded lazily and each fixture that has a sibling .png/.jpg is
    run once in English (standard mode) and scored with evaluate_refusal();
    fixtures without an image are reported as [SKIP-ADV] and not counted.

    Args:
        model_variant: key into app.MODEL_VARIANTS selecting the model to load.

    Returns:
        (gate_pass, verdicts). gate_pass is True when at least one fixture was
        scored and every scored fixture passed. Each verdict is the
        evaluate_refusal() dict with an added "letter" key (the sidecar stem).
    """
    adv_dir = Path("data/letters/public/adversarial")
    adv_sidecars = sorted(adv_dir.glob("*.json"))

    if not adv_sidecars:
        print(
            f"\n[run_adversarial_eval] WARNING: no adversarial fixtures in {adv_dir}.\n"
            f"Skipping adversarial refusal scoring.\n"
        )
        return True, []

    # Lazy imports: only reached once there is something to score.
    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    verdicts = []
    for sidecar in adv_sidecars:
        # .png preferred, fallback .jpg
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f"  [SKIP-ADV] {sidecar.stem}: no matching image file")
            continue

        # Decode the sidecar explicitly as UTF-8 rather than with the
        # platform's locale encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Close the image file handle as soon as inference is finished.
        with Image.open(image_path) as image:
            result = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )

        verdict = evaluate_refusal(result, gold)
        verdict["letter"] = sidecar.stem
        verdicts.append(verdict)

        status = "PASS" if verdict["pass"] else "FAIL"
        print(
            f"\n  {status} [ADV] {sidecar.stem}:\n"
            f"    verdict={verdict['verdict']}  "
            f"doctype={verdict['doctype']!r}  expected={verdict['expected_doctype']!r}  "
            f"tldr_empty={verdict['tldr_empty']}"
        )

    # Fixtures existed, so at least one must actually have been scored.
    n_pass = sum(1 for v in verdicts if v["pass"])
    n_total = len(verdicts)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== ADVERSARIAL RESULTS ({model_variant}) ===")
    print(f"  Overall: {n_pass}/{n_total} adversarial fixtures scored refusal_correct")
    print(f"  GATE: {'PASS' if gate else 'FAIL'}")

    return gate, verdicts


# ---------------------------------------------------------------------------
# CLI entry point — English-only gate (gold accuracy + adversarial refusal)
# ---------------------------------------------------------------------------

def _main():
    """Parse the CLI flags, run both gate passes, and exit with the gate status.

    Exit code is 0 only when the gold-letter pass and the adversarial refusal
    pass both succeed; otherwise 1. Both passes always run — the adversarial
    pass is not skipped when the gold pass fails.
    """
    cli = argparse.ArgumentParser(
        description="Bureaucat evaluator — EVAL-02 gate (English-only product)"
    )
    cli.add_argument(
        "--model",
        default="qwen3",
        choices=["qwen3", "qwen25"],
        help="Model variant to evaluate (default: qwen3)",
    )
    dump_help = (
        "Write raw per-letter StructuredResult outputs to PATH (JSON) for "
        "offline matching/gold iteration at zero GPU cost."
    )
    cli.add_argument("--dump", metavar="PATH", default=None, help=dump_help)
    options = cli.parse_args()

    # English-only product (2026-06-07 descope): the gold-letter accuracy gate
    # and the adversarial refusal gate must both pass.
    gold_ok = run_eval(options.model, dump_path=options.dump)
    adversarial_ok, _verdicts = run_adversarial_eval(options.model)

    exit_code = 0 if (gold_ok and adversarial_ok) else 1
    sys.exit(exit_code)


# Script entry point: run the gate only when this file is executed directly,
# so importing the module does not start an evaluation.
if __name__ == "__main__":
    _main()