"""
EVAL-02 harness — Bureaucat bake-off evaluator.

Usage:
    python eval/run_eval.py [--model qwen3|qwen25] [--dump PATH]

English-only product (2026-06-07 descope): Bureaucat reads Swedish letters and
explains them in English. The gate runs two passes:

  - Gold-letter accuracy (run_eval): anti-hallucination (D-12/D-13/D-14) + SC1
    four-sections completeness, each letter run standard + beginner (D-08
    invariance).
  - Adversarial refusal (run_adversarial_eval): the three bad-input fixtures
    must route to the refusal path (correct doctype + no analysis rendered).

Both must pass for exit 0. (The earlier 5x5 multilingual matrix and the
py3langid prose-language assertion were retired with the English-only descope.)

A gold letter passes only when:
  - Pass A verdict passes (no-invention AND recall=100% AND
    all_sections_present AND severity)
  - Pass B evaluate() verdict passes (same gate on beginner output)
  - beginner_invariant(pass_B_result) holds (structural invariance, D-08)

Severity MAE is reported but never fails the gate (D-15, advisory).

Gate exits non-zero when ANY letter fails (either pass), the adversarial pass
fails, or the gold set is empty. (An empty adversarial fixture directory is
only a warning: run_adversarial_eval returns True in that case.)

CRITICAL lazy-import contract:
  - No `app` import at module top. The top level imports the stdlib (json, re,
    unicodedata, argparse, pathlib, sys, typing) plus the local `grounded`
    matching helpers.
  - `from app import ...` lives ONLY inside function bodies (run_eval(),
    run_adversarial_eval(), evaluate_refusal()) so importing this module
    (e.g., for unit tests) never loads the model.
"""

import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
from typing import Optional

# Ensure project root is on sys.path so `from app import ...` resolves
# whether this script is run as:
#   python eval/run_eval.py     (cwd = project root)
#   python run_eval.py          (cwd = eval/)
#   python -m pytest eval/      (cwd = project root)
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

# Also ensure eval/ is on sys.path so `import grounded` resolves when
# run_eval.py is imported from the project root (e.g. by test_eval_matching.py).
_EVAL_DIR = str(Path(__file__).resolve().parent)
if _EVAL_DIR not in sys.path:
    sys.path.insert(0, _EVAL_DIR)

# ---------------------------------------------------------------------------
# Import matching primitives from shared module (D2-05 refactor).
# Names re-bound here so existing `import run_eval as e; e.normalize(...)` calls
# in eval/test_eval_matching.py continue to work without modification.
# Use `from grounded import ...` (eval/ is on sys.path when running run_eval.py).
# ---------------------------------------------------------------------------
from grounded import normalize, value_found, extract_values_from_section  # noqa: E402, F401

# The four analysis sections that must be non-empty (SC1 / D-08).
_SECTION_FIELDS = ("tldr", "why", "actions", "deadlines")

# Gold sidecar keys whose entries carry a "verbatim_swedish" value to recall.
_GOLD_CATEGORIES = ("deadlines", "amounts", "references")

# Severity MAE reported when the model output has no parseable SEVERITY line.
_MISSING_SEVERITY_MAE = 5.0


# ---------------------------------------------------------------------------
# Per-letter verdict (D-12, D-13, D-14, D-15, SC1)
# ---------------------------------------------------------------------------

def evaluate(result, gold: dict) -> dict:
    """
    Return a per-letter verdict dict for a StructuredResult-shaped object and gold dict.

    No-invention check (D-12): every value extracted from result.deadlines must be
    a normalized substring of result.transcription.

    Recall check (D-13): every verbatim_swedish value from gold["deadlines"],
    gold["amounts"], gold["references"] must appear in result.deadlines.

    Four-sections completeness (SC1): tldr, why, actions, deadlines must all be
    non-empty.

    Severity (D-15): MAE is computed and included; does NOT affect pass.

    PASS = no-invention AND recall=100% AND severity is not None AND
    all_sections_present.

    Works on any SimpleNamespace or StructuredResult — evaluate() does NOT import app.

    Args:
        result: object exposing deadlines, transcription, severity and the four
            section attributes.
        gold: parsed sidecar; must contain "expected_severity" when
            result.severity is not None, and each listed gold entry must have a
            "verbatim_swedish" key.

    Returns:
        dict with keys pass, invented_count, invented, missing_count, missing,
        recall_rate, severity_mae, schema_complete, all_sections_present.

    Raises:
        KeyError: if a required gold key (see above) is absent.
    """
    # 1. No-invention check (D-12)
    invented = [
        emitted
        for emitted in extract_values_from_section(result.deadlines)
        if not value_found(emitted, result.transcription)
    ]

    # 2. Recall check (D-13) — must extract verbatim_swedish string (Pitfall 9)
    gold_values = [
        entry["verbatim_swedish"]
        for category in _GOLD_CATEGORIES
        for entry in gold.get(category, [])
    ]
    missing = [v for v in gold_values if not value_found(v, result.deadlines)]

    # 3. Four-sections completeness (SC1)
    all_sections_present = all(
        bool(getattr(result, name, None)) for name in _SECTION_FIELDS
    )

    # 4. Severity MAE (D-15, advisory — never fails gate)
    severity = result.severity
    if severity is None:
        # Sentinel: output truncated before SEVERITY line.
        sev_mae = _MISSING_SEVERITY_MAE
    else:
        sev_mae = abs(severity - gold["expected_severity"])

    passed = (
        not invented
        and not missing
        and severity is not None
        and all_sections_present
    )

    return {
        "pass": passed,
        "invented_count": len(invented),
        "invented": invented,
        "missing_count": len(missing),
        "missing": missing,
        # 5. Recall rate; max(..., 1) keeps a gold set with no values at 100%.
        "recall_rate": 1.0 - len(missing) / max(len(gold_values), 1),
        "severity_mae": sev_mae,
        "schema_complete": severity is not None,
        "all_sections_present": all_sections_present,
    }


# ---------------------------------------------------------------------------
# D-08 beginner-mode structural invariance checker
# ---------------------------------------------------------------------------

def beginner_invariant(result) -> tuple[bool, list[str]]:
    """
    Assert D-08 structural invariance on a beginner-mode StructuredResult.

    Checks:
    - All four section fields (tldr, why, actions, deadlines) are non-empty
    - severity is not None (SEVERITY line still present and parseable)
    - transcription is non-empty (transcription block still present)

    Returns (ok: bool, reasons: list[str]).
    ok=True if all invariants hold; False if any fail, with reasons listing each failure.
    """
    reasons = [
        f"section '{field}' is empty in beginner-mode output (D-08 violation)"
        for field in _SECTION_FIELDS
        if not getattr(result, field, None)
    ]

    # Check severity parseable
    if result.severity is None:
        reasons.append(
            "severity is None in beginner-mode output — SEVERITY line dropped or truncated (D-08 violation)"
        )

    # Check transcription block present
    if not getattr(result, "transcription", None):
        reasons.append(
            "transcription is empty in beginner-mode output — transcription block dropped (D-08 violation)"
        )

    return (not reasons, reasons)


# ---------------------------------------------------------------------------
# Refusal scorer (Phase 3 TRUST-02/03/04) — called by slice 3 for adversarial fixtures
# ---------------------------------------------------------------------------

def _render_value(pane) -> str:
    """Extract the text value from a render_result pane (gr.update dict or str)."""
    if isinstance(pane, dict):
        return str(pane.get("value", "") or "")
    return str(pane or "")


def evaluate_refusal(result, gold: dict, language: str = "English") -> dict:
    """
    Score a StructuredResult against an adversarial fixture sidecar.

    Not evaluated with evaluate() — adversarial fixtures have no gold values to recall.
    The pass criterion is the REAL TRUST-03 / SC1 guarantee — *the user is shown no
    four-section analysis* — verified at the render layer, plus correct classification:
    - result.doctype matches gold["expected_doctype"] (drives the refusal route)
    - render_result(result) shows no Panic Meter (panic_html == "") and no analysis
      in the why / actions / deadlines panes (render_refusal suppresses them)

    Why not the old `result.tldr is empty` proxy: a readable non-Swedish *letter* can be
    (and is) analysed by the model before the render layer suppresses it, so raw `tldr`
    is legitimately non-empty even though the user correctly sees only a refusal. The
    old proxy wrongly failed that case. tldr_empty is still reported, but ADVISORY only.

    render_result is imported lazily (it is a pure function — no model — so it is safe
    under BUREAUCAT_NO_MODEL=1, and this keeps module import of run_eval app-free).

    Returns a dict matching evaluate()'s shape (for uniform handling in slice 3).
    """
    expected = gold.get("expected_doctype", "")
    # A fixture may list several equally-correct refusal doctypes. A non-Swedish
    # English letter, for instance, is a correct refusal whether the model labels it
    # "non_swedish" (precise) or "not_letter" (generic) — both route to render_refusal
    # and show the user no analysis. accepted_doctypes makes the gate robust to that
    # benign drift across prompt tweaks; falls back to the single expected_doctype.
    accepted = gold.get("accepted_doctypes") or [expected]
    actual_doctype = getattr(result, "doctype", "letter")
    doctype_correct = actual_doctype in accepted

    from app import render_result  # lazy; pure fn, safe under BUREAUCAT_NO_MODEL=1

    rendered = render_result(result, language)
    panic_html = rendered[0]
    # Pane positions in render_result's return value: 4 = why, 5 = actions,
    # 6 = deadlines.
    no_analysis_rendered = panic_html == "" and not any(
        _render_value(rendered[idx]).strip() for idx in (4, 5, 6)
    )

    tldr_empty = not getattr(result, "tldr", None)  # advisory only
    passed = doctype_correct and no_analysis_rendered

    return {
        "pass": passed,
        "verdict": "refusal_correct" if passed else "refusal_wrong",
        "doctype": actual_doctype,
        "expected_doctype": expected,
        "doctype_correct": doctype_correct,
        "no_analysis_rendered": no_analysis_rendered,
        "tldr_empty": tldr_empty,  # advisory — model may legitimately analyse non-Swedish
    }


# ---------------------------------------------------------------------------
# Shared runner helpers
# ---------------------------------------------------------------------------

def _find_image(sidecar: Path) -> Optional[Path]:
    """Return the image next to *sidecar* (.png preferred, .jpg fallback), or None."""
    for suffix in (".png", ".jpg"):
        candidate = sidecar.with_suffix(suffix)
        if candidate.exists():
            return candidate
    return None


def _format_metrics(verdict: dict) -> str:
    """Render one evaluate() verdict as the single-line metrics summary."""
    return (
        f"recall={verdict['recall_rate']:.0%} "
        f"invented={verdict['invented_count']} "
        f"severity_mae={verdict['severity_mae']:.1f} "
        f"all_sections={verdict['all_sections_present']}"
    )


def _print_failure_details(tag: str, verdict: dict) -> None:
    """Print the invented / missing values behind a failing evaluate() verdict."""
    if verdict["pass"]:
        return
    if verdict["invented"]:
        print(f" [{tag}] Invented values: {verdict['invented']}")
    if verdict["missing"]:
        print(f" [{tag}] Missing gold values: {verdict['missing']}")


# ---------------------------------------------------------------------------
# Bake-off runner — app contracts imported LAZILY inside this function only
# ---------------------------------------------------------------------------

def run_eval(
    model_variant: str = "qwen3",
    dump_path: Optional[str] = None,
) -> bool:
    """
    Run the bake-off harness for the given model variant.

    Imports app contracts (load_model, run_inference, MODEL_VARIANTS) lazily,
    BELOW the empty-set guard.

    Iterates sorted data/letters/public/*.json sidecars. For each letter:
    - Pass A (standard, beginner_mode=False): full evaluate() gate
    - Pass B (beginner, beginner_mode=True): full evaluate() gate + beginner_invariant()

    A letter PASSES only when both passes pass AND beginner_invariant holds.

    Output language is always "English" (English-only product, 2026-06-07 descope).

    The empty-set guard fires BEFORE the app import so no model weights are
    downloaded/loaded when the gold set has not been populated yet.

    Args:
        model_variant: key into app.MODEL_VARIANTS.
        dump_path: when set, the raw per-letter outputs of both passes are written
            there as UTF-8 JSON.

    Returns:
        True only when at least one letter was scored and every scored letter
        passed; False otherwise (including an empty gold set).
    """
    # ------------------------------------------------------------------
    # Guard: check gold set exists BEFORE importing app (which triggers
    # load_model() at module scope unless BUREAUCAT_NO_MODEL=1 is set).
    # This ensures `python eval/run_eval.py` on an empty gold set exits
    # immediately with a clear message and no 16GB weight download.
    # ------------------------------------------------------------------
    # NOTE(review): this path is relative to the current working directory, so
    # the `cwd = eval/` invocation mentioned at the top of the file would find
    # no letters — confirm whether it should be anchored on _PROJECT_ROOT.
    letter_dir = Path("data/letters/public")
    sidecars = sorted(letter_dir.glob("*.json"))

    if not sidecars:
        print(
            f"\n[run_eval] ERROR: Gold set is empty — no .json sidecars in {letter_dir}.\n"
            "The bake-off gate cannot pass with zero letters (D-09).\n"
            "Add at least 5 annotated gold letters before running Plan 04 bake-off.\n"
        )
        return False

    # LAZY IMPORT — app is only imported here, after the empty-set guard,
    # so module-scope import of run_eval (for unit tests) never loads the
    # 16GB+ model, AND an empty gold set exits cleanly with no download.
    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    # Load model once for the entire eval run
    print(f"\n[run_eval] Loading model variant: {model_variant}")
    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    dump_fields = (
        "transcription", "quip", "tldr", "why", "actions",
        "deadlines", "severity", "raw", "doctype",
    )
    results = []
    dump = {}  # letter_stem -> {"standard": {...fields}, "beginner": {...fields}}

    for sidecar in sidecars:
        # NOTE(review): a sidecar without an image is skipped and therefore not
        # counted by the gate — confirm this is intended rather than a failure.
        image_path = _find_image(sidecar)
        if image_path is None:
            print(f" [SKIP] {sidecar.stem}: no matching image file")
            continue

        # Explicit UTF-8: the sidecars hold Swedish text (å/ä/ö) and the
        # platform default encoding is not UTF-8 everywhere.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # The context manager releases the image file handle after both passes.
        with Image.open(image_path) as image:
            # Pass A: standard inference (English-only product — 2026-06-07 descope)
            result_std = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc,
                image_patch_size=image_patch_size,
            )
            # Pass B: beginner-mode inference (D-08 invariance check)
            result_beg = run_inference(
                image, "English", beginner_mode=True,
                mdl=mdl, proc=proc,
                image_patch_size=image_patch_size,
            )

        verdict_std = evaluate(result_std, gold)
        verdict_beg = evaluate(result_beg, gold)
        inv_ok, inv_reasons = beginner_invariant(result_beg)

        if dump_path:
            dump[sidecar.stem] = {
                "standard": {f: getattr(result_std, f) for f in dump_fields},
                "beginner": {f: getattr(result_beg, f) for f in dump_fields},
            }

        letter_pass = verdict_std["pass"] and verdict_beg["pass"] and inv_ok

        # Per-letter output
        status = "PASS" if letter_pass else "FAIL"
        invariant_text = "OK" if inv_ok else "FAIL(" + "; ".join(inv_reasons) + ")"
        print(
            f"\n {status} {sidecar.stem}:\n"
            f" STANDARD: {_format_metrics(verdict_std)}\n"
            f" BEGINNER: {_format_metrics(verdict_beg)}\n"
            f" BEGINNER_INVARIANT: {invariant_text}"
        )
        _print_failure_details("STD", verdict_std)
        _print_failure_details("BEG", verdict_beg)

        results.append({
            "letter": sidecar.stem,
            "pass": letter_pass,
        })

    # Overall summary
    n_pass = sum(1 for r in results if r["pass"])
    n_total = len(results)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== EVAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} letters passed both passes")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")

    if dump_path:
        Path(dump_path).write_text(
            json.dumps(dump, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f" Dumped raw model outputs for {len(dump)} letters to {dump_path}")

    return gate


# ---------------------------------------------------------------------------
# Adversarial refusal scoring (Phase 3 TRUST-02/03/04)
# ---------------------------------------------------------------------------

def run_adversarial_eval(
    model_variant: str = "qwen3",
) -> tuple[bool, list[dict]]:
    """
    Score the three adversarial fixtures against the refusal gate.

    Globs data/letters/public/adversarial/*.json — separate from the gold-letter loop
    (glob("*.json") in run_eval is non-recursive so adversarial sidecars never enter
    the EVAL-02 letter loop).

    Loads the model lazily after the empty-set guard (mirrors run_eval discipline).
    Runs each fixture in English only (adversarial path is not language-specific;
    classification is always English-emitted via the DOCTYPE sentinel).

    Returns (gate_pass: bool, verdicts: list[dict]). An empty fixture directory
    is a warning, not a failure: it returns (True, []). If fixtures exist but
    none could be scored (no images), the gate fails.
    """
    adv_dir = Path("data/letters/public/adversarial")
    adv_sidecars = sorted(adv_dir.glob("*.json"))

    if not adv_sidecars:
        print(
            f"\n[run_adversarial_eval] WARNING: no adversarial fixtures in {adv_dir}.\n"
            "Skipping adversarial refusal scoring.\n"
        )
        return True, []

    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    verdicts = []
    for sidecar in adv_sidecars:
        image_path = _find_image(sidecar)
        if image_path is None:
            print(f" [SKIP-ADV] {sidecar.stem}: no matching image file")
            continue

        # Explicit UTF-8 so the sidecar decodes the same on every platform.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Scoring stays inside the `with` so the image is still open in case
        # the render layer touches it; the handle is released afterwards.
        with Image.open(image_path) as image:
            result = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc,
                image_patch_size=image_patch_size,
            )
            verdict = evaluate_refusal(result, gold)

        verdict["letter"] = sidecar.stem
        verdicts.append(verdict)

        status = "PASS" if verdict["pass"] else "FAIL"
        # no_analysis_rendered is one of the two gate criteria, so it is
        # reported next to the doctype; tldr_empty remains advisory.
        print(
            f"\n {status} [ADV] {sidecar.stem}:\n"
            f" verdict={verdict['verdict']} "
            f"doctype={verdict['doctype']!r} expected={verdict['expected_doctype']!r} "
            f"no_analysis_rendered={verdict['no_analysis_rendered']} "
            f"tldr_empty={verdict['tldr_empty']}"
        )

    n_pass = sum(1 for v in verdicts if v["pass"])
    n_total = len(verdicts)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== ADVERSARIAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} adversarial fixtures scored refusal_correct")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")

    return gate, verdicts


# ---------------------------------------------------------------------------
# CLI entry point — English-only gate (gold accuracy + adversarial refusal)
# ---------------------------------------------------------------------------

def _main():
    """Parse CLI arguments, run both gates, and exit 0 only if both pass."""
    parser = argparse.ArgumentParser(
        description="Bureaucat evaluator — EVAL-02 gate (English-only product)"
    )
    parser.add_argument(
        "--model",
        choices=["qwen3", "qwen25"],
        default="qwen3",
        help="Model variant to evaluate (default: qwen3)",
    )
    parser.add_argument(
        "--dump",
        default=None,
        metavar="PATH",
        help="Write raw per-letter StructuredResult outputs to PATH (JSON) for "
             "offline matching/gold iteration at zero GPU cost.",
    )
    args = parser.parse_args()

    # English-only product (2026-06-07 descope): gold-letter accuracy gate +
    # adversarial refusal gate. Both must pass.
    gold_gate = run_eval(args.model, dump_path=args.dump)
    adv_gate, _ = run_adversarial_eval(args.model)
    sys.exit(0 if (gold_gate and adv_gate) else 1)


if __name__ == "__main__":
    _main()