Spaces:
Running on Zero
Running on Zero
| """ | |
| EVAL-02 harness — Bureaucat bake-off evaluator. | |
| Usage: | |
| python eval/run_eval.py [--model qwen3|qwen25] [--dump PATH] | |
| English-only product (2026-06-07 descope): Bureaucat reads Swedish letters and explains | |
| them in English. The gate runs two passes: | |
| - Gold-letter accuracy (run_eval): anti-hallucination (D-12/D-13/D-14) + SC1 four-sections | |
| completeness, each letter run standard + beginner (D-08 invariance). | |
| - Adversarial refusal (run_adversarial_eval): the three bad-input fixtures must route to | |
| the refusal path (correct doctype + no analysis rendered). | |
| Both must pass for exit 0. (The earlier 5x5 multilingual matrix and the py3langid prose- | |
| language assertion were retired with the English-only descope.) | |
| A gold letter passes only when: | |
| - Pass A verdict passes (no-invention AND recall=100% AND all_sections_present AND severity) | |
| - Pass B evaluate() verdict passes (same gate on beginner output) | |
| - beginner_invariant(pass_B_result) holds (structural invariance, D-08) | |
| Severity MAE is reported but never fails the gate (D-15, advisory). | |
| Gate exits non-zero when ANY letter fails (either pass), the adversarial pass fails, or the | |
| gold set is empty. | |
| CRITICAL lazy-import contract: | |
| - Module top imports only the stdlib (json, re, unicodedata, argparse, pathlib, sys) plus the local `grounded` matching helpers | |
| - `from app import ...` lives ONLY inside run_eval()/run_adversarial_eval() so importing | |
| this module (e.g., for unit tests) never loads the model. | |
| """ | |
| import argparse | |
| import json | |
| import re | |
| import sys | |
| import unicodedata | |
| from pathlib import Path | |
| # Ensure project root is on sys.path so `from app import ...` resolves | |
| # whether this script is run as: | |
| # python eval/run_eval.py (cwd = project root) | |
| # python run_eval.py (cwd = eval/) | |
| # python -m pytest eval/ (cwd = project root) | |
| _PROJECT_ROOT = Path(__file__).resolve().parent.parent | |
| if str(_PROJECT_ROOT) not in sys.path: | |
| sys.path.insert(0, str(_PROJECT_ROOT)) | |
| # Also ensure eval/ is on sys.path so `import grounded` resolves when | |
| # run_eval.py is imported from the project root (e.g. by test_eval_matching.py). | |
| _EVAL_DIR = str(Path(__file__).resolve().parent) | |
| if _EVAL_DIR not in sys.path: | |
| sys.path.insert(0, _EVAL_DIR) | |
| # --------------------------------------------------------------------------- | |
| # Import matching primitives from shared module (D2-05 refactor). | |
| # Names re-bound here so existing `import run_eval as e; e.normalize(...)` calls | |
| # in eval/test_eval_matching.py continue to work without modification. | |
| # Use `from grounded import ...` (eval/ is on sys.path when running run_eval.py). | |
| # --------------------------------------------------------------------------- | |
| from grounded import normalize, value_found, extract_values_from_section # noqa: F401 | |
| # --------------------------------------------------------------------------- | |
| # Per-letter verdict (D-12, D-13, D-14, D-15, SC1) | |
| # --------------------------------------------------------------------------- | |
def evaluate(result, gold: dict) -> dict:
    """
    Build the per-letter verdict for one model output against its gold sidecar.

    Checks performed, in order:
      - No-invention (D-12): each value pulled out of ``result.deadlines`` by
        ``extract_values_from_section`` must be found in ``result.transcription``.
      - Recall (D-13): each ``verbatim_swedish`` string listed under the gold
        "deadlines", "amounts" and "references" keys must be found in
        ``result.deadlines``.
      - Four-sections completeness (SC1): tldr, why, actions and deadlines must
        all be truthy on ``result``.
      - Severity (D-15): absolute error against ``gold["expected_severity"]`` is
        reported; a missing severity is reported as 5.0.

    The verdict passes when nothing was invented, nothing is missing, severity
    is not None and all four sections are present. The size of the severity
    error itself never affects the verdict.

    Args:
        result: Object exposing ``deadlines``, ``transcription``, ``severity``
            and the four section attributes (e.g. a SimpleNamespace).
        gold: Gold sidecar dict; must contain ``expected_severity`` whenever
            ``result.severity`` is not None.

    Returns:
        Dict with keys pass, invented_count, invented, missing_count, missing,
        recall_rate, severity_mae, schema_complete, all_sections_present.
    """
    gold_categories = ("deadlines", "amounts", "references")
    required_sections = ("tldr", "why", "actions", "deadlines")

    # D-12: anything emitted that cannot be located in the transcription.
    invented = [
        emitted
        for emitted in extract_values_from_section(result.deadlines)
        if not value_found(emitted, result.transcription)
    ]

    # D-13: gold verbatim strings that the deadlines section failed to carry.
    missing = [
        entry["verbatim_swedish"]
        for category in gold_categories
        for entry in gold.get(category, [])
        if not value_found(entry["verbatim_swedish"], result.deadlines)
    ]

    # SC1: a section counts as present when its attribute is truthy.
    all_sections_present = all(
        bool(getattr(result, section, None)) for section in required_sections
    )

    # D-15: advisory error; 5.0 is the sentinel for "no severity was parsed".
    severity = result.severity
    has_severity = severity is not None
    sev_mae = abs(severity - gold["expected_severity"]) if has_severity else 5.0

    # Denominator for the recall rate (clamped below so an empty gold set is 100%).
    total_gold = sum(len(gold.get(category, [])) for category in gold_categories)
    n_invented = len(invented)
    n_missing = len(missing)

    passed = (
        n_invented == 0
        and n_missing == 0
        and has_severity
        and all_sections_present
    )

    return {
        "pass": passed,
        "invented_count": n_invented,
        "invented": invented,
        "missing_count": n_missing,
        "missing": missing,
        "recall_rate": 1.0 - n_missing / max(total_gold, 1),
        "severity_mae": sev_mae,
        "schema_complete": has_severity,
        "all_sections_present": all_sections_present,
    }
| # --------------------------------------------------------------------------- | |
| # D-08 beginner-mode structural invariance checker | |
| # --------------------------------------------------------------------------- | |
def beginner_invariant(result) -> tuple:
    """
    Check the D-08 structural invariants on a beginner-mode result.

    Invariants:
      - tldr, why, actions and deadlines are all truthy (a missing attribute
        counts as empty)
      - ``result.severity`` is not None
      - transcription is truthy (a missing attribute counts as empty)

    Returns:
        ``(ok, reasons)`` where ``reasons`` holds one message per violated
        invariant, in the order listed above, and ``ok`` is True exactly when
        ``reasons`` is empty.
    """
    # One message per empty section, in the fixed section order.
    failures = [
        f"section '{name}' is empty in beginner-mode output (D-08 violation)"
        for name in ("tldr", "why", "actions", "deadlines")
        if not getattr(result, name, None)
    ]

    # severity is read directly: the attribute itself is expected to exist.
    if result.severity is None:
        failures.append(
            "severity is None in beginner-mode output — SEVERITY line dropped or truncated (D-08 violation)"
        )

    if not getattr(result, "transcription", None):
        failures.append(
            "transcription is empty in beginner-mode output — transcription block dropped (D-08 violation)"
        )

    return (not failures, failures)
| # --------------------------------------------------------------------------- | |
| # Refusal scorer (Phase 3 TRUST-02/03/04) — called by slice 3 for adversarial fixtures | |
| # --------------------------------------------------------------------------- | |
def _render_value(pane) -> str:
    """
    Return the text carried by one render_result pane.

    A dict pane contributes its "value" entry; any other pane contributes
    itself. Falsy content (None, "", 0, a missing "value" key) becomes "".
    """
    content = pane.get("value", "") if isinstance(pane, dict) else pane
    return str(content or "")
def evaluate_refusal(result, gold: dict, language: str = "English") -> dict:
    """
    Score one adversarial-fixture result against its sidecar.

    Adversarial fixtures carry no gold values to recall, so evaluate() is not
    used. A fixture passes when both of these hold:
      - ``result.doctype`` (defaulting to "letter" when the attribute is absent)
        is one of ``gold["accepted_doctypes"]``, or equals
        ``gold["expected_doctype"]`` when no accepted list is given. The list
        lets a fixture accept several equally correct refusal labels.
      - The rendered output shows no analysis: pane 0 of ``render_result`` (the
        panic HTML) is exactly "" and panes 4, 5 and 6 (why / actions /
        deadlines) contain only whitespace once unwrapped by ``_render_value``.

    Whether the raw ``result.tldr`` is empty is reported as ``tldr_empty`` but
    is advisory only and does not influence the verdict.

    ``render_result`` is imported from ``app`` inside the function so that
    importing this module does not import ``app``.

    Args:
        result: Object with an optional ``doctype`` and ``tldr`` attribute,
            accepted by ``app.render_result``.
        gold: Fixture sidecar with ``expected_doctype`` and optionally
            ``accepted_doctypes``.
        language: Forwarded unchanged to ``render_result``.

    Returns:
        Dict with keys pass, verdict, doctype, expected_doctype,
        doctype_correct, no_analysis_rendered, tldr_empty.
    """
    expected = gold.get("expected_doctype", "")
    # Fall back to the single expected label when no (or an empty) list is given.
    accepted = gold.get("accepted_doctypes") or [expected]
    actual_doctype = getattr(result, "doctype", "letter")
    doctype_correct = actual_doctype in accepted

    from app import render_result  # lazy; pure fn, safe under BUREAUCAT_NO_MODEL=1

    panes = render_result(result, language)
    analysis_pane_indexes = (4, 5, 6)  # why, actions, deadlines
    # The analysis panes are only inspected when the panic pane is already blank.
    no_analysis_rendered = panes[0] == "" and not any(
        _render_value(panes[index]).strip() for index in analysis_pane_indexes
    )

    tldr_empty = not getattr(result, "tldr", None)  # advisory only

    passed = doctype_correct and no_analysis_rendered
    if passed:
        verdict = "refusal_correct"
    else:
        verdict = "refusal_wrong"

    return {
        "pass": passed,
        "verdict": verdict,
        "doctype": actual_doctype,
        "expected_doctype": expected,
        "doctype_correct": doctype_correct,
        "no_analysis_rendered": no_analysis_rendered,
        "tldr_empty": tldr_empty,  # advisory — model may legitimately analyse non-Swedish
    }
| # --------------------------------------------------------------------------- | |
| # Bake-off runner — app contracts imported LAZILY inside this function only | |
| # --------------------------------------------------------------------------- | |
def run_eval(
    model_variant: str = "qwen3",
    dump_path: str = None,
) -> bool:
    """
    Run the gold-letter bake-off for one model variant.

    Iterates the sorted ``data/letters/public/*.json`` sidecars (path relative
    to the current working directory). For each sidecar with a matching image
    (``.png`` preferred, ``.jpg`` fallback) it runs inference twice, always in
    English:
      - Pass A (beginner_mode=False): scored with evaluate()
      - Pass B (beginner_mode=True): scored with evaluate() and
        beginner_invariant()
    A letter passes only when both verdicts pass and the invariant holds.
    Sidecars without a matching image are reported as skipped and are not
    counted.

    The empty-set guard runs BEFORE ``app`` is imported, so an unpopulated gold
    set returns immediately without loading a model. ``app`` and PIL are
    imported only inside this function, which keeps module import app-free.

    Args:
        model_variant: Key into ``app.MODEL_VARIANTS``.
        dump_path: When truthy, the raw result fields of both passes for every
            scored letter are written to this path as UTF-8 JSON.

    Returns:
        True when at least one letter was scored and every scored letter
        passed; False otherwise (including an empty gold set).
    """
    # ------------------------------------------------------------------
    # Guard: check the gold set exists BEFORE importing app, so an empty
    # gold set exits with a clear message and no model load.
    # ------------------------------------------------------------------
    letter_dir = Path("data/letters/public")
    sidecars = sorted(letter_dir.glob("*.json"))
    if not sidecars:
        print(
            f"\n[run_eval] ERROR: Gold set is empty — no .json sidecars in {letter_dir}.\n"
            f"The bake-off gate cannot pass with zero letters (D-09).\n"
            f"Add at least 5 annotated gold letters before running Plan 04 bake-off.\n"
        )
        return False

    # LAZY IMPORTS — only reached after the empty-set guard. PIL is imported
    # once here rather than on every loop iteration.
    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    # Load the model once for the entire eval run.
    print(f"\n[run_eval] Loading model variant: {model_variant}")
    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    dump_fields = (
        "transcription", "quip", "tldr", "why", "actions",
        "deadlines", "severity", "raw", "doctype",
    )
    results = []
    dump = {}  # letter_stem -> {"standard": {...fields}, "beginner": {...fields}}

    for sidecar in sidecars:
        # Find the matching image (.png preferred, fallback .jpg).
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f" [SKIP] {sidecar.stem}: no matching image file")
            continue

        # Sidecars hold verbatim Swedish strings; decode explicitly as UTF-8 so
        # matching does not depend on the platform's locale encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Context manager closes the image file once both passes are done,
        # instead of leaking one open handle per letter.
        with Image.open(image_path) as image:
            # Pass A: standard inference (English-only product — 2026-06-07 descope)
            result_std = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )
            verdict_std = evaluate(result_std, gold)

            # Pass B: beginner-mode inference (D-08 invariance check)
            result_beg = run_inference(
                image, "English", beginner_mode=True,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )
            verdict_beg = evaluate(result_beg, gold)

        inv_ok, inv_reasons = beginner_invariant(result_beg)

        if dump_path:
            dump[sidecar.stem] = {
                "standard": {f: getattr(result_std, f) for f in dump_fields},
                "beginner": {f: getattr(result_beg, f) for f in dump_fields},
            }

        letter_pass = verdict_std["pass"] and verdict_beg["pass"] and inv_ok

        # Per-letter output
        status = "PASS" if letter_pass else "FAIL"
        print(
            f"\n {status} {sidecar.stem}:\n"
            f" STANDARD: recall={verdict_std['recall_rate']:.0%} "
            f"invented={verdict_std['invented_count']} "
            f"severity_mae={verdict_std['severity_mae']:.1f} "
            f"all_sections={verdict_std['all_sections_present']}\n"
            f" BEGINNER: recall={verdict_beg['recall_rate']:.0%} "
            f"invented={verdict_beg['invented_count']} "
            f"severity_mae={verdict_beg['severity_mae']:.1f} "
            f"all_sections={verdict_beg['all_sections_present']}\n"
            f" BEGINNER_INVARIANT: {'OK' if inv_ok else 'FAIL(' + '; '.join(inv_reasons) + ')'}"
        )

        # Detail lines are only printed for the pass that failed.
        if not verdict_std["pass"]:
            if verdict_std["invented"]:
                print(f" [STD] Invented values: {verdict_std['invented']}")
            if verdict_std["missing"]:
                print(f" [STD] Missing gold values: {verdict_std['missing']}")
        if not verdict_beg["pass"]:
            if verdict_beg["invented"]:
                print(f" [BEG] Invented values: {verdict_beg['invented']}")
            if verdict_beg["missing"]:
                print(f" [BEG] Missing gold values: {verdict_beg['missing']}")

        results.append({
            "letter": sidecar.stem,
            "pass": letter_pass,
        })

    # Overall summary: every scored letter must pass, and at least one must
    # have been scored (all-skipped runs fail the gate).
    n_pass = sum(1 for r in results if r["pass"])
    n_total = len(results)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== EVAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} letters passed both passes")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")

    if dump_path:
        Path(dump_path).write_text(
            json.dumps(dump, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f" Dumped raw model outputs for {len(dump)} letters to {dump_path}")

    return gate
| # --------------------------------------------------------------------------- | |
| # Adversarial refusal scoring (Phase 3 TRUST-02/03/04) | |
| # --------------------------------------------------------------------------- | |
def run_adversarial_eval(
    model_variant: str = "qwen3",
) -> tuple[bool, list[dict]]:
    """
    Score the adversarial fixtures against the refusal gate.

    Globs ``data/letters/public/adversarial/*.json`` (relative to the current
    working directory). This is separate from the gold-letter loop: the
    non-recursive ``glob("*.json")`` in run_eval never picks these sidecars up.

    Each fixture with a matching image (``.png`` preferred, ``.jpg`` fallback)
    is run once in English, standard mode, and scored with evaluate_refusal().
    Fixtures without an image are reported as skipped and are not counted.

    ``app`` and PIL are imported only after the empty-set guard, mirroring
    run_eval, so an empty fixture directory never loads a model.

    Args:
        model_variant: Key into ``app.MODEL_VARIANTS``.

    Returns:
        ``(gate_pass, verdicts)``. With no fixture sidecars at all the result
        is ``(True, [])`` and a warning is printed. Otherwise ``gate_pass`` is
        True only when at least one fixture was scored and all scored fixtures
        passed; each verdict is the evaluate_refusal() dict plus a "letter"
        key holding the sidecar stem.
    """
    adv_dir = Path("data/letters/public/adversarial")
    adv_sidecars = sorted(adv_dir.glob("*.json"))
    if not adv_sidecars:
        print(
            f"\n[run_adversarial_eval] WARNING: no adversarial fixtures in {adv_dir}.\n"
            f"Skipping adversarial refusal scoring.\n"
        )
        return True, []

    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    verdicts = []
    for sidecar in adv_sidecars:
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f" [SKIP-ADV] {sidecar.stem}: no matching image file")
            continue

        # Decode explicitly as UTF-8 so the sidecar reads the same on every
        # platform, regardless of the locale's default encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Context manager closes the image file once inference has consumed it.
        with Image.open(image_path) as image:
            result = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )

        verdict = evaluate_refusal(result, gold)
        verdict["letter"] = sidecar.stem
        verdicts.append(verdict)

        status = "PASS" if verdict["pass"] else "FAIL"
        # no_analysis_rendered is half of the pass criterion, so it is printed
        # next to the doctype check; tldr_empty stays as advisory context.
        print(
            f"\n {status} [ADV] {sidecar.stem}:\n"
            f" verdict={verdict['verdict']} "
            f"doctype={verdict['doctype']!r} expected={verdict['expected_doctype']!r} "
            f"no_analysis_rendered={verdict['no_analysis_rendered']} "
            f"tldr_empty={verdict['tldr_empty']}"
        )

    # Every scored fixture must pass, and at least one must have been scored.
    n_pass = sum(1 for v in verdicts if v["pass"])
    n_total = len(verdicts)
    gate = n_pass == n_total and n_total > 0

    print(f"\n=== ADVERSARIAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} adversarial fixtures scored refusal_correct")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")

    return gate, verdicts
| # --------------------------------------------------------------------------- | |
| # CLI entry point — English-only gate (gold accuracy + adversarial refusal) | |
| # --------------------------------------------------------------------------- | |
def _main():
    """
    Command-line entry point for the English-only EVAL-02 gate.

    Parses ``--model`` (qwen3 or qwen25, default qwen3) and ``--dump PATH``,
    runs the gold-letter gate followed by the adversarial refusal gate, and
    exits the process with status 0 only when both gates pass (1 otherwise).
    Both gates are always run, even if the first one fails.
    """
    cli = argparse.ArgumentParser(
        description="Bureaucat evaluator — EVAL-02 gate (English-only product)"
    )
    cli.add_argument(
        "--model",
        choices=["qwen3", "qwen25"],
        default="qwen3",
        help="Model variant to evaluate (default: qwen3)",
    )
    cli.add_argument(
        "--dump",
        default=None,
        metavar="PATH",
        help=(
            "Write raw per-letter StructuredResult outputs to PATH (JSON) "
            "for offline matching/gold iteration at zero GPU cost."
        ),
    )
    opts = cli.parse_args()

    # English-only product (2026-06-07 descope): gold-letter accuracy gate +
    # adversarial refusal gate. Both must pass.
    gold_ok = run_eval(opts.model, dump_path=opts.dump)
    adv_ok, _verdicts = run_adversarial_eval(opts.model)

    if gold_ok and adv_ok:
        exit_code = 0
    else:
        exit_code = 1
    sys.exit(exit_code)


if __name__ == "__main__":
    _main()