# bureaucat / eval / run_eval.py
# ravinsingh15's picture
# Bureaucat — Build Small Hackathon submission (Qwen3-VL-8B, ZeroGPU, gr.Server)
# 6b5e47d
# Raw
# History Blame Contribute Delete
# 20.5 kB
"""
EVAL-02 harness — Bureaucat bake-off evaluator.
Usage:
python eval/run_eval.py [--model qwen3|qwen25] [--dump PATH]
English-only product (2026-06-07 descope): Bureaucat reads Swedish letters and explains
them in English. The gate runs two passes:
- Gold-letter accuracy (run_eval): anti-hallucination (D-12/D-13/D-14) + SC1 four-sections
completeness, each letter run standard + beginner (D-08 invariance).
- Adversarial refusal (run_adversarial_eval): the three bad-input fixtures must route to
the refusal path (correct doctype + no analysis rendered).
Both must pass for exit 0. (The earlier 5x5 multilingual matrix and the py3langid prose-
language assertion were retired with the English-only descope.)
A gold letter passes only when:
- Pass A verdict passes (no-invention AND recall=100% AND all_sections_present AND severity)
- Pass B evaluate() verdict passes (same gate on beginner output)
- beginner_invariant(pass_B_result) holds (structural invariance, D-08)
Severity MAE is reported but never fails the gate (D-15, advisory).
Gate exits non-zero when ANY letter fails (either pass), the adversarial pass fails, or the
gold set is empty.
CRITICAL lazy-import contract:
- Module top imports only the stdlib (json, re, unicodedata, argparse, pathlib, sys) plus the
project-local `grounded` matching helpers.
- `from app import ...` lives ONLY inside function bodies (evaluate_refusal(), run_eval(),
run_adversarial_eval()) so importing this module (e.g., for unit tests) never loads the model.
"""
import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
# Ensure project root is on sys.path so `from app import ...` resolves
# whether this script is run as:
# python eval/run_eval.py (cwd = project root)
# python run_eval.py (cwd = eval/)
# python -m pytest eval/ (cwd = project root)
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))
# Also ensure eval/ is on sys.path so `import grounded` resolves when
# run_eval.py is imported from the project root (e.g. by test_eval_matching.py).
_EVAL_DIR = str(Path(__file__).resolve().parent)
if _EVAL_DIR not in sys.path:
sys.path.insert(0, _EVAL_DIR)
# ---------------------------------------------------------------------------
# Import matching primitives from shared module (D2-05 refactor).
# Names re-bound here so existing `import run_eval as e; e.normalize(...)` calls
# in eval/test_eval_matching.py continue to work without modification.
# Use `from grounded import ...` (eval/ is on sys.path when running run_eval.py).
# ---------------------------------------------------------------------------
from grounded import normalize, value_found, extract_values_from_section # noqa: F401
# ---------------------------------------------------------------------------
# Per-letter verdict (D-12, D-13, D-14, D-15, SC1)
# ---------------------------------------------------------------------------
def evaluate(result, gold: dict) -> dict:
    """
    Build the per-letter verdict for a StructuredResult-shaped object against its gold sidecar.

    Checks performed, in order:
    - No-invention (D-12): each value pulled out of result.deadlines by
      extract_values_from_section() must be found in result.transcription.
    - Recall (D-13): each "verbatim_swedish" string listed under gold["deadlines"],
      gold["amounts"] and gold["references"] must be found in result.deadlines.
    - Four-sections completeness (SC1): tldr, why, actions and deadlines are all truthy.
    - Severity (D-15): absolute error against gold["expected_severity"]; reported only.

    The verdict passes when nothing was invented, nothing is missing, severity is not
    None, and all four sections are present. The severity error itself never affects
    the pass flag.

    Only attribute access is used on *result*, so a SimpleNamespace works as well as a
    StructuredResult; this function does not import app.

    Returns a dict with keys: pass, invented_count, invented, missing_count, missing,
    recall_rate, severity_mae, schema_complete, all_sections_present.
    """
    gold_categories = ("deadlines", "amounts", "references")
    section_fields = ("tldr", "why", "actions", "deadlines")

    # D-12: emitted values that cannot be located in the transcription.
    invented = [
        value
        for value in extract_values_from_section(result.deadlines)
        if not value_found(value, result.transcription)
    ]

    # D-13: gold values (the verbatim Swedish string, Pitfall 9) absent from the output.
    missing = [
        entry["verbatim_swedish"]
        for category in gold_categories
        for entry in gold.get(category, [])
        if not value_found(entry["verbatim_swedish"], result.deadlines)
    ]

    # SC1: stop at the first empty section, just like all() would.
    all_sections_present = True
    for name in section_fields:
        if not getattr(result, name, None):
            all_sections_present = False
            break

    # D-15 (advisory): 5.0 is the sentinel used when no SEVERITY value was parsed.
    severity = result.severity
    schema_complete = severity is not None
    sev_mae = abs(severity - gold["expected_severity"]) if schema_complete else 5.0

    # Denominator for the recall rate; max(..., 1) below avoids division by zero.
    total_gold = sum(len(gold.get(category, [])) for category in gold_categories)

    passed = (
        not invented
        and not missing
        and schema_complete
        and all_sections_present
    )

    verdict = {}
    verdict["pass"] = passed
    verdict["invented_count"] = len(invented)
    verdict["invented"] = invented
    verdict["missing_count"] = len(missing)
    verdict["missing"] = missing
    verdict["recall_rate"] = 1.0 - len(missing) / max(total_gold, 1)
    verdict["severity_mae"] = sev_mae
    verdict["schema_complete"] = schema_complete
    verdict["all_sections_present"] = all_sections_present
    return verdict
# ---------------------------------------------------------------------------
# D-08 beginner-mode structural invariance checker
# ---------------------------------------------------------------------------
def beginner_invariant(result) -> tuple[bool, list[str]]:
    """
    Check D-08 structural invariance on a beginner-mode StructuredResult.

    Invariants:
    - each of tldr, why, actions, deadlines is truthy (non-empty)
    - severity is not None (a severity of 0 still counts as present)
    - transcription is truthy (non-empty)

    Every field is read with getattr(..., None), so an object that lacks an attribute
    altogether is reported as a violation rather than raising AttributeError.

    Returns (ok, reasons): ok is True when no invariant failed; reasons holds one
    human-readable message per failed invariant, in the order listed above.
    """
    # One message per empty/missing analysis section.
    reasons = [
        f"section '{field}' is empty in beginner-mode output (D-08 violation)"
        for field in ("tldr", "why", "actions", "deadlines")
        if not getattr(result, field, None)
    ]

    # `is None` (not truthiness) so that a parsed severity of 0 is accepted.
    if getattr(result, "severity", None) is None:
        reasons.append(
            "severity is None in beginner-mode output — SEVERITY line dropped or truncated (D-08 violation)"
        )

    if not getattr(result, "transcription", None):
        reasons.append(
            "transcription is empty in beginner-mode output — transcription block dropped (D-08 violation)"
        )

    return (not reasons, reasons)
# ---------------------------------------------------------------------------
# Refusal scorer (Phase 3 TRUST-02/03/04) — called by slice 3 for adversarial fixtures
# ---------------------------------------------------------------------------
def _render_value(pane) -> str:
"""Extract the text value from a render_result pane (gr.update dict or str)."""
if isinstance(pane, dict):
return str(pane.get("value", "") or "")
return str(pane or "")
def evaluate_refusal(result, gold: dict, language: str = "English") -> dict:
    """
    Score a StructuredResult against an adversarial fixture sidecar.

    Adversarial fixtures carry no gold values to recall, so evaluate() is not used.
    The verdict passes when BOTH hold:
    - classification: result.doctype (defaulting to "letter" when the attribute is
      absent) is one of gold["accepted_doctypes"], or equals gold["expected_doctype"]
      when no accepted list is given;
    - rendering: render_result(result, language) yields an empty-string first element
      (the Panic Meter HTML) and whitespace-only text in elements 4, 5 and 6
      (why / actions / deadlines) — i.e. the user is shown no analysis (TRUST-03 / SC1).

    Raw result.tldr is deliberately NOT part of the gate: the model may analyse a
    readable non-Swedish letter before the render layer suppresses it, so tldr_empty
    is reported as an advisory field only.

    render_result is imported lazily inside the function so importing this module
    stays app-free.

    Returns a dict with keys: pass, verdict ("refusal_correct" / "refusal_wrong"),
    doctype, expected_doctype, doctype_correct, no_analysis_rendered, tldr_empty.
    """
    expected = gold.get("expected_doctype", "")

    # Several doctypes may be equally correct refusals for one fixture (for example
    # "non_swedish" vs "not_letter" for an English letter); fall back to the single
    # expected doctype when the sidecar gives no explicit list.
    accepted = gold.get("accepted_doctypes") or [expected]
    actual_doctype = getattr(result, "doctype", "letter")
    doctype_correct = actual_doctype in accepted

    from app import render_result  # lazy; pure fn, safe under BUREAUCAT_NO_MODEL=1

    panes = render_result(result, language)

    # Panes 4/5/6 are only inspected once the Panic Meter pane is confirmed empty,
    # and inspection stops at the first pane that still carries text.
    analysis_panes = (4, 5, 6)  # why, actions, deadlines
    no_analysis_rendered = panes[0] == "" and not any(
        _render_value(panes[idx]).strip() for idx in analysis_panes
    )

    passed = doctype_correct and no_analysis_rendered

    report = {}
    report["pass"] = passed
    report["verdict"] = "refusal_correct" if passed else "refusal_wrong"
    report["doctype"] = actual_doctype
    report["expected_doctype"] = expected
    report["doctype_correct"] = doctype_correct
    report["no_analysis_rendered"] = no_analysis_rendered
    # Advisory only — the model may legitimately analyse non-Swedish input.
    report["tldr_empty"] = not getattr(result, "tldr", None)
    return report
# ---------------------------------------------------------------------------
# Bake-off runner — app contracts are imported LAZILY inside the function body
# ---------------------------------------------------------------------------
def run_eval(
    model_variant: str = "qwen3",
    dump_path: str = None,
) -> bool:
    """
    Run the gold-letter bake-off harness for the given model variant.

    Iterates the sorted data/letters/public/*.json sidecars (path relative to the
    current working directory). For each sidecar with a matching .png (preferred)
    or .jpg image:
    - Pass A (standard, beginner_mode=False): full evaluate() gate
    - Pass B (beginner, beginner_mode=True): full evaluate() gate + beginner_invariant()
    A letter PASSES only when both passes pass AND beginner_invariant holds.
    Sidecars without an image are reported as [SKIP] and excluded from the totals.

    The output language passed to run_inference() is always "English".

    Args:
        model_variant: key into app.MODEL_VARIANTS selecting the model to load.
        dump_path: when given, the raw per-letter result fields of both passes are
            written to this path as UTF-8 JSON after the loop.

    Returns:
        True when at least one letter was evaluated and every evaluated letter
        passed; False otherwise, including when the gold set is empty.

    app and PIL are imported lazily, AFTER the empty-set guard, so importing this
    module never loads the model and an empty gold set exits without touching it.
    """
    # ------------------------------------------------------------------
    # Guard: check gold set exists BEFORE importing app (which triggers
    # load_model() at module scope unless BUREAUCAT_NO_MODEL=1 is set).
    # This ensures `python eval/run_eval.py` on an empty gold set exits
    # immediately with a clear message and no 16GB weight download.
    # ------------------------------------------------------------------
    letter_dir = Path("data/letters/public")
    sidecars = sorted(letter_dir.glob("*.json"))
    if not sidecars:
        print(
            f"\n[run_eval] ERROR: Gold set is empty — no .json sidecars in {letter_dir}.\n"
            f"The bake-off gate cannot pass with zero letters (D-09).\n"
            f"Add at least 5 annotated gold letters before running Plan 04 bake-off.\n"
        )
        return False

    # LAZY IMPORTS — only reached once there is something to evaluate.
    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    # Load model once for the entire eval run
    print(f"\n[run_eval] Loading model variant: {model_variant}")
    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    # Result attributes captured per pass when --dump is requested.
    dump_fields = (
        "transcription", "quip", "tldr", "why", "actions",
        "deadlines", "severity", "raw", "doctype",
    )

    results = []
    dump = {}  # letter_stem -> {"standard": {...fields}, "beginner": {...fields}}
    for sidecar in sidecars:
        # Find matching image (.png preferred, fallback .jpg)
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f" [SKIP] {sidecar.stem}: no matching image file")
            continue

        # Sidecars hold verbatim Swedish text; decode explicitly as UTF-8 instead of
        # relying on the platform's locale encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Keep the image file open only for the two inference passes, then release
        # the handle so a long gold set does not accumulate open files.
        with Image.open(image_path) as image:
            # Pass A: standard inference (English-only product — 2026-06-07 descope)
            result_std = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )
            # Pass B: beginner-mode inference (D-08 invariance check)
            result_beg = run_inference(
                image, "English", beginner_mode=True,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )

        verdict_std = evaluate(result_std, gold)
        verdict_beg = evaluate(result_beg, gold)
        inv_ok, inv_reasons = beginner_invariant(result_beg)

        if dump_path:
            dump[sidecar.stem] = {
                "standard": {f: getattr(result_std, f) for f in dump_fields},
                "beginner": {f: getattr(result_beg, f) for f in dump_fields},
            }

        letter_pass = verdict_std["pass"] and verdict_beg["pass"] and inv_ok

        # Per-letter output
        status = "PASS" if letter_pass else "FAIL"
        print(
            f"\n {status} {sidecar.stem}:\n"
            f" STANDARD: recall={verdict_std['recall_rate']:.0%} "
            f"invented={verdict_std['invented_count']} "
            f"severity_mae={verdict_std['severity_mae']:.1f} "
            f"all_sections={verdict_std['all_sections_present']}\n"
            f" BEGINNER: recall={verdict_beg['recall_rate']:.0%} "
            f"invented={verdict_beg['invented_count']} "
            f"severity_mae={verdict_beg['severity_mae']:.1f} "
            f"all_sections={verdict_beg['all_sections_present']}\n"
            f" BEGINNER_INVARIANT: {'OK' if inv_ok else 'FAIL(' + '; '.join(inv_reasons) + ')'}"
        )

        # Failure details, standard pass first, then beginner pass.
        for tag, verdict in (("STD", verdict_std), ("BEG", verdict_beg)):
            if verdict["pass"]:
                continue
            if verdict["invented"]:
                print(f" [{tag}] Invented values: {verdict['invented']}")
            if verdict["missing"]:
                print(f" [{tag}] Missing gold values: {verdict['missing']}")

        results.append({
            "letter": sidecar.stem,
            "pass": letter_pass,
        })

    # Overall summary. NOTE: skipped sidecars are not in `results`, so they neither
    # pass nor fail the gate; n_total > 0 guards the "everything skipped" case.
    n_pass = sum(1 for r in results if r["pass"])
    n_total = len(results)
    gate = n_pass == n_total and n_total > 0
    print(f"\n=== EVAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} letters passed both passes")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")

    if dump_path:
        Path(dump_path).write_text(
            json.dumps(dump, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f" Dumped raw model outputs for {len(dump)} letters to {dump_path}")

    return gate
# ---------------------------------------------------------------------------
# Adversarial refusal scoring (Phase 3 TRUST-02/03/04)
# ---------------------------------------------------------------------------
def run_adversarial_eval(
    model_variant: str = "qwen3",
) -> tuple[bool, list[dict]]:
    """
    Score the adversarial fixtures against the refusal gate.

    Globs data/letters/public/adversarial/*.json (relative to the current working
    directory). These sidecars live in a sub-directory, so the non-recursive
    glob("*.json") used for the gold-letter loop never picks them up.

    Each fixture with a matching .png (preferred) or .jpg image is run once through
    run_inference() in English with beginner_mode=False and scored with
    evaluate_refusal(). Fixtures without an image are reported as [SKIP-ADV] and
    excluded from the totals.

    app and PIL are imported lazily, after the empty-set guard.

    Args:
        model_variant: key into app.MODEL_VARIANTS selecting the model to load.

    Returns:
        (gate_pass, verdicts). When the fixture directory holds no sidecars the
        scoring is skipped with a warning and (True, []) is returned. Otherwise
        gate_pass is True only when at least one fixture was scored and all scored
        fixtures passed; verdicts holds one evaluate_refusal() dict per scored
        fixture, with an added "letter" key (the sidecar stem).
    """
    adv_dir = Path("data/letters/public/adversarial")
    adv_sidecars = sorted(adv_dir.glob("*.json"))
    if not adv_sidecars:
        print(
            f"\n[run_adversarial_eval] WARNING: no adversarial fixtures in {adv_dir}.\n"
            f"Skipping adversarial refusal scoring.\n"
        )
        return True, []

    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
    from PIL import Image  # noqa: PLC0415

    variant_cfg = MODEL_VARIANTS[model_variant]
    image_patch_size = variant_cfg["image_patch_size"]
    mdl, proc = load_model(model_variant)

    verdicts = []
    for sidecar in adv_sidecars:
        # Find matching image (.png preferred, fallback .jpg)
        image_path = sidecar.with_suffix(".png")
        if not image_path.exists():
            image_path = sidecar.with_suffix(".jpg")
        if not image_path.exists():
            print(f" [SKIP-ADV] {sidecar.stem}: no matching image file")
            continue

        # Decode explicitly as UTF-8 rather than the platform's locale encoding.
        gold = json.loads(sidecar.read_text(encoding="utf-8"))

        # Hold the image file open only while inference runs, then release it.
        with Image.open(image_path) as image:
            result = run_inference(
                image, "English", beginner_mode=False,
                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
            )

        verdict = evaluate_refusal(result, gold)
        verdict["letter"] = sidecar.stem
        verdicts.append(verdict)

        status = "PASS" if verdict["pass"] else "FAIL"
        print(
            f"\n {status} [ADV] {sidecar.stem}:\n"
            f" verdict={verdict['verdict']} "
            f"doctype={verdict['doctype']!r} expected={verdict['expected_doctype']!r} "
            f"tldr_empty={verdict['tldr_empty']}"
        )

    # n_total > 0 makes the gate fail when every fixture was skipped for lack of an image.
    n_pass = sum(1 for v in verdicts if v["pass"])
    n_total = len(verdicts)
    gate = n_pass == n_total and n_total > 0
    print(f"\n=== ADVERSARIAL RESULTS ({model_variant}) ===")
    print(f" Overall: {n_pass}/{n_total} adversarial fixtures scored refusal_correct")
    print(f" GATE: {'PASS' if gate else 'FAIL'}")
    return gate, verdicts
# ---------------------------------------------------------------------------
# CLI entry point — English-only gate (gold accuracy + adversarial refusal)
# ---------------------------------------------------------------------------
def _main():
    """
    Command-line entry point for the English-only EVAL-02 gate.

    Parses --model and --dump, runs the gold-letter accuracy gate followed by the
    adversarial refusal gate (the second runs even if the first failed), and exits
    with status 0 only when both gates passed, 1 otherwise.
    """
    cli = argparse.ArgumentParser(
        description="Bureaucat evaluator — EVAL-02 gate (English-only product)"
    )
    cli.add_argument(
        "--model",
        choices=["qwen3", "qwen25"],
        default="qwen3",
        help="Model variant to evaluate (default: qwen3)",
    )
    dump_help = (
        "Write raw per-letter StructuredResult outputs to PATH (JSON) for "
        "offline matching/gold iteration at zero GPU cost."
    )
    cli.add_argument("--dump", default=None, metavar="PATH", help=dump_help)
    opts = cli.parse_args()

    # English-only product (2026-06-07 descope): both gates must pass.
    gold_gate = run_eval(opts.model, dump_path=opts.dump)
    adv_gate, _verdicts = run_adversarial_eval(opts.model)

    exit_code = 0 if (gold_gate and adv_gate) else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    _main()