Spaces:

build-small-hackathon
/

bureaucat

Running on Zero

App Files Files Community

bureaucat / eval /run_eval.py

ravinsingh15

Bureaucat — Build Small Hackathon submission (Qwen3-VL-8B, ZeroGPU, gr.Server)

6b5e47d 21 days ago

Raw

History Blame Contribute Delete

20.5 kB

	"""
	EVAL-02 harness — Bureaucat bake-off evaluator.

	Usage:
	python eval/run_eval.py [--model qwen3|qwen25] [--dump PATH]

	English-only product (2026-06-07 descope): Bureaucat reads Swedish letters and explains
	them in English. The gate runs two passes:
	- Gold-letter accuracy (run_eval): anti-hallucination (D-12/D-13/D-14) + SC1 four-sections
	completeness, each letter run standard + beginner (D-08 invariance).
	- Adversarial refusal (run_adversarial_eval): the three bad-input fixtures must route to
	the refusal path (correct doctype + no analysis rendered).
	Both must pass for exit 0. (The earlier 5x5 multilingual matrix and the py3langid prose-
	language assertion were retired with the English-only descope.)

	A gold letter passes only when:
	- Pass A verdict passes (no-invention AND recall=100% AND all_sections_present AND severity)
	- Pass B evaluate() verdict passes (same gate on beginner output)
	- beginner_invariant(pass_B_result) holds (structural invariance, D-08)

	Severity MAE is reported but never fails the gate (D-15, advisory).

	Gate exits non-zero when ANY letter fails (either pass), the adversarial pass fails, or the
	gold set is empty.

	CRITICAL lazy-import contract:
	- Stdlib-only at module top (json, re, unicodedata, argparse, pathlib, sys)
	- `from app import ...` lives ONLY inside run_eval()/run_adversarial_eval() so importing
	this module (e.g., for unit tests) never loads the model.
	"""

	import argparse
	import json
	import re
	import sys
	import unicodedata
	from pathlib import Path

	# Make both the project root and eval/ importable however this file is loaded:
	#   python eval/run_eval.py    (cwd = project root)
	#   python run_eval.py         (cwd = eval/)
	#   python -m pytest eval/     (cwd = project root)
	# The project root lets `from app import ...` resolve; eval/ lets
	# `import grounded` resolve when run_eval.py is imported from the project
	# root (e.g. by test_eval_matching.py).
	_THIS_DIR = Path(__file__).resolve().parent
	_PROJECT_ROOT = _THIS_DIR.parent
	_EVAL_DIR = str(_THIS_DIR)
	# Order matters: eval/ is handled last, so when both entries are inserted it
	# ends up ahead of the project root on sys.path.
	for _entry in (str(_PROJECT_ROOT), _EVAL_DIR):
	    if _entry not in sys.path:
	        sys.path.insert(0, _entry)

	# ---------------------------------------------------------------------------
	# Import matching primitives from shared module (D2-05 refactor).
	# Names re-bound here so existing `import run_eval as e; e.normalize(...)` calls
	# in eval/test_eval_matching.py continue to work without modification.
	# Use `from grounded import ...` (eval/ is on sys.path when running run_eval.py).
	# ---------------------------------------------------------------------------
	from grounded import normalize, value_found, extract_values_from_section # noqa: F401


	# ---------------------------------------------------------------------------
	# Per-letter verdict (D-12, D-13, D-14, D-15, SC1)
	# ---------------------------------------------------------------------------

	def evaluate(result, gold: dict) -> dict:
	    """
	    Build the per-letter verdict for one model output against its gold sidecar.

	    Checks performed:
	    - No-invention (D-12): every value that extract_values_from_section() pulls
	      out of result.deadlines must be found in result.transcription, as judged
	      by value_found().
	    - Recall (D-13): every "verbatim_swedish" string listed under the gold
	      "deadlines", "amounts" and "references" keys must be found in
	      result.deadlines, as judged by value_found().
	    - Four-sections completeness (SC1): tldr, why, actions and deadlines must
	      all be truthy on the result.
	    - Severity (D-15): the absolute error against gold["expected_severity"] is
	      reported but is advisory; only a missing severity (None) fails the gate.

	    PASS = nothing invented AND nothing missing AND severity is not None AND
	    all four sections present.

	    `result` only needs the attributes read here (any SimpleNamespace or
	    StructuredResult-shaped object works); this function does not import app.

	    Returns a dict with keys: pass, invented_count, invented, missing_count,
	    missing, recall_rate, severity_mae, schema_complete, all_sections_present.
	    """
	    gold_categories = ("deadlines", "amounts", "references")
	    section_names = ("tldr", "why", "actions", "deadlines")

	    # D-12: emitted values that cannot be traced back to the transcription.
	    invented = [
	        emitted
	        for emitted in extract_values_from_section(result.deadlines)
	        if not value_found(emitted, result.transcription)
	    ]

	    # D-13: compare on the verbatim Swedish string of each gold entry (Pitfall 9).
	    gold_values = [
	        entry["verbatim_swedish"]
	        for category in gold_categories
	        for entry in gold.get(category, [])
	    ]
	    missing = [
	        verbatim
	        for verbatim in gold_values
	        if not value_found(verbatim, result.deadlines)
	    ]

	    # SC1: a section counts as present when its attribute exists and is truthy.
	    empty_sections = [
	        name for name in section_names if not getattr(result, name, None)
	    ]
	    all_sections_present = not empty_sections

	    # D-15: advisory only. 5.0 is the sentinel used when the output was
	    # truncated before the SEVERITY line, leaving severity as None.
	    severity = result.severity
	    schema_complete = severity is not None
	    if schema_complete:
	        severity_mae = abs(severity - gold["expected_severity"])
	    else:
	        severity_mae = 5.0

	    # max(..., 1) keeps the recall rate defined when the sidecar lists no values.
	    recall_rate = 1.0 - len(missing) / max(len(gold_values), 1)

	    passed = (
	        not invented
	        and not missing
	        and schema_complete
	        and all_sections_present
	    )

	    return {
	        "pass": passed,
	        "invented_count": len(invented),
	        "invented": invented,
	        "missing_count": len(missing),
	        "missing": missing,
	        "recall_rate": recall_rate,
	        "severity_mae": severity_mae,
	        "schema_complete": schema_complete,
	        "all_sections_present": all_sections_present,
	    }


	# ---------------------------------------------------------------------------
	# D-08 beginner-mode structural invariance checker
	# ---------------------------------------------------------------------------

	def beginner_invariant(result) -> tuple:
	    """
	    Check D-08 structural invariance on a beginner-mode result.

	    The beginner output must keep the same structure as the standard output:
	    - tldr, why, actions and deadlines are all non-empty
	    - severity is not None (the SEVERITY line was present and parsed)
	    - transcription is non-empty

	    Returns (ok, reasons): ok is True when every invariant holds; otherwise
	    reasons holds one message per violated invariant, sections first, then
	    severity, then transcription.
	    """
	    # One message per empty (or absent) section, in fixed section order.
	    problems = [
	        f"section '{name}' is empty in beginner-mode output (D-08 violation)"
	        for name in ("tldr", "why", "actions", "deadlines")
	        if not getattr(result, name, None)
	    ]

	    # Only None counts as missing; a severity of 0 is a valid parsed value.
	    if result.severity is None:
	        problems.append(
	            "severity is None in beginner-mode output — SEVERITY line dropped or truncated (D-08 violation)"
	        )

	    if not getattr(result, "transcription", None):
	        problems.append(
	            "transcription is empty in beginner-mode output — transcription block dropped (D-08 violation)"
	        )

	    return (not problems, problems)


	# ---------------------------------------------------------------------------
	# Refusal scorer (Phase 3 TRUST-02/03/04) — called by slice 3 for adversarial fixtures
	# ---------------------------------------------------------------------------

	def _render_value(pane) -> str:
	    """
	    Return the text carried by a render_result pane.

	    A pane is either a dict (a gr.update-style payload whose text lives under
	    "value") or a plain value. Falsy content (None, "", 0, a missing "value"
	    key) is normalised to the empty string.
	    """
	    content = pane.get("value", "") if isinstance(pane, dict) else pane
	    return str(content or "")


	def evaluate_refusal(result, gold: dict, language: str = "English") -> dict:
	    """
	    Score one model output against an adversarial fixture sidecar.

	    Adversarial fixtures carry no gold values to recall, so evaluate() is not
	    used. A fixture passes when BOTH hold:
	    - result.doctype is one of the accepted refusal doctypes, and
	    - the rendered output shows the user no analysis: render_result() yields an
	      empty Panic Meter pane (index 0) and empty why / actions / deadlines
	      panes (indices 4, 5, 6).

	    The raw `tldr` is deliberately NOT part of the gate: the model may analyse
	    a readable non-Swedish letter before the render layer suppresses it, so a
	    non-empty raw tldr does not mean the user saw an analysis. It is reported
	    as `tldr_empty` for information only.

	    Accepted doctypes come from gold["accepted_doctypes"] when present and
	    non-empty, otherwise from the single gold["expected_doctype"]. This lets a
	    fixture tolerate equally-correct labels (e.g. "non_swedish" vs
	    "not_letter") that both lead to a refusal.

	    Returns a dict with keys: pass, verdict ("refusal_correct" or
	    "refusal_wrong"), doctype, expected_doctype, doctype_correct,
	    no_analysis_rendered, tldr_empty.
	    """
	    expected = gold.get("expected_doctype", "")
	    accepted = gold.get("accepted_doctypes") or [expected]

	    # A result without a doctype attribute is treated as an ordinary letter.
	    actual_doctype = getattr(result, "doctype", "letter")
	    doctype_correct = actual_doctype in accepted

	    # Imported here rather than at module top so importing run_eval never
	    # imports app. The original author notes render_result is a pure function
	    # that is safe under BUREAUCAT_NO_MODEL=1.
	    from app import render_result  # noqa: PLC0415

	    rendered = render_result(result, language)
	    panic_html = rendered[0]
	    analysis_pane_indices = (4, 5, 6)  # why, actions, deadlines
	    # The panes are only inspected when the Panic Meter is empty.
	    no_analysis_rendered = panic_html == "" and not any(
	        _render_value(rendered[index]).strip()
	        for index in analysis_pane_indices
	    )

	    # Advisory only: the model may legitimately analyse a non-Swedish letter.
	    tldr_empty = not getattr(result, "tldr", None)

	    passed = doctype_correct and no_analysis_rendered
	    if passed:
	        verdict = "refusal_correct"
	    else:
	        verdict = "refusal_wrong"

	    return {
	        "pass": passed,
	        "verdict": verdict,
	        "doctype": actual_doctype,
	        "expected_doctype": expected,
	        "doctype_correct": doctype_correct,
	        "no_analysis_rendered": no_analysis_rendered,
	        "tldr_empty": tldr_empty,
	    }


	# ---------------------------------------------------------------------------
	# Bake-off runner — app contracts imported LAZILY inside this function only
	# ---------------------------------------------------------------------------

	def run_eval(
	    model_variant: str = "qwen3",
	    dump_path: "str | None" = None,
	) -> bool:
	    """
	    Run the gold-letter bake-off harness for the given model variant.

	    Iterates the sorted data/letters/public/*.json sidecars (path is relative to
	    the current working directory). Each sidecar needs a sibling image with the
	    same stem (.png preferred, .jpg as fallback); sidecars without one are
	    reported as skipped and do not count towards the totals. For each letter:
	    - Pass A (beginner_mode=False): evaluate() gate
	    - Pass B (beginner_mode=True): evaluate() gate + beginner_invariant()

	    Both passes request "English" output (English-only product). A letter passes
	    only when both verdicts pass AND the beginner invariant holds.

	    Args:
	        model_variant: key into app.MODEL_VARIANTS selecting the model to load.
	        dump_path: when set, the raw result fields of both passes for every
	            evaluated letter are written to this path as UTF-8 JSON.

	    Returns:
	        True only when at least one letter was evaluated and all evaluated
	        letters passed; False otherwise, including when no sidecars exist.

	    The app and PIL imports happen only after the empty-set guard, so importing
	    this module, or running it against an empty gold set, never imports app.
	    """
	    # Guard first: per the original author, importing app triggers model loading
	    # unless BUREAUCAT_NO_MODEL=1 is set, so an empty gold set must bail out
	    # before that import.
	    letter_dir = Path("data/letters/public")
	    sidecars = sorted(letter_dir.glob("*.json"))

	    if not sidecars:
	        print(
	            f"\n[run_eval] ERROR: Gold set is empty — no .json sidecars in {letter_dir}.\n"
	            f"The bake-off gate cannot pass with zero letters (D-09).\n"
	            f"Add at least 5 annotated gold letters before running Plan 04 bake-off.\n"
	        )
	        return False

	    # Lazy imports: the only place run_eval touches app / PIL.
	    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
	    from PIL import Image  # noqa: PLC0415

	    # Load the model once for the whole run.
	    print(f"\n[run_eval] Loading model variant: {model_variant}")
	    variant_cfg = MODEL_VARIANTS[model_variant]
	    image_patch_size = variant_cfg["image_patch_size"]
	    mdl, proc = load_model(model_variant)

	    dump_fields = (
	        "transcription", "quip", "tldr", "why", "actions",
	        "deadlines", "severity", "raw", "doctype",
	    )
	    results = []
	    dump = {}  # letter_stem -> {"standard": {...fields}, "beginner": {...fields}}
	    for sidecar in sidecars:
	        # Find the matching image (.png preferred, .jpg as fallback).
	        image_path = sidecar.with_suffix(".png")
	        if not image_path.exists():
	            image_path = sidecar.with_suffix(".jpg")
	        if not image_path.exists():
	            print(f" [SKIP] {sidecar.stem}: no matching image file")
	            continue

	        # Sidecars carry Swedish text; decode explicitly as UTF-8 instead of
	        # relying on the platform's locale encoding.
	        gold = json.loads(sidecar.read_text(encoding="utf-8"))

	        # The context manager releases the image's file handle once both
	        # inference passes are done.
	        with Image.open(image_path) as image:
	            # Pass A: standard inference.
	            result_std = run_inference(
	                image, "English", beginner_mode=False,
	                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
	            )
	            # Pass B: beginner-mode inference (D-08 invariance check).
	            result_beg = run_inference(
	                image, "English", beginner_mode=True,
	                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
	            )

	        verdict_std = evaluate(result_std, gold)
	        verdict_beg = evaluate(result_beg, gold)
	        inv_ok, inv_reasons = beginner_invariant(result_beg)

	        if dump_path:
	            dump[sidecar.stem] = {
	                "standard": {f: getattr(result_std, f) for f in dump_fields},
	                "beginner": {f: getattr(result_beg, f) for f in dump_fields},
	            }

	        letter_pass = verdict_std["pass"] and verdict_beg["pass"] and inv_ok

	        # Per-letter report.
	        status = "PASS" if letter_pass else "FAIL"
	        print(
	            f"\n {status} {sidecar.stem}:\n"
	            f" STANDARD: recall={verdict_std['recall_rate']:.0%} "
	            f"invented={verdict_std['invented_count']} "
	            f"severity_mae={verdict_std['severity_mae']:.1f} "
	            f"all_sections={verdict_std['all_sections_present']}\n"
	            f" BEGINNER: recall={verdict_beg['recall_rate']:.0%} "
	            f"invented={verdict_beg['invented_count']} "
	            f"severity_mae={verdict_beg['severity_mae']:.1f} "
	            f"all_sections={verdict_beg['all_sections_present']}\n"
	            f" BEGINNER_INVARIANT: {'OK' if inv_ok else 'FAIL(' + '; '.join(inv_reasons) + ')'}"
	        )
	        # For a failing pass, list the offending values so the failure can be
	        # diagnosed from the console output alone.
	        for tag, verdict in (("STD", verdict_std), ("BEG", verdict_beg)):
	            if verdict["pass"]:
	                continue
	            if verdict["invented"]:
	                print(f" [{tag}] Invented values: {verdict['invented']}")
	            if verdict["missing"]:
	                print(f" [{tag}] Missing gold values: {verdict['missing']}")

	        results.append({
	            "letter": sidecar.stem,
	            "pass": letter_pass,
	        })

	    # Overall summary. n_total > 0 makes the gate fail when every sidecar was
	    # skipped for lack of an image.
	    n_pass = sum(1 for r in results if r["pass"])
	    n_total = len(results)
	    gate = n_pass == n_total and n_total > 0

	    print(f"\n=== EVAL RESULTS ({model_variant}) ===")
	    print(f" Overall: {n_pass}/{n_total} letters passed both passes")
	    print(f" GATE: {'PASS' if gate else 'FAIL'}")

	    if dump_path:
	        Path(dump_path).write_text(
	            json.dumps(dump, ensure_ascii=False, indent=2), encoding="utf-8"
	        )
	        print(f" Dumped raw model outputs for {len(dump)} letters to {dump_path}")

	    return gate


	# ---------------------------------------------------------------------------
	# Adversarial refusal scoring (Phase 3 TRUST-02/03/04)
	# ---------------------------------------------------------------------------

	def run_adversarial_eval(
	    model_variant: str = "qwen3",
	) -> tuple[bool, list[dict]]:
	    """
	    Score the adversarial fixtures against the refusal gate.

	    Globs data/letters/public/adversarial/*.json (relative to the current
	    working directory). This directory is separate from the gold-letter loop:
	    run_eval's glob("*.json") is non-recursive, so these sidecars never enter
	    it. Each sidecar needs a sibling image with the same stem (.png preferred,
	    .jpg as fallback); sidecars without one are reported as skipped.

	    Every fixture is run once in standard mode with "English" output and scored
	    with evaluate_refusal().

	    Args:
	        model_variant: key into app.MODEL_VARIANTS selecting the model to load.

	    Returns:
	        (gate_pass, verdicts). verdicts holds one evaluate_refusal() dict per
	        scored fixture, with the fixture stem added under "letter". gate_pass is
	        True when at least one fixture was scored and all scored fixtures
	        passed. When the directory holds no sidecars at all, scoring is skipped
	        with a warning and (True, []) is returned.

	    The app and PIL imports happen only after the empty-set guard, mirroring
	    run_eval.
	    """
	    adv_dir = Path("data/letters/public/adversarial")
	    adv_sidecars = sorted(adv_dir.glob("*.json"))

	    if not adv_sidecars:
	        print(
	            f"\n[run_adversarial_eval] WARNING: no adversarial fixtures in {adv_dir}.\n"
	            f"Skipping adversarial refusal scoring.\n"
	        )
	        return True, []

	    from app import load_model, run_inference, MODEL_VARIANTS  # noqa: PLC0415
	    from PIL import Image  # noqa: PLC0415

	    variant_cfg = MODEL_VARIANTS[model_variant]
	    image_patch_size = variant_cfg["image_patch_size"]
	    mdl, proc = load_model(model_variant)

	    verdicts = []
	    for sidecar in adv_sidecars:
	        # Find the matching image (.png preferred, .jpg as fallback).
	        image_path = sidecar.with_suffix(".png")
	        if not image_path.exists():
	            image_path = sidecar.with_suffix(".jpg")
	        if not image_path.exists():
	            print(f" [SKIP-ADV] {sidecar.stem}: no matching image file")
	            continue

	        # Decode explicitly as UTF-8 instead of relying on the locale encoding.
	        gold = json.loads(sidecar.read_text(encoding="utf-8"))

	        # The context manager releases the image's file handle after inference.
	        with Image.open(image_path) as image:
	            result = run_inference(
	                image, "English", beginner_mode=False,
	                mdl=mdl, proc=proc, image_patch_size=image_patch_size,
	            )

	        verdict = evaluate_refusal(result, gold)
	        verdict["letter"] = sidecar.stem
	        verdicts.append(verdict)

	        # Report both halves of the pass criterion (doctype and
	        # no_analysis_rendered) so a FAIL can be attributed; tldr_empty is
	        # advisory.
	        status = "PASS" if verdict["pass"] else "FAIL"
	        print(
	            f"\n {status} [ADV] {sidecar.stem}:\n"
	            f" verdict={verdict['verdict']} "
	            f"doctype={verdict['doctype']!r} expected={verdict['expected_doctype']!r} "
	            f"no_analysis_rendered={verdict['no_analysis_rendered']} "
	            f"tldr_empty={verdict['tldr_empty']}"
	        )

	    # n_total > 0 makes the gate fail when every fixture was skipped for lack
	    # of an image.
	    n_pass = sum(1 for v in verdicts if v["pass"])
	    n_total = len(verdicts)
	    gate = n_pass == n_total and n_total > 0

	    print(f"\n=== ADVERSARIAL RESULTS ({model_variant}) ===")
	    print(f" Overall: {n_pass}/{n_total} adversarial fixtures scored refusal_correct")
	    print(f" GATE: {'PASS' if gate else 'FAIL'}")

	    return gate, verdicts


	# ---------------------------------------------------------------------------
	# CLI entry point — English-only gate (gold accuracy + adversarial refusal)
	# ---------------------------------------------------------------------------

	def _main():
	    """
	    CLI entry point: run the gold-letter gate, then the adversarial refusal gate.

	    Both gates are always executed, so a gold-letter failure does not hide the
	    adversarial report. The process exits with status 0 only when both gates
	    pass, and 1 otherwise.
	    """
	    cli = argparse.ArgumentParser(
	        description="Bureaucat evaluator — EVAL-02 gate (English-only product)"
	    )
	    cli.add_argument(
	        "--model",
	        choices=["qwen3", "qwen25"],
	        default="qwen3",
	        help="Model variant to evaluate (default: qwen3)",
	    )
	    cli.add_argument(
	        "--dump",
	        default=None,
	        metavar="PATH",
	        help=(
	            "Write raw per-letter StructuredResult outputs to PATH (JSON) for "
	            "offline matching/gold iteration at zero GPU cost."
	        ),
	    )
	    opts = cli.parse_args()

	    # English-only product (2026-06-07 descope): gold-letter accuracy gate +
	    # adversarial refusal gate. Both must pass.
	    gold_gate = run_eval(opts.model, dump_path=opts.dump)
	    adv_gate, _ = run_adversarial_eval(opts.model)

	    exit_code = 0 if gold_gate and adv_gate else 1
	    sys.exit(exit_code)


	if __name__ == "__main__":
	    _main()