Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- src/ocr_bench/__init__.py +3 -0
- src/ocr_bench/backends.py +238 -0
- src/ocr_bench/cli.py +589 -0
- src/ocr_bench/dataset.py +297 -0
- src/ocr_bench/elo.py +309 -0
- src/ocr_bench/judge.py +287 -0
- src/ocr_bench/publish.py +262 -0
- src/ocr_bench/run.py +187 -0
- src/ocr_bench/space.py +18 -0
- src/ocr_bench/static/style.css +379 -0
- src/ocr_bench/templates/base.html +48 -0
- src/ocr_bench/templates/comparison_card.html +88 -0
- src/ocr_bench/templates/comparisons.html +40 -0
- src/ocr_bench/templates/leaderboard.html +43 -0
- src/ocr_bench/templates/stats_panel.html +10 -0
- src/ocr_bench/validate.py +311 -0
- src/ocr_bench/viewer.py +202 -0
- src/ocr_bench/web.py +487 -0
src/ocr_bench/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR model evaluation toolkit — VLM-as-judge with per-dataset leaderboards."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
src/ocr_bench/backends.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Judge backends — API-based (HF Inference Providers, OpenAI-compatible)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import abc
|
| 6 |
+
from collections import Counter
|
| 7 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import stamina
|
| 11 |
+
import structlog
|
| 12 |
+
from huggingface_hub import InferenceClient
|
| 13 |
+
from openai import OpenAI
|
| 14 |
+
|
| 15 |
+
from ocr_bench.judge import JUDGE_SCHEMA, Comparison, parse_judge_output
|
| 16 |
+
|
| 17 |
+
# Module-level structured logger for this backend module.
logger = structlog.get_logger()

# Retry on these exception types with exponential backoff + jitter.
# NOTE(review): retrying on bare Exception also retries non-transient
# programming errors (TypeError, KeyError, ...) — consider narrowing to
# transport/HTTP error types; confirm against the providers' SDK exceptions.
_RETRYABLE = (Exception,)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class JudgeBackend(abc.ABC):
    """Base class for judge backends.

    Subclasses implement :meth:`_call_single`; :meth:`judge` fans calls out
    over a thread pool when ``concurrency`` > 1.
    """

    # Human-readable backend identifier (set by subclasses).
    name: str
    # Max number of concurrent _call_single invocations in judge().
    concurrency: int = 1

    @abc.abstractmethod
    def _call_single(self, comp: Comparison) -> dict[str, str]:
        """Run the judge on a single comparison."""

    def judge(self, comparisons: list[Comparison]) -> list[dict[str, str]]:
        """Run the judge on a list of comparisons (concurrently if supported).

        Returns a list of parsed results (one per comparison).
        Each result is a dict with ``winner`` and ``reason`` keys,
        or an empty dict on failure.
        """
        if self.concurrency <= 1 or len(comparisons) <= 1:
            return [self._call_single(comp) for comp in comparisons]

        # Concurrent execution preserving input order.
        # BUG FIX: the original pre-filled with ``[{}] * len(comparisons)``,
        # which makes every slot alias the SAME dict object — any slot left
        # unassigned would share mutations with all the others. Build
        # independent dicts instead.
        results: list[dict[str, str]] = [{} for _ in comparisons]
        with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
            future_to_idx = {
                pool.submit(self._call_single, comp): i
                for i, comp in enumerate(comparisons)
            }
            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    results[idx] = future.result()
                except Exception as exc:
                    # A single failed call yields an empty dict rather than
                    # aborting the whole batch.
                    logger.warning("judge_call_failed", idx=idx, error=str(exc))
                    results[idx] = {}
        return results
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Default cap on judge-response length (tokens), shared by all backends.
DEFAULT_MAX_TOKENS = 1024
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class InferenceProviderJudge(JudgeBackend):
    """HF Inference Providers backend (Novita, Together, etc.).

    Args:
        model: Model repo id, e.g. ``"org/model"``.
        provider: Optional provider slug to pin (e.g. ``"novita"``); ``None``
            lets huggingface_hub pick one.
        max_tokens: Response token cap passed to the chat completion.
    """

    def __init__(
        self, model: str, provider: str | None = None, max_tokens: int = DEFAULT_MAX_TOKENS,
    ):
        # Display name: "provider:org/model" when pinned, else the model id.
        self.name = f"{provider + ':' if provider else ''}{model}"
        self.model = model
        self.max_tokens = max_tokens
        self.client = InferenceClient(model=model, provider=provider)  # type: ignore[invalid-argument-type]

    @stamina.retry(on=_RETRYABLE, attempts=6)
    def _call_single(self, comp: Comparison) -> dict[str, str]:
        """Judge one comparison; returns ``{}`` when the output cannot be parsed."""
        response = self.client.chat_completion(  # type: ignore[no-matching-overload]
            messages=comp.messages,
            max_tokens=self.max_tokens,
            temperature=0.0,
            response_format={"type": "json_object"},
            # Disable provider-side "thinking" so the response is pure JSON.
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        )
        # BUG FIX: message.content may be None (refusals, tool-call replies);
        # the original crashed with AttributeError on .strip() and then burned
        # all retry attempts on the same deterministic failure.
        raw = (response.choices[0].message.content or "").strip()
        result = parse_judge_output(raw)
        if not result:
            logger.warning("empty_parse", backend=self.name, sample=comp.sample_idx)
        return result
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class OpenAICompatibleJudge(JudgeBackend):
    """OpenAI-compatible endpoint (local vLLM server, Ollama, HF IE, etc.).

    Args:
        base_url: OpenAI-compatible API base URL (``.../v1``).
        model: Model name to request; ``"default"`` for single-model servers.
        max_tokens: Response token cap.
        api_key: API key; local servers usually ignore it.
        extra_body: Extra request body; defaults to vLLM guided decoding
            against :data:`JUDGE_SCHEMA`.
        temperature: Sampling temperature.
        concurrency: Max concurrent judge calls (see :class:`JudgeBackend`).
    """

    def __init__(
        self,
        base_url: str,
        model: str = "default",
        max_tokens: int = DEFAULT_MAX_TOKENS,
        api_key: str = "not-needed",
        extra_body: dict | None = None,
        temperature: float = 0.0,
        concurrency: int = 1,
    ):
        self.name = model if model != "default" else f"openai@{base_url}"
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # Default to constrained JSON decoding (vLLM "guided_json").
        self.extra_body = extra_body if extra_body is not None else {"guided_json": JUDGE_SCHEMA}
        self.concurrency = concurrency
        self.client = OpenAI(base_url=base_url, api_key=api_key)

    @stamina.retry(on=_RETRYABLE, attempts=3)
    def _call_single(self, comp: Comparison) -> dict[str, str]:
        """Judge one comparison; returns ``{}`` when the output cannot be parsed."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=comp.messages,  # type: ignore[invalid-argument-type]
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            extra_body=self.extra_body,
        )
        # BUG FIX: the OpenAI SDK types message.content as Optional[str];
        # the original crashed with AttributeError on .strip() when None.
        raw = (response.choices[0].message.content or "").strip()
        result = parse_judge_output(raw)
        if not result:
            logger.warning("empty_parse", backend=self.name, sample=comp.sample_idx)
        return result
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# Spec parsing
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
# Default judge spec (``provider:org/model`` form) used when no --model is given.
DEFAULT_JUDGE = "novita:moonshotai/Kimi-K2.5"
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def parse_judge_spec(
    spec: str, max_tokens: int = DEFAULT_MAX_TOKENS, concurrency: int = 1,
) -> JudgeBackend:
    """Parse a judge specification string into a backend.

    Formats:
    - ``"https://xxx.endpoints.huggingface.cloud"`` → :class:`OpenAICompatibleJudge`
      (HF Inference Endpoints, OpenAI-compatible with HF token auth)
    - ``"http://..."`` or ``"https://..."`` (other) → :class:`OpenAICompatibleJudge`
    - ``"provider:org/model"`` (colon before first ``/``) → :class:`InferenceProviderJudge`
    - anything else → :class:`InferenceProviderJudge` (no provider)
    """
    if spec.startswith(("http://", "https://")):
        # A "url/v1/:org/model" spec pins an explicit model name on the URL.
        if "/v1/:" in spec:
            url_part, model_name = spec.split("/v1/:", 1)
            url_part += "/v1"
        else:
            url_part, model_name = spec, "default"

        if ".endpoints.huggingface." in url_part:
            # HF Inference Endpoints — OpenAI-compatible, auth via HF token.
            from huggingface_hub import get_token

            base_url = url_part.rstrip("/")
            if not base_url.endswith("/v1"):
                base_url += "/v1"
            return OpenAICompatibleJudge(
                base_url=base_url,
                model=model_name,
                api_key=get_token() or "not-needed",
                max_tokens=max_tokens,
                temperature=0.7,
                extra_body={"chat_template_kwargs": {"enable_thinking": False}},
                concurrency=concurrency,
            )
        return OpenAICompatibleJudge(
            base_url=url_part, model=model_name, max_tokens=max_tokens,
            concurrency=concurrency,
        )

    colon_idx = spec.find(":")
    if colon_idx != -1:
        # provider:model — only when the colon precedes the first slash.
        slash_idx = spec.find("/")
        if slash_idx == -1 or colon_idx < slash_idx:
            provider, model = spec[:colon_idx], spec[colon_idx + 1 :]
            return InferenceProviderJudge(model=model, provider=provider, max_tokens=max_tokens)

    return InferenceProviderJudge(model=spec, max_tokens=max_tokens)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ---------------------------------------------------------------------------
|
| 189 |
+
# Jury aggregation
|
| 190 |
+
# ---------------------------------------------------------------------------
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def aggregate_jury_votes(
    all_results: list[list[dict[str, str]]],
    judge_names: list[str],
) -> list[dict[str, Any]]:
    """Aggregate votes from multiple judges using majority voting.

    Args:
        all_results: List of result lists, one per judge. Each inner list
            has one dict per comparison.
        judge_names: Names of the judges (same order as *all_results*).

    Returns:
        Aggregated results with ``winner``, ``reason``, and ``agreement`` fields.
    """
    if not all_results:
        return []

    aggregated: list[dict[str, Any]] = []
    for comp_idx in range(len(all_results[0])):
        votes: list[str] = []
        reasons: list[str] = []
        # Collect each judge's vote for this comparison, skipping judges
        # that produced no result (short list or empty dict).
        for judge_idx, judge_results in enumerate(all_results):
            entry = judge_results[comp_idx] if comp_idx < len(judge_results) else {}
            vote = entry.get("winner", "")
            if vote:
                votes.append(vote)
                reasons.append(f"{judge_names[judge_idx]}: {entry.get('reason', '')}")

        if not votes:
            aggregated.append({"winner": "tie", "reason": "no valid votes", "agreement": "0/0"})
            continue

        top_vote, top_count = Counter(votes).most_common(1)[0]
        aggregated.append({
            "winner": top_vote,
            "reason": "; ".join(reasons),
            "agreement": f"{top_count}/{len(votes)}",
        })

    return aggregated
|
src/ocr_bench/cli.py
ADDED
|
@@ -0,0 +1,589 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI entrypoint for ocr-bench."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
import structlog
|
| 9 |
+
from rich.console import Console
|
| 10 |
+
from rich.table import Table
|
| 11 |
+
|
| 12 |
+
from ocr_bench.backends import (
|
| 13 |
+
DEFAULT_JUDGE,
|
| 14 |
+
DEFAULT_MAX_TOKENS,
|
| 15 |
+
aggregate_jury_votes,
|
| 16 |
+
parse_judge_spec,
|
| 17 |
+
)
|
| 18 |
+
from ocr_bench.dataset import (
|
| 19 |
+
DatasetError,
|
| 20 |
+
discover_configs,
|
| 21 |
+
discover_pr_configs,
|
| 22 |
+
load_config_dataset,
|
| 23 |
+
load_flat_dataset,
|
| 24 |
+
)
|
| 25 |
+
from ocr_bench.elo import ComparisonResult, Leaderboard, compute_elo, rankings_resolved
|
| 26 |
+
from ocr_bench.judge import Comparison, _normalize_pair, build_comparisons, sample_indices
|
| 27 |
+
from ocr_bench.publish import (
|
| 28 |
+
EvalMetadata,
|
| 29 |
+
load_existing_comparisons,
|
| 30 |
+
load_existing_metadata,
|
| 31 |
+
publish_results,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# Module-level structured logger and Rich console used by all CLI commands.
logger = structlog.get_logger()
console = Console()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _add_judge_parser(sub) -> None:
    """Register the ``judge`` subcommand (pairwise VLM judging)."""
    judge = sub.add_parser("judge", help="Run pairwise VLM judge on OCR outputs")

    # Dataset
    judge.add_argument("dataset", help="HF dataset repo id")
    judge.add_argument("--split", default="train", help="Dataset split (default: train)")
    judge.add_argument("--columns", nargs="+", default=None, help="Explicit OCR column names")
    judge.add_argument(
        "--configs", nargs="+", default=None, help="Config-per-model: list of config names"
    )
    judge.add_argument("--from-prs", action="store_true", help="Force PR-based config discovery")
    judge.add_argument(
        "--merge",
        action="store_true",
        help="Merge PRs to main after discovery (default: load via revision)",
    )

    # Judge
    judge.add_argument(
        "--model",
        action="append",
        dest="models",
        help=f"Judge model spec (repeatable for jury). Default: {DEFAULT_JUDGE}",
    )

    # Eval
    judge.add_argument("--max-samples", type=int, default=None, help="Max samples to evaluate")
    judge.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)")
    judge.add_argument(
        "--max-tokens",
        type=int,
        default=DEFAULT_MAX_TOKENS,
        help=f"Max tokens for judge response (default: {DEFAULT_MAX_TOKENS})",
    )

    # Output
    judge.add_argument(
        "--save-results",
        default=None,
        help="HF repo id to publish results to (default: {dataset}-results)",
    )
    judge.add_argument(
        "--no-publish",
        action="store_true",
        help="Don't publish results (default: publish to {dataset}-results)",
    )
    judge.add_argument(
        "--full-rejudge",
        action="store_true",
        help="Re-judge all pairs, ignoring existing comparisons in --save-results repo",
    )
    judge.add_argument(
        "--no-adaptive",
        action="store_true",
        help="Disable adaptive stopping (default: adaptive is on)",
    )
    judge.add_argument(
        "--concurrency",
        type=int,
        default=1,
        help="Number of concurrent judge API calls (default: 1)",
    )


def _add_run_parser(sub) -> None:
    """Register the ``run`` subcommand (launch OCR models via HF Jobs)."""
    run = sub.add_parser("run", help="Launch OCR models on a dataset via HF Jobs")
    run.add_argument("input_dataset", help="HF dataset repo id with images")
    run.add_argument("output_repo", help="Output dataset repo (all models push here)")
    run.add_argument(
        "--models", nargs="+", default=None, help="Model slugs to run (default: all 4 core)"
    )
    run.add_argument("--max-samples", type=int, default=None, help="Per-model sample limit")
    run.add_argument("--split", default="train", help="Dataset split (default: train)")
    run.add_argument("--flavor", default=None, help="Override GPU flavor for all models")
    run.add_argument("--timeout", default="4h", help="Per-job timeout (default: 4h)")
    run.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)")
    run.add_argument("--shuffle", action="store_true", help="Shuffle source dataset")
    run.add_argument("--list-models", action="store_true", help="Print available models and exit")
    run.add_argument(
        "--dry-run", action="store_true", help="Show what would launch without launching"
    )
    run.add_argument(
        "--no-wait", action="store_true", help="Launch and exit without polling (default: wait)"
    )


def _add_view_parser(sub) -> None:
    """Register the ``view`` subcommand (local results browser)."""
    view = sub.add_parser("view", help="Browse and validate results in a web UI")
    view.add_argument("results", help="HF dataset repo id with published results")
    view.add_argument("--port", type=int, default=7860, help="Port (default: 7860)")
    view.add_argument("--host", default="127.0.0.1", help="Host (default: 127.0.0.1)")
    view.add_argument("--output", default=None, help="Path to save annotations JSON")


def build_parser() -> argparse.ArgumentParser:
    """Build the top-level CLI parser with judge/run/view subcommands.

    Decomposed into one private helper per subcommand so each option group
    can be read (and extended) in isolation; the parse result is unchanged.
    """
    parser = argparse.ArgumentParser(
        prog="ocr-bench",
        description="OCR model evaluation toolkit — VLM-as-judge with per-dataset leaderboards",
    )
    sub = parser.add_subparsers(dest="command")
    _add_judge_parser(sub)
    _add_run_parser(sub)
    _add_view_parser(sub)
    return parser
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def print_leaderboard(board: Leaderboard) -> None:
    """Print leaderboard as a Rich table."""
    table = Table(title="OCR Model Leaderboard")
    table.add_column("Rank", style="bold")
    table.add_column("Model")
    # The ELO column header advertises confidence intervals only when we have them.
    has_ci = bool(board.elo_ci)
    table.add_column("ELO (95% CI)" if has_ci else "ELO", justify="right")
    for header in ("Wins", "Losses", "Ties", "Win%"):
        table.add_column(header, justify="right")

    for rank, (model, elo) in enumerate(board.ranked, 1):
        pct = board.win_pct(model)
        pct_str = "-" if pct is None else f"{pct:.0f}%"
        ci = board.elo_ci.get(model) if has_ci else None
        if ci is None:
            elo_str = str(round(elo))
        else:
            lo, hi = ci
            elo_str = f"{round(elo)} ({round(lo)}\u2013{round(hi)})"
        table.add_row(
            str(rank),
            model,
            elo_str,
            str(board.wins[model]),
            str(board.losses[model]),
            str(board.ties[model]),
            pct_str,
        )

    console.print(table)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _convert_results(
    comparisons: list[Comparison], aggregated: list[dict]
) -> list[ComparisonResult]:
    """Convert judged comparisons + aggregated outputs into ComparisonResult list."""
    # Comparisons whose judge output is empty (failed/unparseable) are dropped.
    return [
        ComparisonResult(
            sample_idx=comp.sample_idx,
            model_a=comp.model_a,
            model_b=comp.model_b,
            winner=outcome.get("winner", "tie"),
            reason=outcome.get("reason", ""),
            agreement=outcome.get("agreement", "1/1"),
            swapped=comp.swapped,
            text_a=comp.text_a,
            text_b=comp.text_b,
            col_a=comp.col_a,
            col_b=comp.col_b,
        )
        for comp, outcome in zip(comparisons, aggregated)
        if outcome
    ]
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _resolve_results_repo(dataset: str, save_results: str | None, no_publish: bool) -> str | None:
|
| 201 |
+
"""Derive the results repo id. Returns None if publishing is disabled."""
|
| 202 |
+
if no_publish:
|
| 203 |
+
return None
|
| 204 |
+
if save_results:
|
| 205 |
+
return save_results
|
| 206 |
+
return f"{dataset}-results"
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def cmd_judge(args: argparse.Namespace) -> None:
|
| 210 |
+
"""Orchestrate: load → compare → judge → elo → print → publish."""
|
| 211 |
+
# --- Resolve flags ---
|
| 212 |
+
adaptive = not args.no_adaptive
|
| 213 |
+
merge = args.merge
|
| 214 |
+
results_repo = _resolve_results_repo(args.dataset, args.save_results, args.no_publish)
|
| 215 |
+
from_prs = False # track for metadata
|
| 216 |
+
|
| 217 |
+
if results_repo:
|
| 218 |
+
console.print(f"Results will be published to [bold]{results_repo}[/bold]")
|
| 219 |
+
|
| 220 |
+
# --- Load dataset (cascading auto-detection) ---
|
| 221 |
+
if args.configs:
|
| 222 |
+
# Explicit configs — use them directly
|
| 223 |
+
config_names = args.configs
|
| 224 |
+
ds, ocr_columns = load_config_dataset(args.dataset, config_names, split=args.split)
|
| 225 |
+
elif args.columns:
|
| 226 |
+
# Explicit columns — flat loading
|
| 227 |
+
ds, ocr_columns = load_flat_dataset(args.dataset, split=args.split, columns=args.columns)
|
| 228 |
+
elif args.from_prs:
|
| 229 |
+
# Forced PR discovery
|
| 230 |
+
config_names, pr_revisions = discover_pr_configs(args.dataset, merge=merge)
|
| 231 |
+
if not config_names:
|
| 232 |
+
raise DatasetError("No configs found in open PRs")
|
| 233 |
+
from_prs = True
|
| 234 |
+
console.print(f"Discovered {len(config_names)} configs from PRs: {config_names}")
|
| 235 |
+
ds, ocr_columns = load_config_dataset(
|
| 236 |
+
args.dataset,
|
| 237 |
+
config_names,
|
| 238 |
+
split=args.split,
|
| 239 |
+
pr_revisions=pr_revisions if not merge else None,
|
| 240 |
+
)
|
| 241 |
+
else:
|
| 242 |
+
# Auto-detect: PRs + main branch configs combined, fall back to flat
|
| 243 |
+
pr_configs, pr_revisions = discover_pr_configs(args.dataset, merge=merge)
|
| 244 |
+
main_configs = discover_configs(args.dataset)
|
| 245 |
+
|
| 246 |
+
# Combine: PR configs + main configs not already in PRs
|
| 247 |
+
config_names = list(pr_configs)
|
| 248 |
+
for mc in main_configs:
|
| 249 |
+
if mc not in pr_configs:
|
| 250 |
+
config_names.append(mc)
|
| 251 |
+
|
| 252 |
+
if config_names:
|
| 253 |
+
if pr_configs:
|
| 254 |
+
from_prs = True
|
| 255 |
+
console.print(f"Auto-detected {len(pr_configs)} configs from PRs: {pr_configs}")
|
| 256 |
+
if main_configs:
|
| 257 |
+
main_only = [c for c in main_configs if c not in pr_configs]
|
| 258 |
+
if main_only:
|
| 259 |
+
console.print(f"Auto-detected {len(main_only)} configs on main: {main_only}")
|
| 260 |
+
ds, ocr_columns = load_config_dataset(
|
| 261 |
+
args.dataset,
|
| 262 |
+
config_names,
|
| 263 |
+
split=args.split,
|
| 264 |
+
pr_revisions=pr_revisions if pr_configs else None,
|
| 265 |
+
)
|
| 266 |
+
else:
|
| 267 |
+
# No configs anywhere — fall back to flat loading
|
| 268 |
+
ds, ocr_columns = load_flat_dataset(args.dataset, split=args.split)
|
| 269 |
+
|
| 270 |
+
console.print(f"Loaded {len(ds)} samples with {len(ocr_columns)} models:")
|
| 271 |
+
for col, model in ocr_columns.items():
|
| 272 |
+
console.print(f" {col} → {model}")
|
| 273 |
+
|
| 274 |
+
# --- Incremental: load existing comparisons ---
|
| 275 |
+
existing_results: list[ComparisonResult] = []
|
| 276 |
+
existing_meta_rows: list[dict] = []
|
| 277 |
+
skip_pairs: set[tuple[str, str]] | None = None
|
| 278 |
+
|
| 279 |
+
if results_repo and not args.full_rejudge:
|
| 280 |
+
existing_results = load_existing_comparisons(results_repo)
|
| 281 |
+
if existing_results:
|
| 282 |
+
judged_pairs = {_normalize_pair(r.model_a, r.model_b) for r in existing_results}
|
| 283 |
+
skip_pairs = judged_pairs
|
| 284 |
+
console.print(
|
| 285 |
+
f"\nIncremental mode: {len(existing_results)} existing comparisons "
|
| 286 |
+
f"across {len(judged_pairs)} model pairs — skipping those."
|
| 287 |
+
)
|
| 288 |
+
existing_meta_rows = load_existing_metadata(results_repo)
|
| 289 |
+
else:
|
| 290 |
+
console.print("\nNo existing comparisons found — full judge run.")
|
| 291 |
+
|
| 292 |
+
model_names = list(set(ocr_columns.values()))
|
| 293 |
+
|
| 294 |
+
# --- Judge setup (shared by both paths) ---
|
| 295 |
+
model_specs = args.models or [DEFAULT_JUDGE]
|
| 296 |
+
judges = [
|
| 297 |
+
parse_judge_spec(spec, max_tokens=args.max_tokens, concurrency=args.concurrency)
|
| 298 |
+
for spec in model_specs
|
| 299 |
+
]
|
| 300 |
+
is_jury = len(judges) > 1
|
| 301 |
+
|
| 302 |
+
def _judge_batch(batch_comps: list[Comparison]) -> list[ComparisonResult]:
|
| 303 |
+
"""Run judge(s) on a batch of comparisons and return ComparisonResults."""
|
| 304 |
+
all_judge_outputs: list[list[dict]] = []
|
| 305 |
+
for judge in judges:
|
| 306 |
+
results = judge.judge(batch_comps)
|
| 307 |
+
all_judge_outputs.append(results)
|
| 308 |
+
if is_jury:
|
| 309 |
+
judge_names = [j.name for j in judges]
|
| 310 |
+
aggregated = aggregate_jury_votes(all_judge_outputs, judge_names)
|
| 311 |
+
else:
|
| 312 |
+
aggregated = all_judge_outputs[0]
|
| 313 |
+
return _convert_results(batch_comps, aggregated)
|
| 314 |
+
|
| 315 |
+
if adaptive:
|
| 316 |
+
# --- Adaptive stopping: batch-by-batch with convergence check ---
|
| 317 |
+
from itertools import combinations as _combs
|
| 318 |
+
|
| 319 |
+
all_indices = sample_indices(len(ds), args.max_samples, args.seed)
|
| 320 |
+
n_pairs = len(list(_combs(model_names, 2)))
|
| 321 |
+
batch_samples = 5
|
| 322 |
+
min_before_check = max(3 * n_pairs, 20)
|
| 323 |
+
|
| 324 |
+
if is_jury:
|
| 325 |
+
console.print(f"\nJury mode: {len(judges)} judges")
|
| 326 |
+
console.print(
|
| 327 |
+
f"\n[bold]Adaptive mode[/bold]: {len(all_indices)} samples, "
|
| 328 |
+
f"{n_pairs} pairs, batch size {batch_samples}, "
|
| 329 |
+
f"checking after {min_before_check} comparisons"
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
new_results: list[ComparisonResult] = []
|
| 333 |
+
total_comparisons = 0
|
| 334 |
+
for batch_num, batch_start in enumerate(range(0, len(all_indices), batch_samples)):
|
| 335 |
+
batch_indices = all_indices[batch_start : batch_start + batch_samples]
|
| 336 |
+
batch_comps = build_comparisons(
|
| 337 |
+
ds,
|
| 338 |
+
ocr_columns,
|
| 339 |
+
skip_pairs=skip_pairs,
|
| 340 |
+
indices=batch_indices,
|
| 341 |
+
seed=args.seed,
|
| 342 |
+
)
|
| 343 |
+
if not batch_comps:
|
| 344 |
+
continue
|
| 345 |
+
|
| 346 |
+
batch_results = _judge_batch(batch_comps)
|
| 347 |
+
new_results.extend(batch_results)
|
| 348 |
+
total_comparisons += len(batch_comps)
|
| 349 |
+
# batch_comps goes out of scope → GC can free images
|
| 350 |
+
|
| 351 |
+
total = len(existing_results) + len(new_results)
|
| 352 |
+
console.print(f" Batch {batch_num + 1}: {len(batch_results)} new, {total} total")
|
| 353 |
+
|
| 354 |
+
if total >= min_before_check:
|
| 355 |
+
board = compute_elo(existing_results + new_results, model_names)
|
| 356 |
+
# Show CI gaps for each adjacent pair
|
| 357 |
+
ranked = board.ranked
|
| 358 |
+
if board.elo_ci:
|
| 359 |
+
gaps: list[str] = []
|
| 360 |
+
for i in range(len(ranked) - 1):
|
| 361 |
+
hi_model, _ = ranked[i]
|
| 362 |
+
lo_model, _ = ranked[i + 1]
|
| 363 |
+
hi_ci = board.elo_ci.get(hi_model)
|
| 364 |
+
lo_ci = board.elo_ci.get(lo_model)
|
| 365 |
+
if hi_ci and lo_ci:
|
| 366 |
+
gap = hi_ci[0] - lo_ci[1] # positive = resolved
|
| 367 |
+
if gap > 0:
|
| 368 |
+
status = "[green]ok[/green]"
|
| 369 |
+
else:
|
| 370 |
+
status = f"[yellow]overlap {-gap:.0f}[/yellow]"
|
| 371 |
+
gaps.append(f" {hi_model} vs {lo_model}: gap={gap:+.0f} {status}")
|
| 372 |
+
if gaps:
|
| 373 |
+
console.print(" CI gaps:")
|
| 374 |
+
for g in gaps:
|
| 375 |
+
console.print(g)
|
| 376 |
+
|
| 377 |
+
if rankings_resolved(board):
|
| 378 |
+
remaining = len(all_indices) - batch_start - len(batch_indices)
|
| 379 |
+
console.print(
|
| 380 |
+
f"[green]Rankings converged after {total} comparisons! "
|
| 381 |
+
f"Skipped ~{remaining * n_pairs} remaining.[/green]"
|
| 382 |
+
)
|
| 383 |
+
break
|
| 384 |
+
|
| 385 |
+
console.print(f"\n{len(new_results)}/{total_comparisons} valid comparisons")
|
| 386 |
+
else:
|
| 387 |
+
# --- Standard single-pass flow ---
|
| 388 |
+
comparisons = build_comparisons(
|
| 389 |
+
ds,
|
| 390 |
+
ocr_columns,
|
| 391 |
+
max_samples=args.max_samples,
|
| 392 |
+
seed=args.seed,
|
| 393 |
+
skip_pairs=skip_pairs,
|
| 394 |
+
)
|
| 395 |
+
console.print(f"\nBuilt {len(comparisons)} new pairwise comparisons")
|
| 396 |
+
|
| 397 |
+
if not comparisons and not existing_results:
|
| 398 |
+
console.print(
|
| 399 |
+
"[yellow]No valid comparisons — check that OCR columns have text.[/yellow]"
|
| 400 |
+
)
|
| 401 |
+
return
|
| 402 |
+
|
| 403 |
+
if not comparisons:
|
| 404 |
+
console.print("[green]All pairs already judged — refitting leaderboard.[/green]")
|
| 405 |
+
board = compute_elo(existing_results, model_names)
|
| 406 |
+
console.print()
|
| 407 |
+
print_leaderboard(board)
|
| 408 |
+
if results_repo:
|
| 409 |
+
metadata = EvalMetadata(
|
| 410 |
+
source_dataset=args.dataset,
|
| 411 |
+
judge_models=[],
|
| 412 |
+
seed=args.seed,
|
| 413 |
+
max_samples=args.max_samples or len(ds),
|
| 414 |
+
total_comparisons=0,
|
| 415 |
+
valid_comparisons=0,
|
| 416 |
+
from_prs=from_prs,
|
| 417 |
+
)
|
| 418 |
+
publish_results(
|
| 419 |
+
results_repo,
|
| 420 |
+
board,
|
| 421 |
+
metadata,
|
| 422 |
+
existing_metadata=existing_meta_rows,
|
| 423 |
+
)
|
| 424 |
+
console.print(f"\nResults published to [bold]{results_repo}[/bold]")
|
| 425 |
+
return
|
| 426 |
+
|
| 427 |
+
if is_jury:
|
| 428 |
+
console.print(f"\nJury mode: {len(judges)} judges")
|
| 429 |
+
|
| 430 |
+
for judge in judges:
|
| 431 |
+
console.print(f"\nRunning judge: {judge.name}")
|
| 432 |
+
|
| 433 |
+
new_results = _judge_batch(comparisons)
|
| 434 |
+
total_comparisons = len(comparisons)
|
| 435 |
+
console.print(f"\n{len(new_results)}/{total_comparisons} valid comparisons")
|
| 436 |
+
|
| 437 |
+
# --- Merge existing + new, compute ELO ---
|
| 438 |
+
all_results = existing_results + new_results
|
| 439 |
+
board = compute_elo(all_results, model_names)
|
| 440 |
+
console.print()
|
| 441 |
+
print_leaderboard(board)
|
| 442 |
+
|
| 443 |
+
# --- Publish ---
|
| 444 |
+
if results_repo:
|
| 445 |
+
metadata = EvalMetadata(
|
| 446 |
+
source_dataset=args.dataset,
|
| 447 |
+
judge_models=[j.name for j in judges],
|
| 448 |
+
seed=args.seed,
|
| 449 |
+
max_samples=args.max_samples or len(ds),
|
| 450 |
+
total_comparisons=total_comparisons,
|
| 451 |
+
valid_comparisons=len(new_results),
|
| 452 |
+
from_prs=from_prs,
|
| 453 |
+
)
|
| 454 |
+
publish_results(results_repo, board, metadata, existing_metadata=existing_meta_rows)
|
| 455 |
+
console.print(f"\nResults published to [bold]{results_repo}[/bold]")
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def cmd_run(args: argparse.Namespace) -> None:
    """Launch OCR models on a dataset via HF Jobs.

    Three modes, selected by flags on ``args``:
      * ``--list-models``: print the model registry table and return.
      * ``--dry-run``: print the per-model job configuration without launching.
      * default: launch one HF Job per selected model and (unless
        ``--no-wait``) block until all jobs finish.
    """
    # Lazy import: the job-launching machinery is only needed for this subcommand.
    from ocr_bench.run import (
        DEFAULT_MODELS,
        MODEL_REGISTRY,
        build_script_args,
        launch_ocr_jobs,
        poll_jobs,
    )

    # --list-models: informational table, then exit early.
    if args.list_models:
        table = Table(title="Available OCR Models", show_lines=True)
        table.add_column("Slug", style="cyan bold")
        table.add_column("Model ID")
        table.add_column("Size", justify="right")
        table.add_column("Default GPU", justify="center")

        for slug in sorted(MODEL_REGISTRY):
            cfg = MODEL_REGISTRY[slug]
            # Tag registry entries that belong to the default model set.
            default = " (default)" if slug in DEFAULT_MODELS else ""
            table.add_row(slug + default, cfg.model_id, cfg.size, cfg.default_flavor)

        console.print(table)
        console.print(f"\nDefault set: {', '.join(DEFAULT_MODELS)}")
        return

    # Validate the requested model slugs before doing anything expensive.
    selected = args.models or DEFAULT_MODELS
    for slug in selected:
        if slug not in MODEL_REGISTRY:
            console.print(f"[red]Unknown model: {slug}[/red]")
            console.print(f"Available: {', '.join(MODEL_REGISTRY.keys())}")
            sys.exit(1)

    # Summary banner of the run configuration.
    console.print("\n[bold]OCR Benchmark Run[/bold]")
    console.print(f" Source: {args.input_dataset}")
    console.print(f" Output: {args.output_repo}")
    console.print(f" Models: {', '.join(selected)}")
    if args.max_samples:
        console.print(f" Samples: {args.max_samples} per model")
    console.print()

    # Dry run: show exactly what each job would execute, launch nothing.
    if args.dry_run:
        console.print("[bold yellow]DRY RUN[/bold yellow] — no jobs will be launched\n")
        for slug in selected:
            cfg = MODEL_REGISTRY[slug]
            # A CLI --flavor overrides the registry's per-model default GPU flavor.
            flavor = args.flavor or cfg.default_flavor
            script_args = build_script_args(
                args.input_dataset,
                args.output_repo,
                slug,
                max_samples=args.max_samples,
                shuffle=args.shuffle,
                seed=args.seed,
                extra_args=cfg.default_args or None,
            )
            console.print(f"[cyan]{slug}[/cyan] ({cfg.model_id})")
            console.print(f" Flavor: {flavor}")
            console.print(f" Timeout: {args.timeout}")
            console.print(f" Script: {cfg.script}")
            console.print(f" Args: {' '.join(script_args)}")
            console.print()
        console.print("Remove --dry-run to launch these jobs.")
        return

    # Launch one HF Job per selected model.
    jobs = launch_ocr_jobs(
        args.input_dataset,
        args.output_repo,
        models=selected,
        max_samples=args.max_samples,
        split=args.split,
        shuffle=args.shuffle,
        seed=args.seed,
        flavor_override=args.flavor,
        timeout=args.timeout,
    )

    console.print(f"\n[green]{len(jobs)} jobs launched.[/green]")
    for job in jobs:
        console.print(f" [cyan]{job.model_slug}[/cyan]: {job.job_url}")

    if not args.no_wait:
        # Block until every job reaches a terminal state, then point the user
        # at the follow-up judge command.
        console.print("\n[bold]Waiting for jobs to complete...[/bold]")
        poll_jobs(jobs)
        console.print("\n[bold green]All jobs finished![/bold green]")
        console.print("\nEvaluate:")
        console.print(f" ocr-bench judge {args.output_repo}")
    else:
        # Fire-and-forget: jobs keep running on the Hub.
        console.print("\nJobs running in background.")
        console.print("Check status at: https://huggingface.co/settings/jobs")
        console.print(f"When complete: ocr-bench judge {args.output_repo}")
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def cmd_view(args: argparse.Namespace) -> None:
    """Launch the FastAPI + HTMX results viewer.

    Exits with status 1 when the optional viewer dependencies
    (FastAPI/uvicorn) are not installed.
    """
    try:
        # Optional extras: imported lazily so the core CLI works without them.
        import uvicorn

        from ocr_bench.web import create_app
    except ImportError:
        console.print(
            "[red]Error:[/red] FastAPI/uvicorn not installed. "
            "Install the viewer extra: [bold]pip install ocr-bench\\[viewer][/bold]"
        )
        sys.exit(1)

    console.print(f"Loading results from [bold]{args.results}[/bold]...")
    app = create_app(args.results, output_path=args.output)
    console.print(f"Starting viewer at [bold]http://{args.host}:{args.port}[/bold]")
    # Blocking call — serves until interrupted.
    uvicorn.run(app, host=args.host, port=args.port)
|
| 570 |
+
|
| 571 |
+
|
| 572 |
+
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the subcommand handler.

    Dataset-level failures surface as a friendly error message and exit
    status 1 rather than a traceback.
    """
    parser = build_parser()
    args = parser.parse_args()

    # No subcommand given: show usage and exit cleanly.
    if args.command is None:
        parser.print_help()
        sys.exit(0)

    # Dispatch table keeps the mapping between subcommand name and handler flat.
    dispatch = {
        "judge": cmd_judge,
        "run": cmd_run,
        "view": cmd_view,
    }
    try:
        handler = dispatch.get(args.command)
        if handler is not None:
            handler(args)
    except DatasetError as exc:
        console.print(f"[red]Error:[/red] {exc}")
        sys.exit(1)
|
src/ocr_bench/dataset.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dataset loading — flat, config-per-model, PR-based. OCR column discovery."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
from datasets import Dataset, get_dataset_config_names, load_dataset
|
| 9 |
+
from huggingface_hub import HfApi
|
| 10 |
+
|
| 11 |
+
logger = structlog.get_logger()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DatasetError(Exception):
    """Signals a failure while loading a dataset or discovering its OCR columns."""
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
# OCR column discovery
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def discover_ocr_columns(dataset: Dataset) -> dict[str, str]:
    """Discover OCR output columns and their model names from a dataset.

    Strategy:
    1. Parse ``inference_info`` JSON from the first row (list or single entry).
    2. Fallback: heuristic column-name matching (``markdown``, ``ocr``, ``text``).
    3. Disambiguate duplicate model names by appending the column name.

    Returns:
        Mapping of ``column_name → model_name``.

    Raises:
        DatasetError: If no OCR columns can be found.
    """
    discovered: dict[str, str] = {}

    try:
        if "inference_info" not in dataset.column_names:
            # Jump straight to the heuristic fallback via the except clause.
            raise KeyError("no inference_info column")
        raw = dataset["inference_info"][0]  # column access avoids image decode
        if raw:
            entries = json.loads(raw)
            if not isinstance(entries, list):
                entries = [entries]
            for entry in entries:
                col = entry.get("column_name", "")
                model = entry.get("model_id", entry.get("model_name", "unknown"))
                if col and col in dataset.column_names:
                    discovered[col] = model
    except (json.JSONDecodeError, TypeError, KeyError) as exc:
        logger.warning("could_not_parse_inference_info", error=str(exc))

    # Heuristic fallback when inference_info yielded nothing usable.
    if not discovered:
        discovered = {
            col: col
            for col in dataset.column_names
            if "markdown" in col.lower() or "ocr" in col.lower() or col == "text"
        }

    if not discovered:
        raise DatasetError(f"No OCR columns found. Available columns: {dataset.column_names}")

    # Count model-name occurrences so duplicates can be told apart.
    occurrences: dict[str, int] = {}
    for model in discovered.values():
        occurrences[model] = occurrences.get(model, 0) + 1

    resolved: dict[str, str] = {}
    for col, model in discovered.items():
        if occurrences[model] > 1:
            # Same model produced multiple columns: suffix with the column name.
            short = model.rsplit("/", 1)[-1]
            resolved[col] = f"{short} ({col})"
        else:
            resolved[col] = model

    return resolved
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
# PR-based config discovery
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def discover_pr_configs(
    repo_id: str,
    merge: bool = False,
    api: HfApi | None = None,
) -> tuple[list[str], dict[str, str]]:
    """Find dataset configs advertised by open PRs on a Hub dataset repo.

    Only PRs whose title ends with ``[config_name]`` are considered.

    Args:
        repo_id: HF dataset repo id.
        merge: If True, merge each discovered PR before loading.
        api: Optional pre-configured HfApi instance.

    Returns:
        Tuple of (config_names, {config_name: pr_revision}).
    """
    client = api if api is not None else HfApi()

    found: list[str] = []
    pr_refs: dict[str, str] = {}

    for disc in client.get_repo_discussions(repo_id, repo_type="dataset"):
        # Only open pull requests qualify.
        if not (disc.is_pull_request and disc.status == "open"):
            continue
        title = disc.title
        # Title must carry a trailing "[config]" marker.
        if not ("[" in title and title.endswith("]")):
            continue
        config = title[title.rindex("[") + 1 : -1].strip()
        if not config:
            continue
        if merge:
            client.merge_pull_request(repo_id, disc.num, repo_type="dataset")
            logger.info("merged_pr", pr=disc.num, config=config)
        else:
            pr_refs[config] = f"refs/pr/{disc.num}"
        found.append(config)

    return found, pr_refs
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def discover_configs(repo_id: str) -> list[str]:
    """List non-default configs from the main branch of a Hub dataset.

    Returns:
        Config names excluding "default"; an empty list when the repo has no
        loadable configs (best-effort: any loading error is swallowed).
    """
    try:
        names = get_dataset_config_names(repo_id)
    except Exception as exc:  # best-effort: treat any failure as "no configs"
        logger.info("no_configs_on_main", repo=repo_id, reason=str(exc))
        return []
    return [name for name in names if name != "default"]
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
# Config-per-model loading
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def load_config_dataset(
    repo_id: str,
    config_names: list[str],
    split: str = "train",
    pr_revisions: dict[str, str] | None = None,
) -> tuple[Dataset, dict[str, str]]:
    """Load multiple configs from a Hub dataset and merge into one.

    Each config becomes a column whose name is the config name and whose value
    is the OCR text (from the first column matching heuristics, or ``markdown``).

    Args:
        repo_id: HF dataset repo id.
        config_names: List of config names to load.
        split: Dataset split to load.
        pr_revisions: Optional mapping of config_name → revision for PR-based loading.

    Returns:
        Tuple of (unified Dataset, {column_name: model_id}).

    Raises:
        DatasetError: if ``config_names`` is empty or no config yields a
            usable text column.
    """
    if not config_names:
        raise DatasetError("No config names provided")

    pr_revisions = pr_revisions or {}
    unified: Dataset | None = None
    ocr_columns: dict[str, str] = {}

    # NOTE(review): merging assumes all configs share the same row order and
    # source samples — confirm the upload pipeline guarantees this.
    for config in config_names:
        revision = pr_revisions.get(config)
        kwargs: dict = {"path": repo_id, "name": config, "split": split}
        if revision:
            # Load the config straight off its PR branch (refs/pr/N).
            kwargs["revision"] = revision

        ds = load_dataset(**kwargs)

        # Find the OCR text column in this config
        text_col = _find_text_column(ds)
        if text_col is None:
            # Config contributes nothing; skip it rather than failing the run.
            logger.warning("no_text_column_in_config", config=config)
            continue

        # Extract model_id from inference_info if available
        model_id = _extract_model_id(ds, config)
        ocr_columns[config] = model_id

        # Build unified dataset using Arrow-level ops (no per-row image decode)
        text_values = ds[text_col]  # column access — no image decoding
        if unified is None:
            # First config: keep all columns except text_col, add text as config name
            drop = [text_col] if text_col != config else []
            unified = ds.remove_columns(drop) if drop else ds
            if config != text_col:
                unified = unified.add_column(config, text_values)
            # Also rename text_col to config if they differ and text_col was kept
        else:
            if len(ds) != len(unified):
                logger.warning(
                    "config_length_mismatch",
                    config=config,
                    expected=len(unified),
                    got=len(ds),
                )
                # NOTE(review): only LONGER configs are handled here by
                # truncation; a config SHORTER than `unified` would still fail
                # inside add_column — confirm upstream guarantees equal lengths.
                text_values = text_values[: len(unified)]
            unified = unified.add_column(config, text_values)

    if unified is None:
        raise DatasetError("No configs loaded successfully")

    return unified, ocr_columns
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _extract_model_id(ds: Dataset, config: str) -> str:
|
| 218 |
+
"""Extract model_id from inference_info in first row, falling back to config name."""
|
| 219 |
+
if "inference_info" not in ds.column_names:
|
| 220 |
+
return config
|
| 221 |
+
try:
|
| 222 |
+
info_raw = ds["inference_info"][0] # column access avoids image decode
|
| 223 |
+
if info_raw:
|
| 224 |
+
info = json.loads(info_raw)
|
| 225 |
+
if isinstance(info, list):
|
| 226 |
+
info = info[0]
|
| 227 |
+
return info.get("model_id", info.get("model_name", config))
|
| 228 |
+
except (json.JSONDecodeError, TypeError, KeyError, IndexError):
|
| 229 |
+
pass
|
| 230 |
+
return config
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _find_text_column(ds: Dataset) -> str | None:
|
| 234 |
+
"""Find the likely OCR text column in a dataset.
|
| 235 |
+
|
| 236 |
+
Priority:
|
| 237 |
+
1. ``inference_info[0]["column_name"]`` if present and exists in dataset.
|
| 238 |
+
2. First column matching ``markdown`` (case-insensitive).
|
| 239 |
+
3. First column matching ``ocr`` (case-insensitive).
|
| 240 |
+
4. Column named exactly ``text``.
|
| 241 |
+
"""
|
| 242 |
+
# Try inference_info first (column access avoids image decoding)
|
| 243 |
+
if "inference_info" in ds.column_names:
|
| 244 |
+
try:
|
| 245 |
+
info_raw = ds["inference_info"][0]
|
| 246 |
+
if info_raw:
|
| 247 |
+
info = json.loads(info_raw)
|
| 248 |
+
if isinstance(info, list):
|
| 249 |
+
info = info[0]
|
| 250 |
+
col_name = info.get("column_name", "")
|
| 251 |
+
if col_name and col_name in ds.column_names:
|
| 252 |
+
return col_name
|
| 253 |
+
except (json.JSONDecodeError, TypeError, KeyError, IndexError):
|
| 254 |
+
pass
|
| 255 |
+
|
| 256 |
+
# Prioritized heuristic: markdown > ocr > text
|
| 257 |
+
for pattern in ["markdown", "ocr"]:
|
| 258 |
+
for col in ds.column_names:
|
| 259 |
+
if pattern in col.lower():
|
| 260 |
+
return col
|
| 261 |
+
if "text" in ds.column_names:
|
| 262 |
+
return "text"
|
| 263 |
+
return None
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# ---------------------------------------------------------------------------
|
| 267 |
+
# Flat dataset loading
|
| 268 |
+
# ---------------------------------------------------------------------------
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def load_flat_dataset(
    repo_id: str,
    split: str = "train",
    columns: list[str] | None = None,
) -> tuple[Dataset, dict[str, str]]:
    """Load a flat dataset from the Hub and resolve its OCR columns.

    Args:
        repo_id: HF dataset repo id.
        split: Dataset split.
        columns: If given, use these as OCR columns (maps col→col).

    Returns:
        Tuple of (Dataset, {column_name: model_name}).

    Raises:
        DatasetError: when an explicitly requested column is missing.
    """
    ds = load_dataset(repo_id, split=split)

    # No explicit selection: auto-discover from inference_info / heuristics.
    if not columns:
        return ds, discover_ocr_columns(ds)

    # Explicit selection: every requested column must exist.
    for col in columns:
        if col not in ds.column_names:
            raise DatasetError(f"Column '{col}' not found. Available: {ds.column_names}")
    return ds, {col: col for col in columns}
|
src/ocr_bench/elo.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bradley-Terry MLE rating computation for pairwise comparisons."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
import random
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from typing import Literal
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
from scipy.optimize import minimize
|
| 13 |
+
|
| 14 |
+
INITIAL_ELO: float = 1500.0
|
| 15 |
+
|
| 16 |
+
Winner = Literal["A", "B", "tie"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class ComparisonResult:
    """Result of a single pairwise comparison, ready for ELO computation."""

    # Index of the source sample the two OCR outputs came from.
    sample_idx: int
    # Model names in their original (pre-swap) positions; ELO tallying credits
    # these after unswapping the winner (see _build_win_matrix).
    model_a: str
    model_b: str
    # Judge verdict relative to the PRESENTED order; when ``swapped`` is True
    # it must be flipped back before tallying (see _unswap_winner).
    winner: Winner
    # Free-text judge rationale.
    reason: str = ""
    # Agreement string, default "1/1" — presumably "votes/judges" in jury
    # mode (e.g. "2/3"); verify against the jury aggregation code.
    agreement: str = "1/1"
    # Whether the A/B presentation order was randomized (swapped) for the judge.
    swapped: bool = False
    # OCR output texts as shown to the judge.
    text_a: str = ""
    text_b: str = ""
    # Source dataset columns the two texts were read from.
    col_a: str = ""
    col_b: str = ""
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
class Leaderboard:
    """ELO leaderboard computed from pairwise comparison results."""

    # model name → ELO rating
    elo: dict[str, float] = field(default_factory=dict)
    # per-model win / loss / tie tallies
    wins: dict[str, int] = field(default_factory=dict)
    losses: dict[str, int] = field(default_factory=dict)
    ties: dict[str, int] = field(default_factory=dict)
    # raw per-comparison records for auditing/publishing
    comparison_log: list[dict[str, object]] = field(default_factory=list)
    # model name → (lower, upper) bootstrap confidence interval
    elo_ci: dict[str, tuple[float, float]] = field(default_factory=dict)

    @property
    def ranked(self) -> list[tuple[str, float]]:
        """Models sorted by ELO rating, descending."""
        return sorted(self.elo.items(), key=lambda x: x[1], reverse=True)

    def win_pct(self, model: str) -> float | None:
        """Win percentage for a model, or None if no comparisons.

        Fix: previously raised KeyError for a model absent from the tally
        dicts (they default to empty); missing entries now count as zero,
        so an unknown model returns None as documented.
        """
        total = (
            self.wins.get(model, 0)
            + self.losses.get(model, 0)
            + self.ties.get(model, 0)
        )
        if total == 0:
            return None
        return self.wins.get(model, 0) / total * 100
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _unswap_winner(winner: Winner, swapped: bool) -> Winner:
|
| 61 |
+
"""Unswap winner if positions were randomized."""
|
| 62 |
+
if swapped:
|
| 63 |
+
if winner == "A":
|
| 64 |
+
return "B"
|
| 65 |
+
elif winner == "B":
|
| 66 |
+
return "A"
|
| 67 |
+
return winner
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _build_win_matrix(
    results: list[ComparisonResult],
) -> tuple[dict[tuple[str, str], float], set[str]]:
    """Tally fractional wins per ordered model pair.

    A tie contributes 0.5 to each direction. Returns (win_counts, models_seen)
    where win_counts[(i, j)] is the fractional win count of i over j.
    """
    tallies: dict[tuple[str, str], float] = defaultdict(float)
    seen: set[str] = set()

    for res in results:
        seen.update((res.model_a, res.model_b))
        # Map the verdict back to the original A/B positions first.
        verdict = _unswap_winner(res.winner, res.swapped)

        if verdict == "A":
            tallies[res.model_a, res.model_b] += 1.0
        elif verdict == "B":
            tallies[res.model_b, res.model_a] += 1.0
        else:
            # Tie: split the credit evenly in both directions.
            tallies[res.model_a, res.model_b] += 0.5
            tallies[res.model_b, res.model_a] += 0.5

    return tallies, seen
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _bt_mle(
    win_counts: dict[tuple[str, str], float],
    model_names: list[str],
) -> dict[str, float]:
    """Fit Bradley-Terry model via maximum likelihood estimation.

    Returns theta (strength) per model. Uses scipy L-BFGS-B on the
    negative log-likelihood with log-parameterization for positivity.
    """
    n = len(model_names)
    # Degenerate cases: no fit needed.
    if n == 0:
        return {}
    if n == 1:
        return {model_names[0]: 1.0}

    idx = {name: i for i, name in enumerate(model_names)}

    # Collect all pairs with nonzero games
    pairs: list[tuple[int, int, float, float]] = []
    for i_name in model_names:
        for j_name in model_names:
            # Visit each unordered pair once (lexicographic ordering).
            if i_name >= j_name:
                continue
            w_ij = win_counts.get((i_name, j_name), 0.0)
            w_ji = win_counts.get((j_name, i_name), 0.0)
            if w_ij + w_ji > 0:
                pairs.append((idx[i_name], idx[j_name], w_ij, w_ji))

    # No games at all: all models are equally strong by convention.
    if not pairs:
        return {name: 1.0 for name in model_names}

    def neg_log_likelihood(log_theta: np.ndarray) -> float:
        # BT negative log-likelihood in log-strength space.
        nll = 0.0
        for i, j, w_ij, w_ji in pairs:
            diff = log_theta[i] - log_theta[j]
            # log(theta_i / (theta_i + theta_j)) = diff - log(1 + exp(diff))
            # log(theta_j / (theta_i + theta_j)) = -diff - log(1 + exp(-diff))
            # Use log-sum-exp for numerical stability
            log_p_ij = diff - np.logaddexp(0.0, diff)
            log_p_ji = -diff - np.logaddexp(0.0, -diff)
            nll -= w_ij * log_p_ij + w_ji * log_p_ji
        return nll

    def gradient(log_theta: np.ndarray) -> np.ndarray:
        # Analytic gradient of the NLL w.r.t. each log_theta component.
        grad = np.zeros(n)
        for i, j, w_ij, w_ji in pairs:
            diff = log_theta[i] - log_theta[j]
            # NOTE(review): np.exp(-diff) can overflow for very negative diff
            # (result saturates to p_ij = 0, correct but may emit a warning).
            p_ij = 1.0 / (1.0 + np.exp(-diff))  # sigmoid(diff)
            total = w_ij + w_ji
            # d(NLL)/d(log_theta_i)
            grad[i] -= w_ij - total * p_ij
            grad[j] -= w_ji - total * (1.0 - p_ij)
        return grad

    # Start all strengths at log θ = 0. The BT model is shift-invariant in
    # log θ, so the scale is fixed by centering after the fit (below), not by
    # pinning a reference model.
    x0 = np.zeros(n)
    result = minimize(
        neg_log_likelihood,
        x0,
        jac=gradient,
        method="L-BFGS-B",
    )

    log_theta = result.x
    # Center: subtract geometric mean (= mean of log_theta)
    log_theta -= log_theta.mean()
    theta = np.exp(log_theta)

    return {name: float(theta[idx[name]]) for name in model_names}
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _theta_to_elo(theta: dict[str, float], center: float = 1500.0) -> dict[str, float]:
|
| 169 |
+
"""Convert BT theta values to ELO scale.
|
| 170 |
+
|
| 171 |
+
ELO_i = 400 * log10(theta_i / theta_ref) + center
|
| 172 |
+
where theta_ref is the geometric mean of all theta values.
|
| 173 |
+
"""
|
| 174 |
+
if not theta:
|
| 175 |
+
return {}
|
| 176 |
+
|
| 177 |
+
values = list(theta.values())
|
| 178 |
+
log_geo_mean = sum(math.log(v) for v in values) / len(values)
|
| 179 |
+
geo_mean = math.exp(log_geo_mean)
|
| 180 |
+
|
| 181 |
+
return {
|
| 182 |
+
name: 400.0 * math.log10(t / geo_mean) + center
|
| 183 |
+
for name, t in theta.items()
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _bootstrap_ci(
    results: list[ComparisonResult],
    model_names: list[str],
    n_bootstrap: int = 1000,
    ci: float = 0.95,
    seed: int = 42,
) -> dict[str, tuple[float, float]]:
    """Compute bootstrap confidence intervals for ELO ratings.

    Resamples comparisons with replacement, fits BT-MLE each time,
    returns percentile-based CIs.

    Args:
        results: Judged pairwise comparisons to resample from.
        model_names: Models to produce intervals for.
        n_bootstrap: Number of bootstrap resamples.
        ci: Confidence level (0.95 -> 95% interval).
        seed: Seed for the resampling RNG, making CIs reproducible.

    Returns:
        Mapping of model name -> (low, high) ELO bounds; empty when there
        are no results or no models.
    """
    if not results or not model_names:
        return {}

    rng = random.Random(seed)
    n = len(results)
    elo_samples: dict[str, list[float]] = {name: [] for name in model_names}

    for _ in range(n_bootstrap):
        # Resample the same number of comparisons, with replacement.
        boot = rng.choices(results, k=n)
        win_counts, _ = _build_win_matrix(boot)
        theta = _bt_mle(win_counts, model_names)
        elos = _theta_to_elo(theta)
        for name in model_names:
            # .get default is defensive; _theta_to_elo normally covers
            # every model passed to _bt_mle.
            elo_samples[name].append(elos.get(name, 1500.0))

    alpha = (1.0 - ci) / 2.0
    lo_pct = alpha * 100
    hi_pct = (1.0 - alpha) * 100

    # Percentile bounds read directly off the sorted bootstrap samples.
    cis: dict[str, tuple[float, float]] = {}
    for name in model_names:
        samples = sorted(elo_samples[name])
        lo_idx = int(len(samples) * lo_pct / 100)
        hi_idx = min(int(len(samples) * hi_pct / 100), len(samples) - 1)
        cis[name] = (samples[lo_idx], samples[hi_idx])

    return cis
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def rankings_resolved(board: Leaderboard) -> bool:
    """Check if all adjacent ranks have non-overlapping 95% CIs.

    Returns True when the ranking order is statistically resolved — i.e. for
    every pair of adjacent models in the ranking, the higher-ranked model's
    CI lower bound exceeds the lower-ranked model's CI upper bound.
    """
    intervals = board.elo_ci
    if not intervals:
        return False

    ordering = board.ranked
    if len(ordering) < 2:
        return False

    # Walk adjacent (higher, lower) rank pairs.
    for (upper_model, _), (lower_model, _) in zip(ordering, ordering[1:]):
        if upper_model not in intervals or lower_model not in intervals:
            return False
        upper_low = intervals[upper_model][0]
        lower_high = intervals[lower_model][1]
        if lower_high >= upper_low:
            # Overlapping CIs — order not resolved.
            return False
    return True
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def compute_elo(
    results: list[ComparisonResult],
    model_names: list[str],
    n_bootstrap: int = 1000,
) -> Leaderboard:
    """Compute ELO ratings from pairwise comparison results using Bradley-Terry MLE.

    Handles position-bias unswapping: if a result has swapped=True,
    the winner is flipped before updating ratings.

    Bootstrap confidence intervals are computed when n_bootstrap > 0.

    Args:
        results: Judged comparisons (possibly position-swapped).
        model_names: All models to rate, including ones with no games.
        n_bootstrap: Resample count for the bootstrap CIs; 0 disables them.

    Returns:
        Leaderboard with ratings, W/L/T tallies, optional CIs, and a full
        comparison log (with canonical, unswapped winners).
    """
    # Seed the board with INITIAL_ELO; the ratings are fully replaced by
    # the BT-MLE fit below — they are never updated incrementally.
    board = Leaderboard(
        elo={m: INITIAL_ELO for m in model_names},
        wins={m: 0 for m in model_names},
        losses={m: 0 for m in model_names},
        ties={m: 0 for m in model_names},
    )

    # Tally wins/losses/ties and build comparison log
    for r in results:
        # Undo the A/B position swap so tallies refer to the true models.
        winner = _unswap_winner(r.winner, r.swapped)

        if winner == "A":
            board.wins[r.model_a] += 1
            board.losses[r.model_b] += 1
        elif winner == "B":
            board.losses[r.model_a] += 1
            board.wins[r.model_b] += 1
        else:
            # Anything other than "A"/"B" is treated as a tie.
            board.ties[r.model_a] += 1
            board.ties[r.model_b] += 1

        # Log the canonical (unswapped) verdict for publishing/review.
        board.comparison_log.append(
            {
                "sample_idx": r.sample_idx,
                "model_a": r.model_a,
                "model_b": r.model_b,
                "winner": winner,
                "reason": r.reason,
                "agreement": r.agreement,
                "text_a": r.text_a,
                "text_b": r.text_b,
                "col_a": r.col_a,
                "col_b": r.col_b,
            }
        )

    # Fit BT-MLE
    win_counts, _ = _build_win_matrix(results)
    theta = _bt_mle(win_counts, model_names)
    board.elo = _theta_to_elo(theta)

    # Bootstrap CIs
    if n_bootstrap > 0 and results:
        board.elo_ci = _bootstrap_ci(results, model_names, n_bootstrap=n_bootstrap)

    return board
|
src/ocr_bench/judge.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pairwise VLM judge — prompt templates, structured output schema, comparison building."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import base64
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
import random
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from itertools import combinations
|
| 12 |
+
from typing import Any
|
| 13 |
+
|
| 14 |
+
from PIL import Image
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# --- Judge prompt ---
|
| 19 |
+
|
| 20 |
+
PAIRWISE_PROMPT = """\
|
| 21 |
+
You are an expert OCR quality evaluator. You are given a document image and \
|
| 22 |
+
TWO OCR outputs (A and B) extracted from that same image.
|
| 23 |
+
|
| 24 |
+
Compare them and decide which extraction is better overall.
|
| 25 |
+
|
| 26 |
+
Evaluation criteria (in priority order):
|
| 27 |
+
|
| 28 |
+
1. Faithfulness: The output must ONLY contain text actually visible in the document. \
|
| 29 |
+
Hallucinating text that is not in the image (garbled strings, repeated tokens, \
|
| 30 |
+
nonsensical output) is the most serious error. Added commentary or notes \
|
| 31 |
+
(e.g. "it appears the text says...") is also an error, but less severe than \
|
| 32 |
+
hallucination. If a page is blank or has minimal text, saying so is acceptable — \
|
| 33 |
+
fabricating content is always worse.
|
| 34 |
+
|
| 35 |
+
2. Completeness: ALL visible text must be captured — headers, footers, marginalia, \
|
| 36 |
+
stamps, handwritten notes. Missing any section of text is a significant penalty.
|
| 37 |
+
|
| 38 |
+
3. Accuracy: Correct characters, no garbled or fabricated words.
|
| 39 |
+
|
| 40 |
+
4. Reading order: Text flows naturally as a human would read the document.
|
| 41 |
+
|
| 42 |
+
5. Formatting: Clean structure. Ignore bounding box tags like <|ref|> <|det|> \
|
| 43 |
+
if present. Do NOT prefer fancier markdown formatting — plain accurate text is \
|
| 44 |
+
better than nicely formatted but incomplete text.
|
| 45 |
+
|
| 46 |
+
If both outputs capture the same text with similar accuracy, respond with "tie". \
|
| 47 |
+
Only pick a winner when there is a clear quality difference.
|
| 48 |
+
|
| 49 |
+
Output A:
|
| 50 |
+
---
|
| 51 |
+
{ocr_text_a}
|
| 52 |
+
---
|
| 53 |
+
|
| 54 |
+
Output B:
|
| 55 |
+
---
|
| 56 |
+
{ocr_text_b}
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
Respond with JSON only (no markdown fences, no extra text):
|
| 60 |
+
{{"winner": "A", "reason": "brief explanation"}}
|
| 61 |
+
Use "A", "B", or "tie" for the winner field."""
|
| 62 |
+
|
| 63 |
+
# JSON schema describing the judge's expected structured output; used by
# backends that support constrained decoding / response validation.
JUDGE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "winner": {"type": "string", "enum": ["A", "B", "tie"]},
        "reason": {"type": "string"},
    },
    "required": ["winner", "reason"],
}

# Max characters of OCR text to include per output in the prompt.
MAX_OCR_TEXT_LENGTH = 2500

# Max image dimension (longer side) before resizing.
MAX_IMAGE_DIM = 1024
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# --- Image helpers ---
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def image_to_base64(image: Image.Image, max_dim: int = MAX_IMAGE_DIM) -> str:
    """Convert a PIL image to a base64-encoded JPEG string, resizing if needed.

    Args:
        image: Source image; converted to RGB first if in another mode.
        max_dim: Maximum allowed length of the longer side, in pixels.

    Returns:
        Base64-encoded JPEG bytes (quality 85) as an ASCII string.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")
    if max(image.size) > max_dim:
        ratio = max_dim / max(image.size)
        # Clamp each side to >= 1 px: an extreme aspect ratio could otherwise
        # truncate the short side to 0 and make resize() raise.
        new_size = (
            max(1, int(image.width * ratio)),
            max(1, int(image.height * ratio)),
        )
        image = image.resize(new_size, Image.Resampling.LANCZOS)
    buf = io.BytesIO()
    image.save(buf, format="JPEG", quality=85)
    return base64.b64encode(buf.getvalue()).decode()
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# --- Comparison ---
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@dataclass
class Comparison:
    """A single pairwise comparison to evaluate."""

    # Row index into the source dataset.
    sample_idx: int
    # Model names behind the two OCR outputs being compared.
    model_a: str
    model_b: str
    # Dataset column names the two texts were read from.
    col_a: str
    col_b: str
    # True when A/B positions were swapped in the prompt (position-bias control).
    swapped: bool
    # Pre-built chat messages (image + prompt) ready to send to the judge.
    messages: list[dict[str, Any]]
    # Original (unswapped) OCR texts, carried along for result records.
    text_a: str = ""
    text_b: str = ""
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def build_prompt(text_a: str, text_b: str, swapped: bool) -> tuple[str, bool]:
    """Build the pairwise comparison prompt, applying position-bias swap.

    Each text is truncated to MAX_OCR_TEXT_LENGTH characters; when
    ``swapped`` is True the two outputs trade A/B positions.

    Returns (prompt_text, swapped).
    """
    first = text_a[:MAX_OCR_TEXT_LENGTH]
    second = text_b[:MAX_OCR_TEXT_LENGTH]
    if swapped:
        first, second = second, first
    return PAIRWISE_PROMPT.format(ocr_text_a=first, ocr_text_b=second), swapped
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def build_messages(image_b64: str, prompt: str) -> list[dict[str, Any]]:
    """Build chat messages for the judge (image + prompt)."""
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
    }
    text_part = {"type": "text", "text": prompt}
    # Single user turn: the document image first, then the instructions.
    return [{"role": "user", "content": [image_part, text_part]}]
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _normalize_pair(a: str, b: str) -> tuple[str, str]:
|
| 142 |
+
"""Return a canonical (sorted) pair for symmetric lookup."""
|
| 143 |
+
return (a, b) if a <= b else (b, a)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def sample_indices(
    dataset_len: int, max_samples: int | None = None, seed: int = 42
) -> list[int]:
    """Compute sample indices (cheap — no image loading).

    Without ``max_samples`` this is simply ``range(dataset_len)``; with it,
    a reproducible random subset is drawn.

    Args:
        dataset_len: Total number of rows in the dataset.
        max_samples: If set, randomly sample this many indices.
        seed: Random seed for reproducible sampling.

    Returns:
        List of integer indices into the dataset.
    """
    indices = list(range(dataset_len))
    if max_samples and max_samples < len(indices):
        # Use a local Random instance rather than random.seed() so we don't
        # clobber the global RNG state for other callers. Random(seed).sample
        # produces exactly the same sequence as seeding the module RNG, so
        # sampled indices are unchanged.
        indices = random.Random(seed).sample(indices, max_samples)
    return indices
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def build_comparisons(
    dataset: Any,
    ocr_columns: dict[str, str],
    max_samples: int | None = None,
    seed: int = 42,
    skip_pairs: set[tuple[str, str]] | None = None,
    indices: list[int] | None = None,
) -> list[Comparison]:
    """Build pairwise comparison prompts from a dataset.

    Args:
        dataset: HF dataset with an "image" column and OCR output columns.
        ocr_columns: Mapping of column_name -> model_name.
        max_samples: If set, randomly sample this many rows. Ignored when
            ``indices`` is provided.
        seed: Random seed for sampling and position-bias randomization.
        skip_pairs: Set of (model_a, model_b) pairs to exclude. Pairs are
            normalized so (a, b) and (b, a) are treated identically.
            If None, all pairs are included.
        indices: Explicit row indices to use. When provided, ``max_samples``
            and ``seed`` are not used for index selection (seed is still used
            for position-bias randomization).

    Returns:
        List of Comparison objects with pre-built chat messages.
    """
    col_names = list(ocr_columns.keys())
    model_names = list(ocr_columns.values())
    # All unordered index pairs of OCR columns (round-robin tournament).
    pairs = list(combinations(range(len(col_names)), 2))

    # Normalize skip set for symmetric lookup
    normalized_skip: set[tuple[str, str]] = set()
    if skip_pairs:
        normalized_skip = {_normalize_pair(a, b) for a, b in skip_pairs}

    if indices is None:
        indices = sample_indices(len(dataset), max_samples, seed)

    rng = random.Random(seed)
    comparisons: list[Comparison] = []

    # Pre-fetch text columns to avoid triggering image decode per row.
    # HF Dataset supports column access (dataset["col"]), plain lists don't.
    text_cols_data: dict[str, list] | None = None
    if hasattr(dataset, "column_names"):
        text_cols_data = {col: dataset[col] for col in col_names}

    for idx in indices:
        # Determine which pairs need judging for this row
        # NOTE(review): needed_pairs does not depend on idx and could be
        # hoisted out of the loop — harmless but redundant work per row.
        needed_pairs = [
            (i, j)
            for i, j in pairs
            if _normalize_pair(model_names[i], model_names[j]) not in normalized_skip
        ]
        if not needed_pairs:
            continue  # Skip image encoding entirely

        # Check text availability before decoding the image
        valid_pairs = []
        if text_cols_data is not None:
            for i, j in needed_pairs:
                text_a = text_cols_data[col_names[i]][idx] or ""
                text_b = text_cols_data[col_names[j]][idx] or ""
                # Both sides must be non-blank for the judge to compare them.
                if text_a.strip() and text_b.strip():
                    valid_pairs.append((i, j, text_a, text_b))
        else:
            row = dataset[idx]
            for i, j in needed_pairs:
                text_a = row[col_names[i]] or ""
                text_b = row[col_names[j]] or ""
                if text_a.strip() and text_b.strip():
                    valid_pairs.append((i, j, text_a, text_b))

        if not valid_pairs:
            continue

        # Encode the page image once per row, shared by all its pairs.
        image_b64 = image_to_base64(dataset[idx]["image"])

        for i, j, text_a, text_b in valid_pairs:
            # Randomize A/B positions per comparison to counter position bias.
            swapped = rng.random() < 0.5
            prompt, swapped = build_prompt(text_a, text_b, swapped)
            messages = build_messages(image_b64, prompt)

            comparisons.append(
                Comparison(
                    sample_idx=idx,
                    model_a=model_names[i],
                    model_b=model_names[j],
                    col_a=col_names[i],
                    col_b=col_names[j],
                    swapped=swapped,
                    messages=messages,
                    text_a=text_a,
                    text_b=text_b,
                )
            )

    return comparisons
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# --- Output parsing ---
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def parse_judge_output(text: str) -> dict[str, str]:
    """Parse judge JSON output, handling markdown fences and invalid values.

    Tolerates fenced output (```json ... ```), non-object JSON, and a
    non-string/invalid ``winner`` value (coerced to "tie").

    Returns dict with "winner" and "reason" keys, or empty dict on failure.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (guarding against single-line fenced
        # output, which previously raised IndexError), then everything
        # after the closing fence.
        parts = text.split("\n", 1)
        text = (parts[1] if len(parts) == 2 else "").rsplit("```", 1)[0].strip()
    try:
        result = json.loads(text)
    except json.JSONDecodeError:
        logger.warning("Failed to parse judge output: %s", text[:200])
        return {}
    if not isinstance(result, dict):
        # Valid JSON but not an object (e.g. a bare list or string) —
        # previously this raised AttributeError on .get().
        logger.warning("Judge output is not a JSON object: %s", text[:200])
        return {}
    winner = result.get("winner", "tie")
    winner = winner.upper().strip() if isinstance(winner, str) else "tie"
    if winner == "TIE":
        winner = "tie"
    if winner not in ("A", "B", "tie"):
        winner = "tie"
    reason = result.get("reason", "")
    return {"winner": winner, "reason": reason if isinstance(reason, str) else str(reason)}
|
src/ocr_bench/publish.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hub publishing — push comparisons, leaderboard, and metadata configs to HF Hub."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import datetime
|
| 6 |
+
import json
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
import structlog
|
| 10 |
+
from datasets import Dataset, load_dataset
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
|
| 13 |
+
from ocr_bench.elo import ComparisonResult, Leaderboard
|
| 14 |
+
|
| 15 |
+
logger = structlog.get_logger()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class EvalMetadata:
    """Metadata for an evaluation run, stored alongside results on Hub."""

    # Hub id of the dataset the OCR outputs were read from.
    source_dataset: str
    # Judge model identifiers used for the pairwise comparisons.
    judge_models: list[str]
    # Seed used for this run's sampling/randomization.
    seed: int
    # Row-sampling cap used for this run.
    max_samples: int
    # Comparisons attempted in this run.
    total_comparisons: int
    # Comparisons that yielded a usable verdict.
    valid_comparisons: int
    # Presumably: whether OCR columns came from pull-request revisions
    # — confirm against the CLI that sets it.
    from_prs: bool = False
    # UTC ISO-8601 timestamp; auto-filled when left empty.
    timestamp: str = ""

    def __post_init__(self):
        # Default the timestamp to "now" (UTC) when the caller didn't set one.
        if not self.timestamp:
            self.timestamp = datetime.datetime.now(datetime.UTC).isoformat()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def load_existing_comparisons(repo_id: str) -> list[ComparisonResult]:
    """Load existing comparisons from a Hub results repo.

    The stored winner is already unswapped (canonical), so ``swapped=False``.
    Returns an empty list if the repo or config doesn't exist.
    """
    try:
        ds = load_dataset(repo_id, name="comparisons", split="train")
    except Exception as exc:
        logger.info("no_existing_comparisons", repo=repo_id, reason=str(exc))
        return []

    # Rehydrate each stored row into a ComparisonResult; optional fields
    # fall back to the same defaults the writer used.
    results = [
        ComparisonResult(
            sample_idx=row["sample_idx"],
            model_a=row["model_a"],
            model_b=row["model_b"],
            winner=row["winner"],
            reason=row.get("reason", ""),
            agreement=row.get("agreement", "1/1"),
            swapped=False,
            text_a=row.get("text_a", ""),
            text_b=row.get("text_b", ""),
            col_a=row.get("col_a", ""),
            col_b=row.get("col_b", ""),
        )
        for row in ds
    ]
    logger.info("loaded_existing_comparisons", repo=repo_id, n=len(results))
    return results
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def load_existing_metadata(repo_id: str) -> list[dict]:
    """Load existing metadata rows from a Hub results repo.

    Returns an empty list if the repo or config doesn't exist.
    """
    try:
        # Iteration stays inside the try so download/decode failures are
        # also treated as "no metadata yet".
        rows = load_dataset(repo_id, name="metadata", split="train")
        return [dict(entry) for entry in rows]
    except Exception as exc:
        logger.info("no_existing_metadata", repo=repo_id, reason=str(exc))
        return []
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def build_leaderboard_rows(board: Leaderboard) -> list[dict]:
    """Convert a Leaderboard into rows suitable for a Hub dataset."""
    rows: list[dict] = []
    for model, rating in board.ranked:
        games = board.wins[model] + board.losses[model] + board.ties[model]
        entry = {
            "model": model,
            "elo": round(rating),
            "wins": board.wins[model],
            "losses": board.losses[model],
            "ties": board.ties[model],
            # Win percentage over all games; 0 for models with no games.
            "win_pct": round(board.wins[model] / games * 100) if games > 0 else 0,
        }
        # Attach CI bounds only when they were computed for this model.
        if board.elo_ci and model in board.elo_ci:
            low, high = board.elo_ci[model]
            entry["elo_low"] = round(low)
            entry["elo_high"] = round(high)
        rows.append(entry)
    return rows
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def build_metadata_row(metadata: EvalMetadata) -> dict:
    """Convert EvalMetadata into a single row for a Hub dataset."""
    # judge_models is a list — serialize it so every column is a flat scalar.
    serialized_judges = json.dumps(metadata.judge_models)
    return {
        "source_dataset": metadata.source_dataset,
        "judge_models": serialized_judges,
        "seed": metadata.seed,
        "max_samples": metadata.max_samples,
        "total_comparisons": metadata.total_comparisons,
        "valid_comparisons": metadata.valid_comparisons,
        "from_prs": metadata.from_prs,
        "timestamp": metadata.timestamp,
    }
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def publish_results(
    repo_id: str,
    board: Leaderboard,
    metadata: EvalMetadata,
    existing_metadata: list[dict] | None = None,
) -> None:
    """Push evaluation results to Hub as a dataset with multiple configs.

    Configs:
    - (default): Leaderboard table — ``load_dataset("repo")`` returns this.
    - ``leaderboard``: Same table, named config (backward compat for viewer).
    - ``comparisons``: Full comparison log from the board (caller merges
      existing + new before ``compute_elo``, so ``board.comparison_log``
      is already the complete set).
    - ``metadata``: Append-only run log. New row is appended to
      ``existing_metadata``.
    """
    # Comparisons — skipped entirely when the log is empty.
    if board.comparison_log:
        comp_ds = Dataset.from_list(board.comparison_log)
        comp_ds.push_to_hub(repo_id, config_name="comparisons")
        logger.info("published_comparisons", repo=repo_id, n=len(board.comparison_log))

    # Leaderboard — dual push: default config + named config
    rows = build_leaderboard_rows(board)
    lb_ds = Dataset.from_list(rows)
    lb_ds.push_to_hub(repo_id)
    lb_ds.push_to_hub(repo_id, config_name="leaderboard")
    logger.info("published_leaderboard", repo=repo_id, n=len(rows))

    # Metadata — append-only: re-upload prior history plus this run's row.
    meta_row = build_metadata_row(metadata)
    all_meta = (existing_metadata or []) + [meta_row]
    Dataset.from_list(all_meta).push_to_hub(repo_id, config_name="metadata")
    logger.info("published_metadata", repo=repo_id, n=len(all_meta))

    # README — auto-generated dataset card with leaderboard
    readme = _build_readme(repo_id, rows, board, metadata)
    api = HfApi()
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info("published_readme", repo=repo_id)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _build_readme(
    repo_id: str,
    rows: list[dict],
    board: Leaderboard,
    metadata: EvalMetadata,
) -> str:
    """Build a dataset card README with the leaderboard table.

    Args:
        repo_id: Hub repo the card is for (used in the usage snippets).
        rows: Leaderboard rows from ``build_leaderboard_rows``.
        board: Leaderboard (for CI presence and comparison count).
        metadata: Run metadata (source dataset, judges).

    Returns:
        Complete README.md contents with a trailing newline.
    """
    has_ci = bool(board.elo_ci)
    source_short = metadata.source_dataset.split("/")[-1]
    # judge_models may arrive as a JSON string (round-tripped through Hub)
    # or a plain list — normalize to a list either way.
    judges = json.loads(
        metadata.judge_models
        if isinstance(metadata.judge_models, str)
        else json.dumps(metadata.judge_models)
    )
    judge_str = ", ".join(j.split("/")[-1] for j in judges) if judges else "N/A"
    n_comparisons = len(board.comparison_log)

    # YAML front matter declares the parquet layout for each named config.
    lines = [
        "---",
        "license: mit",
        "tags:",
        "  - ocr-bench",
        "  - leaderboard",
        "configs:",
        "  - config_name: default",
        "    data_files:",
        "      - split: train",
        "        path: data/train-*.parquet",
        "  - config_name: comparisons",
        "    data_files:",
        "      - split: train",
        "        path: comparisons/train-*.parquet",
        "  - config_name: leaderboard",
        "    data_files:",
        "      - split: train",
        "        path: leaderboard/train-*.parquet",
        "  - config_name: metadata",
        "    data_files:",
        "      - split: train",
        "        path: metadata/train-*.parquet",
        "---",
        "",
        f"# OCR Bench Results: {source_short}",
        "",
        "VLM-as-judge pairwise evaluation of OCR models. "
        "Rankings depend on document type — there is no single best OCR model.",
        "",
        "## Leaderboard",
        "",
    ]

    # Table header
    if has_ci:
        lines.append("| Rank | Model | ELO | 95% CI | Wins | Losses | Ties | Win% |")
        lines.append("|------|-------|-----|--------|------|--------|------|------|")
    else:
        lines.append("| Rank | Model | ELO | Wins | Losses | Ties | Win% |")
        lines.append("|------|-------|-----|------|--------|------|------|")

    for rank, row in enumerate(rows, 1):
        model = row["model"]
        elo = row["elo"]
        if has_ci and "elo_low" in row:
            # \u2013 is an en dash: renders as "low–high".
            ci = f"{row['elo_low']}\u2013{row['elo_high']}"
            lines.append(
                f"| {rank} | {model} | {elo} | {ci} "
                f"| {row['wins']} | {row['losses']} | {row['ties']} "
                f"| {row['win_pct']}% |"
            )
        else:
            lines.append(
                f"| {rank} | {model} | {elo} "
                f"| {row['wins']} | {row['losses']} | {row['ties']} "
                f"| {row['win_pct']}% |"
            )

    lines += [
        "",
        "## Details",
        "",
        f"- **Source dataset**: [`{metadata.source_dataset}`]"
        f"(https://huggingface.co/datasets/{metadata.source_dataset})",
        f"- **Judge**: {judge_str}",
        f"- **Comparisons**: {n_comparisons}",
        "- **Method**: Bradley-Terry MLE with bootstrap 95% CIs",
        "",
        "## Configs",
        "",
        f"- `load_dataset(\"{repo_id}\")` — leaderboard table",
        f"- `load_dataset(\"{repo_id}\", name=\"comparisons\")` "
        "— full pairwise comparison log",
        f"- `load_dataset(\"{repo_id}\", name=\"metadata\")` "
        "— evaluation run history",
        "",
        "*Generated by [ocr-bench](https://github.com/davanstrien/ocr-bench)*",
    ]

    return "\n".join(lines) + "\n"
|
src/ocr_bench/run.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR model orchestration — launch HF Jobs for multiple OCR models."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
|
| 8 |
+
import structlog
|
| 9 |
+
from huggingface_hub import HfApi, get_token
|
| 10 |
+
|
| 11 |
+
logger = structlog.get_logger()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class ModelConfig:
    """Configuration for a single OCR model runnable via HF Jobs."""

    # URL of the standalone uv script that performs OCR for this model
    # (HF Jobs fetches and executes it directly).
    script: str
    # Hugging Face model repo id (e.g. "zai-org/GLM-OCR"); informational —
    # not read by the launch code in this module.
    model_id: str
    # Human-readable parameter count, e.g. "0.9B" (display only).
    size: str
    # Default HF Jobs hardware flavor; launch_ocr_jobs can override it.
    default_flavor: str = "l4x1"
    # Extra CLI arguments always appended for this model's script.
    default_args: list[str] = field(default_factory=list)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Registry of supported OCR models, keyed by the CLI slug users pass in.
# Each entry points at a standalone uv script hosted in the uv-scripts/ocr
# dataset repo.
MODEL_REGISTRY: dict[str, ModelConfig] = {
    "glm-ocr": ModelConfig(
        script="https://huggingface.co/datasets/uv-scripts/ocr/raw/main/glm-ocr.py",
        model_id="zai-org/GLM-OCR",
        size="0.9B",
        default_flavor="l4x1",
    ),
    "deepseek-ocr": ModelConfig(
        script="https://huggingface.co/datasets/uv-scripts/ocr/raw/main/deepseek-ocr-vllm.py",
        model_id="deepseek-ai/DeepSeek-OCR",
        size="4B",
        default_flavor="l4x1",
        # Always run DeepSeek in its "free" prompt mode.
        default_args=["--prompt-mode", "free"],
    ),
    "lighton-ocr-2": ModelConfig(
        script="https://huggingface.co/datasets/uv-scripts/ocr/raw/main/lighton-ocr2.py",
        model_id="lightonai/LightOnOCR-2-1B",
        size="1B",
        # NOTE(review): uses a larger flavor than the other 1B-class models —
        # presumably a memory requirement of its script; confirm before changing.
        default_flavor="a100-large",
    ),
    "dots-ocr": ModelConfig(
        script="https://huggingface.co/datasets/uv-scripts/ocr/raw/main/dots-ocr.py",
        model_id="rednote-hilab/dots.ocr",
        size="1.7B",
        default_flavor="l4x1",
    ),
}

# Models launched when the caller does not pass an explicit selection.
DEFAULT_MODELS = ["glm-ocr", "deepseek-ocr", "lighton-ocr-2", "dots-ocr"]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass
class JobRun:
    """Tracks a launched HF Job."""

    # Slug of the model this job runs (key into MODEL_REGISTRY).
    model_slug: str
    # HF Jobs identifier, used later by HfApi.inspect_job in poll_jobs.
    job_id: str
    # Web URL for viewing the job's logs/status.
    job_url: str
    # "running" until poll_jobs observes a terminal stage, then the
    # lowercased stage name ("completed", "error", "canceled", "deleted").
    status: str = "running"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def list_models() -> list[str]:
    """Return the available model slugs in alphabetical order."""
    # Iterating a dict yields its keys, so no explicit .keys() is needed.
    return sorted(MODEL_REGISTRY)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def build_script_args(
    input_dataset: str,
    output_repo: str,
    config_name: str,
    *,
    max_samples: int | None = None,
    shuffle: bool = False,
    seed: int = 42,
    extra_args: list[str] | None = None,
) -> list[str]:
    """Build the script_args list for run_uv_job.

    The positional dataset ids come first, followed by the target config
    name and --create-pr (results are always submitted as a pull request).
    Optional flags are appended only when they deviate from the defaults.
    """
    cli: list[str] = [
        input_dataset,
        output_repo,
        "--config",
        config_name,
        "--create-pr",
    ]
    if max_samples is not None:
        cli.extend(["--max-samples", str(max_samples)])
    if shuffle:
        cli.append("--shuffle")
    # 42 is the scripts' own default seed, so passing it would be redundant.
    if seed != 42:
        cli.extend(["--seed", str(seed)])
    if extra_args:
        cli.extend(extra_args)
    return cli
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def launch_ocr_jobs(
    input_dataset: str,
    output_repo: str,
    *,
    models: list[str] | None = None,
    max_samples: int | None = None,
    split: str = "train",
    shuffle: bool = False,
    seed: int = 42,
    flavor_override: str | None = None,
    timeout: str = "4h",
    api: HfApi | None = None,
) -> list[JobRun]:
    """Launch HF Jobs for each model. Returns list of JobRun tracking objects.

    Args:
        input_dataset: Dataset repo id with the documents to OCR.
        output_repo: Dataset repo id the jobs write results to — one config
            per model slug, submitted as a PR (--create-pr).
        models: Model slugs to run; defaults to DEFAULT_MODELS.
        max_samples: Optional cap on samples processed per job.
        split: NOTE(review): currently unused in this function — confirm
            whether the uv scripts accept a split flag and wire it through.
        shuffle: Forwarded to the scripts as --shuffle.
        seed: Forwarded as --seed only when it differs from the default 42.
        flavor_override: Hardware flavor applied to every job, overriding
            each model's default_flavor.
        timeout: HF Jobs timeout string (e.g. "4h").
        api: Optional HfApi instance (a default one is created if omitted).

    Raises:
        RuntimeError: If no HF token is available.
        ValueError: If any requested slug is not in MODEL_REGISTRY.
    """
    if api is None:
        api = HfApi()

    # The token is forwarded to each job so it can push to output_repo.
    token = get_token()
    if not token:
        raise RuntimeError("No HF token found. Log in with `hf login` or set HF_TOKEN.")

    # Validate every slug up front so we fail before launching anything.
    selected = models or DEFAULT_MODELS
    for slug in selected:
        if slug not in MODEL_REGISTRY:
            raise ValueError(
                f"Unknown model: {slug}. Available: {', '.join(MODEL_REGISTRY.keys())}"
            )

    jobs: list[JobRun] = []
    for slug in selected:
        config = MODEL_REGISTRY[slug]
        flavor = flavor_override or config.default_flavor
        script_args = build_script_args(
            input_dataset,
            output_repo,
            slug,  # each model writes to its own dataset config named after the slug
            max_samples=max_samples,
            shuffle=shuffle,
            seed=seed,
            extra_args=config.default_args or None,
        )

        logger.info("launching_job", model=slug, flavor=flavor, script=config.script)
        job = api.run_uv_job(
            script=config.script,
            script_args=script_args,
            flavor=flavor,
            secrets={"HF_TOKEN": token},
            timeout=timeout,
        )
        jobs.append(JobRun(model_slug=slug, job_id=job.id, job_url=job.url))
        logger.info("job_launched", model=slug, job_id=job.id, url=job.url)

    return jobs
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# HF Job lifecycle stages after which a job will not change state again.
_TERMINAL_STAGES = frozenset({"COMPLETED", "ERROR", "CANCELED", "DELETED"})


def poll_jobs(
    jobs: list[JobRun],
    *,
    interval: int = 30,
    api: HfApi | None = None,
) -> list[JobRun]:
    """Poll until all jobs complete or fail. Updates status in-place and returns the list.

    Args:
        jobs: JobRun records from launch_ocr_jobs; only entries whose
            status is "running" are polled.
        interval: Seconds to sleep between polling rounds.
        api: Optional HfApi instance (a default one is created if omitted).

    Returns:
        The same list object, with each finished JobRun's status set to the
        lowercased terminal stage (e.g. "completed", "error").
    """
    if api is None:
        api = HfApi()

    # Track only jobs that still need watching; finished ones keep their status.
    pending = {j.job_id: j for j in jobs if j.status == "running"}

    while pending:
        # Sleep first: jobs launched moments ago cannot be done yet.
        time.sleep(interval)
        still_running: dict[str, JobRun] = {}
        for job_id, job_run in pending.items():
            info = api.inspect_job(job_id=job_id)
            stage = info.status.stage
            if stage in _TERMINAL_STAGES:
                job_run.status = stage.lower()
                logger.info("job_finished", model=job_run.model_slug, status=job_run.status)
            else:
                still_running[job_id] = job_run
        pending = still_running
        if pending:
            slugs = [j.model_slug for j in pending.values()]
            logger.info("jobs_pending", models=slugs)

    return jobs
|
src/ocr_bench/space.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HF Space entry point for ocr-bench viewer."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
import uvicorn
|
| 6 |
+
|
| 7 |
+
from ocr_bench.web import create_app
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main():
    """Serve the ocr-bench viewer as an HF Space.

    Reads the results dataset id from the REPOS env var (comma-separated;
    only the first entry is served) and runs the FastAPI app with uvicorn.
    """
    repos = os.environ.get("REPOS", "davanstrien/bpl-ocr-bench-results")
    # Only the first comma-separated repo is served by this entry point.
    repo_id = repos.split(",")[0].strip()
    app = create_app(repo_id)
    # HF Spaces routes traffic to 7860 by default; honor $PORT when set so
    # the same entry point works behind other PaaS-style launchers.
    port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()
|
src/ocr_bench/static/style.css
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ocr-bench viewer — Tufte-inspired minimal styles */
|
| 2 |
+
|
| 3 |
+
*,
|
| 4 |
+
*::before,
|
| 5 |
+
*::after {
|
| 6 |
+
box-sizing: border-box;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
body {
|
| 10 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 11 |
+
color: #333;
|
| 12 |
+
background: #fff;
|
| 13 |
+
margin: 0;
|
| 14 |
+
padding: 0;
|
| 15 |
+
line-height: 1.5;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
.container {
|
| 19 |
+
max-width: 960px;
|
| 20 |
+
margin: 0 auto;
|
| 21 |
+
padding: 0 1.5rem 3rem;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
/* Navigation */
|
| 25 |
+
nav {
|
| 26 |
+
border-bottom: 1px solid #ddd;
|
| 27 |
+
padding: 0.75rem 0;
|
| 28 |
+
margin-bottom: 2rem;
|
| 29 |
+
display: flex;
|
| 30 |
+
align-items: baseline;
|
| 31 |
+
gap: 2rem;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
nav .brand {
|
| 35 |
+
font-weight: 600;
|
| 36 |
+
color: #333;
|
| 37 |
+
text-decoration: none;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
letter-spacing: 0.02em;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
nav a {
|
| 43 |
+
color: #666;
|
| 44 |
+
text-decoration: none;
|
| 45 |
+
font-size: 0.85rem;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
nav a:hover,
|
| 49 |
+
nav a.active {
|
| 50 |
+
color: #333;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
nav a.active {
|
| 54 |
+
border-bottom: 2px solid #333;
|
| 55 |
+
padding-bottom: 2px;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
/* Comparison layout */
|
| 59 |
+
.comparison-columns {
|
| 60 |
+
display: grid;
|
| 61 |
+
grid-template-columns: 1fr 1fr;
|
| 62 |
+
gap: 2rem;
|
| 63 |
+
margin: 1.5rem 0;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
.ocr-column h3 {
|
| 67 |
+
font-size: 0.85rem;
|
| 68 |
+
font-weight: 600;
|
| 69 |
+
color: #666;
|
| 70 |
+
margin: 0 0 0.5rem;
|
| 71 |
+
padding-bottom: 0.35rem;
|
| 72 |
+
border-bottom: 1px solid #ddd;
|
| 73 |
+
letter-spacing: 0.02em;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.ocr-column h3.revealed {
|
| 77 |
+
color: #333;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.ocr-text {
|
| 81 |
+
font-family: "SF Mono", "Menlo", "Consolas", monospace;
|
| 82 |
+
font-size: 0.82rem;
|
| 83 |
+
line-height: 1.6;
|
| 84 |
+
white-space: pre-wrap;
|
| 85 |
+
word-break: break-word;
|
| 86 |
+
max-height: 50vh;
|
| 87 |
+
overflow-y: auto;
|
| 88 |
+
padding: 0.25rem 0;
|
| 89 |
+
color: #444;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
/* Navigation header */
|
| 93 |
+
.comp-nav {
|
| 94 |
+
display: flex;
|
| 95 |
+
justify-content: flex-end;
|
| 96 |
+
align-items: baseline;
|
| 97 |
+
gap: 0.75rem;
|
| 98 |
+
margin-bottom: 0.5rem;
|
| 99 |
+
color: #999;
|
| 100 |
+
font-size: 0.8rem;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.comp-nav a {
|
| 104 |
+
color: #999;
|
| 105 |
+
text-decoration: none;
|
| 106 |
+
font-size: 0.85rem;
|
| 107 |
+
padding: 0.15rem 0.4rem;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.comp-nav a:hover {
|
| 111 |
+
color: #333;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
/* Vote prompt */
|
| 115 |
+
.vote-prompt {
|
| 116 |
+
text-align: center;
|
| 117 |
+
font-size: 0.8rem;
|
| 118 |
+
color: #999;
|
| 119 |
+
margin: 1.5rem 0 0.5rem;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
/* Vote buttons */
|
| 123 |
+
.vote-row {
|
| 124 |
+
text-align: center;
|
| 125 |
+
margin: 0.25rem 0 0.5rem;
|
| 126 |
+
display: flex;
|
| 127 |
+
justify-content: center;
|
| 128 |
+
gap: 0.5rem;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
.vote-btn {
|
| 132 |
+
display: inline-block;
|
| 133 |
+
color: #555;
|
| 134 |
+
text-decoration: none;
|
| 135 |
+
padding: 0.35rem 1rem;
|
| 136 |
+
border: 1px solid #ddd;
|
| 137 |
+
border-radius: 4px;
|
| 138 |
+
font-size: 0.85rem;
|
| 139 |
+
transition: border-color 0.15s, color 0.15s;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
.vote-btn:hover {
|
| 143 |
+
color: #333;
|
| 144 |
+
border-color: #999;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
.vote-btn.vote-tie {
|
| 148 |
+
color: #888;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
/* Hints below vote buttons */
|
| 152 |
+
.vote-hints {
|
| 153 |
+
text-align: center;
|
| 154 |
+
margin: 0.5rem 0 1rem;
|
| 155 |
+
font-size: 0.75rem;
|
| 156 |
+
color: #bbb;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
.vote-hints a {
|
| 160 |
+
color: #999;
|
| 161 |
+
text-decoration: none;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.vote-hints a:hover {
|
| 165 |
+
color: #666;
|
| 166 |
+
text-decoration: underline;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.vote-hints .separator {
|
| 170 |
+
color: #ddd;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
.vote-hints kbd {
|
| 174 |
+
font-family: system-ui, sans-serif;
|
| 175 |
+
font-size: 0.7rem;
|
| 176 |
+
padding: 0.05rem 0.3rem;
|
| 177 |
+
border: 1px solid #ddd;
|
| 178 |
+
border-radius: 3px;
|
| 179 |
+
background: #f8f8f8;
|
| 180 |
+
color: #999;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
/* Legacy reveal-row (kept for compat) */
|
| 184 |
+
.reveal-row {
|
| 185 |
+
text-align: right;
|
| 186 |
+
margin: 0.25rem 0 1rem;
|
| 187 |
+
font-size: 0.8rem;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.reveal-row a {
|
| 191 |
+
color: #999;
|
| 192 |
+
text-decoration: none;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
.reveal-row a:hover {
|
| 196 |
+
color: #666;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
/* Verdict display */
|
| 200 |
+
.verdict {
|
| 201 |
+
margin: 1rem 0;
|
| 202 |
+
font-size: 0.85rem;
|
| 203 |
+
color: #555;
|
| 204 |
+
line-height: 1.6;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.verdict .agreement {
|
| 208 |
+
font-weight: 500;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
.verdict .agreement.agreed {
|
| 212 |
+
color: #457b4d;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.verdict .agreement.soft-disagree {
|
| 216 |
+
color: #a07828;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.verdict .agreement.hard-disagree {
|
| 220 |
+
color: #b04040;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.verdict .reason {
|
| 224 |
+
font-style: italic;
|
| 225 |
+
color: #777;
|
| 226 |
+
display: block;
|
| 227 |
+
margin-top: 0.25rem;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
/* Document image */
|
| 231 |
+
.doc-image {
|
| 232 |
+
margin: 1.5rem 0;
|
| 233 |
+
text-align: center;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
.doc-image img {
|
| 237 |
+
max-width: 100%;
|
| 238 |
+
height: auto;
|
| 239 |
+
max-height: 60vh;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
/* Leaderboard table */
|
| 243 |
+
table {
|
| 244 |
+
width: 100%;
|
| 245 |
+
border-collapse: collapse;
|
| 246 |
+
font-size: 0.85rem;
|
| 247 |
+
margin: 1.5rem 0;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
thead th {
|
| 251 |
+
text-align: left;
|
| 252 |
+
font-weight: 600;
|
| 253 |
+
padding: 0.5rem 0.75rem;
|
| 254 |
+
border-bottom: 2px solid #333;
|
| 255 |
+
color: #333;
|
| 256 |
+
font-size: 0.8rem;
|
| 257 |
+
letter-spacing: 0.02em;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
thead th.num {
|
| 261 |
+
text-align: right;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
tbody td {
|
| 265 |
+
padding: 0.4rem 0.75rem;
|
| 266 |
+
border-bottom: 1px solid #eee;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
tbody td.num {
|
| 270 |
+
text-align: right;
|
| 271 |
+
font-variant-numeric: tabular-nums;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
tbody td.model {
|
| 275 |
+
font-weight: 500;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
tbody tr:hover {
|
| 279 |
+
background: #fafafa;
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
/* Filters */
|
| 283 |
+
.filters {
|
| 284 |
+
display: flex;
|
| 285 |
+
gap: 1rem;
|
| 286 |
+
margin-bottom: 1rem;
|
| 287 |
+
align-items: center;
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
.filters label {
|
| 291 |
+
font-size: 0.8rem;
|
| 292 |
+
color: #666;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.filters select {
|
| 296 |
+
font-size: 0.8rem;
|
| 297 |
+
padding: 0.25rem 0.5rem;
|
| 298 |
+
border: 1px solid #ddd;
|
| 299 |
+
border-radius: 3px;
|
| 300 |
+
background: #fff;
|
| 301 |
+
color: #333;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
/* Stats panel */
|
| 305 |
+
.stats-panel {
|
| 306 |
+
color: #888;
|
| 307 |
+
font-size: 0.8rem;
|
| 308 |
+
padding: 1rem 0;
|
| 309 |
+
border-top: 1px solid #eee;
|
| 310 |
+
margin-top: 2rem;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.stats-panel .calibrated {
|
| 314 |
+
color: #457b4d;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.stats-panel .warning {
|
| 318 |
+
color: #b04040;
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
/* Pair summary table */
|
| 322 |
+
.pair-summary {
|
| 323 |
+
margin-bottom: 1rem;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
.pair-table {
|
| 327 |
+
width: auto;
|
| 328 |
+
font-size: 0.8rem;
|
| 329 |
+
color: #888;
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
.pair-table th {
|
| 333 |
+
font-size: 0.75rem;
|
| 334 |
+
color: #999;
|
| 335 |
+
font-weight: 500;
|
| 336 |
+
padding: 0.2rem 0.6rem;
|
| 337 |
+
border-bottom: 1px solid #ddd;
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
.pair-table td {
|
| 341 |
+
padding: 0.15rem 0.6rem;
|
| 342 |
+
border-bottom: 1px solid #f0f0f0;
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
/* HTMX loading indicator */
|
| 346 |
+
.htmx-indicator {
|
| 347 |
+
opacity: 0;
|
| 348 |
+
transition: opacity 200ms ease-in;
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
.htmx-request .htmx-indicator,
|
| 352 |
+
.htmx-request.htmx-indicator {
|
| 353 |
+
opacity: 1;
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
/* Empty state */
|
| 357 |
+
.empty {
|
| 358 |
+
text-align: center;
|
| 359 |
+
color: #999;
|
| 360 |
+
padding: 3rem 0;
|
| 361 |
+
font-size: 0.9rem;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
/* Responsive */
|
| 365 |
+
@media (max-width: 768px) {
|
| 366 |
+
.comparison-columns {
|
| 367 |
+
grid-template-columns: 1fr;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.container {
|
| 371 |
+
padding: 0 1rem 2rem;
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
table {
|
| 375 |
+
display: block;
|
| 376 |
+
overflow-x: auto;
|
| 377 |
+
-webkit-overflow-scrolling: touch;
|
| 378 |
+
}
|
| 379 |
+
}
|
src/ocr_bench/templates/base.html
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{% block title %}OCR Bench{% endblock %}</title>
<link rel="stylesheet" href="/static/style.css">
<!-- HTMX drives the partial swaps used by the comparison views. -->
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
</head>
<body>
<div class="container">
<nav>
<a href="/" class="brand">ocr-bench</a>
<a href="/leaderboard" {% if active_tab == "leaderboard" %}class="active"{% endif %}>Leaderboard</a>
<a href="/comparisons" {% if active_tab == "comparisons" %}class="active"{% endif %}>Comparisons</a>
</nav>
{% block content %}{% endblock %}
</div>

<!-- Global keyboard shortcuts: arrows navigate comparisons, a/t/b vote,
     r reveals the judge verdict. Each key simply clicks the matching
     data-* anchor rendered by the comparison card, so HTMX performs the
     actual request; keys are no-ops on pages without those anchors. -->
<script>
document.addEventListener("keydown", function(e) {
  // Ignore when focus is in input/select/textarea
  var tag = document.activeElement.tagName.toLowerCase();
  if (tag === "input" || tag === "select" || tag === "textarea") return;

  if (e.key === "ArrowLeft") {
    var prev = document.querySelector("[data-nav='prev']");
    if (prev) { prev.click(); e.preventDefault(); }
  } else if (e.key === "ArrowRight") {
    var next = document.querySelector("[data-nav='next']");
    if (next) { next.click(); e.preventDefault(); }
  } else if (e.key === "a" || e.key === "A") {
    var voteA = document.querySelector("[data-vote='A']");
    if (voteA) { voteA.click(); e.preventDefault(); }
  } else if (e.key === "b" || e.key === "B") {
    var voteB = document.querySelector("[data-vote='B']");
    if (voteB) { voteB.click(); e.preventDefault(); }
  } else if (e.key === "t" || e.key === "T") {
    var voteTie = document.querySelector("[data-vote='tie']");
    if (voteTie) { voteTie.click(); e.preventDefault(); }
  } else if (e.key === "r" || e.key === "R") {
    var reveal = document.querySelector("[data-action='reveal']");
    if (reveal) { reveal.click(); e.preventDefault(); }
  }
});
</script>
</body>
</html>
|
src/ocr_bench/templates/comparison_card.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% if comp %}
|
| 2 |
+
<div class="comp-nav">
|
| 3 |
+
<span>{{ nav_idx + 1 }} of {{ nav_total }}</span>
|
| 4 |
+
{% if nav_idx > 0 %}
|
| 5 |
+
<a href="#" data-nav="prev"
|
| 6 |
+
hx-get="/comparisons/{{ nav_idx - 1 }}{% if winner_filter and winner_filter != 'All' %}?winner={{ winner_filter }}{% endif %}{% if model_filter and model_filter != 'All' %}{{ '&' if winner_filter and winner_filter != 'All' else '?' }}model={{ model_filter }}{% endif %}"
|
| 7 |
+
hx-target="#comparison-container">←</a>
|
| 8 |
+
{% endif %}
|
| 9 |
+
{% if nav_idx < nav_total - 1 %}
|
| 10 |
+
<a href="#" data-nav="next"
|
| 11 |
+
hx-get="/comparisons/{{ nav_idx + 1 }}{% if winner_filter and winner_filter != 'All' %}?winner={{ winner_filter }}{% endif %}{% if model_filter and model_filter != 'All' %}{{ '&' if winner_filter and winner_filter != 'All' else '?' }}model={{ model_filter }}{% endif %}"
|
| 12 |
+
hx-target="#comparison-container">→</a>
|
| 13 |
+
{% endif %}
|
| 14 |
+
</div>
|
| 15 |
+
|
| 16 |
+
<div class="comparison-columns">
|
| 17 |
+
<div class="ocr-column">
|
| 18 |
+
{% if revealed %}
|
| 19 |
+
<h3 class="revealed">{{ model_a_name }}</h3>
|
| 20 |
+
{% else %}
|
| 21 |
+
<h3>A</h3>
|
| 22 |
+
{% endif %}
|
| 23 |
+
<div class="ocr-text">{{ display_text_a }}</div>
|
| 24 |
+
</div>
|
| 25 |
+
<div class="ocr-column">
|
| 26 |
+
{% if revealed %}
|
| 27 |
+
<h3 class="revealed">{{ model_b_name }}</h3>
|
| 28 |
+
{% else %}
|
| 29 |
+
<h3>B</h3>
|
| 30 |
+
{% endif %}
|
| 31 |
+
<div class="ocr-text">{{ display_text_b }}</div>
|
| 32 |
+
</div>
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
{% if not voted %}
|
| 36 |
+
<div class="vote-prompt">Which OCR output is better?</div>
|
| 37 |
+
<div class="vote-row">
|
| 38 |
+
<a href="#" data-vote="A" class="vote-btn"
|
| 39 |
+
hx-post="/vote/{{ comp_idx }}"
|
| 40 |
+
hx-vals='{"winner": "A"}'
|
| 41 |
+
hx-target="#comparison-container">A is better</a>
|
| 42 |
+
<a href="#" data-vote="tie" class="vote-btn vote-tie"
|
| 43 |
+
hx-post="/vote/{{ comp_idx }}"
|
| 44 |
+
hx-vals='{"winner": "tie"}'
|
| 45 |
+
hx-target="#comparison-container">Tie</a>
|
| 46 |
+
<a href="#" data-vote="B" class="vote-btn"
|
| 47 |
+
hx-post="/vote/{{ comp_idx }}"
|
| 48 |
+
hx-vals='{"winner": "B"}'
|
| 49 |
+
hx-target="#comparison-container">B is better</a>
|
| 50 |
+
</div>
|
| 51 |
+
<div class="vote-hints">
|
| 52 |
+
{% if not revealed %}
|
| 53 |
+
<a href="#" data-action="reveal"
|
| 54 |
+
hx-get="/reveal/{{ comp_idx }}"
|
| 55 |
+
hx-target="#comparison-container">show judge verdict</a>
|
| 56 |
+
<span class="separator">·</span>
|
| 57 |
+
{% endif %}
|
| 58 |
+
<span class="keys">keys: <kbd>a</kbd> <kbd>t</kbd> <kbd>b</kbd> vote · <kbd>←</kbd> <kbd>→</kbd> navigate{% if not revealed %} · <kbd>r</kbd> reveal{% endif %}</span>
|
| 59 |
+
</div>
|
| 60 |
+
{% endif %}
|
| 61 |
+
|
| 62 |
+
{% if revealed %}
|
| 63 |
+
<div class="verdict">
|
| 64 |
+
{% if voted %}
|
| 65 |
+
Judge: {{ judge_verdict }}
|
| 66 |
+
· You: {{ human_vote }}
|
| 67 |
+
· <span class="agreement {{ agreement_class }}">{{ agreement_word }}</span>
|
| 68 |
+
{% else %}
|
| 69 |
+
Judge: {{ judge_verdict }}
|
| 70 |
+
{% endif %}
|
| 71 |
+
{% if reason %}
|
| 72 |
+
<span class="reason">"{{ reason }}"</span>
|
| 73 |
+
{% endif %}
|
| 74 |
+
</div>
|
| 75 |
+
{% if just_voted and next_url %}
|
| 76 |
+
<div hx-get="{{ next_url }}" hx-trigger="load delay:1.2s" hx-target="#comparison-container"></div>
|
| 77 |
+
{% endif %}
|
| 78 |
+
{% endif %}
|
| 79 |
+
|
| 80 |
+
{% if has_image %}
|
| 81 |
+
<div class="doc-image">
|
| 82 |
+
<img src="/image/{{ sample_idx }}" alt="Document image" loading="lazy">
|
| 83 |
+
</div>
|
| 84 |
+
{% endif %}
|
| 85 |
+
|
| 86 |
+
{% else %}
|
| 87 |
+
<div class="empty">No comparisons match the current filters.</div>
|
| 88 |
+
{% endif %}
|
src/ocr_bench/templates/comparisons.html
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}Comparisons — OCR Bench{% endblock %}
|
| 3 |
+
{% block content %}
|
| 4 |
+
<div class="filters">
|
| 5 |
+
<label>Winner
|
| 6 |
+
<select name="winner"
|
| 7 |
+
hx-get="/comparisons/filter"
|
| 8 |
+
hx-target="#comparison-container"
|
| 9 |
+
hx-include="[name='model']">
|
| 10 |
+
<option value="All" {% if winner_filter == "All" %}selected{% endif %}>All</option>
|
| 11 |
+
<option value="A" {% if winner_filter == "A" %}selected{% endif %}>A</option>
|
| 12 |
+
<option value="B" {% if winner_filter == "B" %}selected{% endif %}>B</option>
|
| 13 |
+
<option value="tie" {% if winner_filter == "tie" %}selected{% endif %}>tie</option>
|
| 14 |
+
</select>
|
| 15 |
+
</label>
|
| 16 |
+
<label>Model
|
| 17 |
+
<select name="model"
|
| 18 |
+
hx-get="/comparisons/filter"
|
| 19 |
+
hx-target="#comparison-container"
|
| 20 |
+
hx-include="[name='winner']">
|
| 21 |
+
<option value="All" {% if model_filter == "All" %}selected{% endif %}>All</option>
|
| 22 |
+
{% for m in models %}
|
| 23 |
+
<option value="{{ m }}" {% if model_filter == m %}selected{% endif %}>{{ m }}</option>
|
| 24 |
+
{% endfor %}
|
| 25 |
+
</select>
|
| 26 |
+
</label>
|
| 27 |
+
</div>
|
| 28 |
+
|
| 29 |
+
{% if pair_summary %}
|
| 30 |
+
<div class="pair-summary">{{ pair_summary | safe }}</div>
|
| 31 |
+
{% endif %}
|
| 32 |
+
|
| 33 |
+
<div id="comparison-container">
|
| 34 |
+
{% include "comparison_card.html" %}
|
| 35 |
+
</div>
|
| 36 |
+
|
| 37 |
+
<div id="stats-panel" hx-get="/stats" hx-trigger="vote-recorded from:body" hx-swap="innerHTML">
|
| 38 |
+
{% include "stats_panel.html" %}
|
| 39 |
+
</div>
|
| 40 |
+
{% endblock %}
|
src/ocr_bench/templates/leaderboard.html
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}Leaderboard — OCR Bench{% endblock %}
|
| 3 |
+
{% block content %}
|
| 4 |
+
<h2 style="font-size: 1.1rem; font-weight: 600; margin-bottom: 0.25rem;">Leaderboard</h2>
|
| 5 |
+
<p style="font-size: 0.8rem; color: #888; margin-top: 0;">{{ repo_id }}</p>
|
| 6 |
+
|
| 7 |
+
<table>
|
| 8 |
+
<thead>
|
| 9 |
+
<tr>
|
| 10 |
+
<th>#</th>
|
| 11 |
+
<th>Model</th>
|
| 12 |
+
<th class="num">Judge ELO</th>
|
| 13 |
+
{% if has_ci %}<th class="num">95% CI</th>{% endif %}
|
| 14 |
+
<th class="num">Wins</th>
|
| 15 |
+
<th class="num">Losses</th>
|
| 16 |
+
<th class="num">Ties</th>
|
| 17 |
+
<th class="num">Win%</th>
|
| 18 |
+
{% if has_human_elo %}
|
| 19 |
+
<th class="num">Human ELO</th>
|
| 20 |
+
<th class="num">H-Win%</th>
|
| 21 |
+
{% endif %}
|
| 22 |
+
</tr>
|
| 23 |
+
</thead>
|
| 24 |
+
<tbody>
|
| 25 |
+
{% for row in rows %}
|
| 26 |
+
<tr>
|
| 27 |
+
<td>{{ loop.index }}</td>
|
| 28 |
+
<td class="model">{{ row.model_short }}</td>
|
| 29 |
+
<td class="num">{{ row.elo }}</td>
|
| 30 |
+
{% if has_ci %}<td class="num">{{ row.elo_low }}–{{ row.elo_high }}</td>{% endif %}
|
| 31 |
+
<td class="num">{{ row.wins }}</td>
|
| 32 |
+
<td class="num">{{ row.losses }}</td>
|
| 33 |
+
<td class="num">{{ row.ties }}</td>
|
| 34 |
+
<td class="num">{{ row.win_pct }}%</td>
|
| 35 |
+
{% if has_human_elo %}
|
| 36 |
+
<td class="num">{{ row.human_elo if row.human_elo is not none else "—" }}</td>
|
| 37 |
+
<td class="num">{{ row.human_win_pct if row.human_win_pct is not none else "—" }}</td>
|
| 38 |
+
{% endif %}
|
| 39 |
+
</tr>
|
| 40 |
+
{% endfor %}
|
| 41 |
+
</tbody>
|
| 42 |
+
</table>
|
| 43 |
+
{% endblock %}
|
src/ocr_bench/templates/stats_panel.html
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{# Compact human-validation summary; re-fetched via HTMX after each vote. #}
{% if vote_count > 0 %}
<span>{{ vote_count }} vote{{ "s" if vote_count != 1 else "" }}</span>
·
<span>{{ agreement_pct }}% agree</span>
{# The 15-vote floor matches MIN_ANNOTATIONS_FOR_CONFIDENCE in validate.py;
   the 25% hard-disagree cutoff is set here — TODO keep the two in sync. #}
{% if hard_disagree_rate > 25 %}
· <span class="warning">judge may be miscalibrated</span>
{% elif vote_count >= 15 %}
· <span class="calibrated">judge well-calibrated</span>
{% endif %}
{% endif %}
|
src/ocr_bench/validate.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Blind human A/B validation for OCR judge quality."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
from collections import defaultdict
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import structlog
|
| 13 |
+
|
| 14 |
+
logger = structlog.get_logger()
|
| 15 |
+
|
| 16 |
+
# Confidence thresholds
|
| 17 |
+
MIN_ANNOTATIONS_FOR_CONFIDENCE = 15
|
| 18 |
+
HIGH_AGREEMENT_THRESHOLD = 0.75
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class AgreementStats:
    """Tallies of how human annotations line up with VLM judge verdicts.

    ``agree`` counts identical verdicts, ``soft_disagree`` counts cases where
    exactly one side called a tie, and ``hard_disagree`` counts cases where
    both picked winners but opposite ones.
    """

    agree: int = 0
    soft_disagree: int = 0  # one picks tie, other picks winner
    hard_disagree: int = 0  # both pick winners but opposite
    total: int = 0

    @property
    def agreement_rate(self) -> float:
        """Agreement rate treating soft disagreements as partial agreement."""
        if not self.total:
            return 0.0
        return (self.agree + self.soft_disagree) / self.total

    @property
    def hard_disagree_rate(self) -> float:
        """Fraction of annotations where human and judge picked opposite winners."""
        if not self.total:
            return 0.0
        return self.hard_disagree / self.total
| 39 |
+
|
| 40 |
+
@dataclass
class ValidationComparison:
    """A single comparison for human validation.

    Built from enriched comparison data published by the judge. The human
    annotator sees ``display_text_a``/``display_text_b``, which may be
    swapped relative to canonical A/B order to control position bias.
    """

    comparison_id: int  # sequential ID after ordering; used to track votes
    sample_idx: int  # index of the source sample being compared
    model_a: str  # canonical model A identifier
    model_b: str  # canonical model B identifier
    winner: str  # judge's verdict in canonical order (hidden during annotation)
    reason: str  # judge's free-text rationale for the verdict
    agreement: str  # jury agreement (e.g. "2/2"; "1/2" means a split vote)
    text_a: str  # OCR text from model A (canonical order)
    text_b: str  # OCR text from model B (canonical order)
    col_a: str  # source column for model A's text (may be empty)
    col_b: str  # source column for model B's text (may be empty)
    swapped: bool  # position-bias randomization for human display
    display_text_a: str = ""  # text shown to human in slot A (may be swapped)
    display_text_b: str = ""  # text shown to human in slot B (may be swapped)
| 62 |
+
|
| 63 |
+
@dataclass
class ValidationSession:
    """Holds state for a validation session."""

    comparisons: list[ValidationComparison]  # ordered comparisons to annotate
    model_names: list[str]  # models under comparison
    metadata: dict[str, Any] = field(default_factory=dict)  # session-level info
    annotations: list[dict[str, Any]] = field(default_factory=list)  # human votes so far
    completed_ids: set[int] = field(default_factory=set)  # comparison_ids already annotated
|
| 73 |
+
|
| 74 |
+
def _is_split_jury(agreement: str) -> bool:
|
| 75 |
+
"""Check if a jury vote was split (e.g. '1/2' not '2/2')."""
|
| 76 |
+
parts = agreement.split("/")
|
| 77 |
+
return len(parts) == 2 and parts[0] != parts[1]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _interleave_by_sample(
|
| 81 |
+
comparisons: list[ValidationComparison],
|
| 82 |
+
) -> list[ValidationComparison]:
|
| 83 |
+
"""Interleave comparisons so you see different samples before repeating."""
|
| 84 |
+
by_sample: dict[int, list[ValidationComparison]] = defaultdict(list)
|
| 85 |
+
for comp in comparisons:
|
| 86 |
+
by_sample[comp.sample_idx].append(comp)
|
| 87 |
+
|
| 88 |
+
result: list[ValidationComparison] = []
|
| 89 |
+
queues = list(by_sample.values())
|
| 90 |
+
while queues:
|
| 91 |
+
next_round = []
|
| 92 |
+
for q in queues:
|
| 93 |
+
result.append(q.pop(0))
|
| 94 |
+
if q:
|
| 95 |
+
next_round.append(q)
|
| 96 |
+
queues = next_round
|
| 97 |
+
return result
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def build_validation_comparisons(
    comparison_rows: list[dict[str, Any]],
    *,
    n: int | None = None,
    prioritize_splits: bool = True,
    seed: int = 42,
) -> list[ValidationComparison]:
    """Build validation comparisons from published judge results.

    Args:
        comparison_rows: Rows from the comparisons config of a results dataset.
        n: Max number of comparisons to include (None = all).
        prioritize_splits: Show split-jury cases first (most informative).
        seed: Random seed for position-bias randomization.

    Returns:
        Ordered comparisons with fresh sequential ``comparison_id`` values.
    """
    from dataclasses import replace

    rng = random.Random(seed)

    comps: list[ValidationComparison] = []
    for i, row in enumerate(comparison_rows):
        # Randomly flip display order so annotators can't learn a position bias.
        swapped = rng.random() < 0.5
        text_a = row.get("text_a", "")
        text_b = row.get("text_b", "")
        display_a, display_b = (text_b, text_a) if swapped else (text_a, text_b)

        comps.append(
            ValidationComparison(
                comparison_id=i,
                sample_idx=row.get("sample_idx", i),
                model_a=row.get("model_a", ""),
                model_b=row.get("model_b", ""),
                winner=row.get("winner", "tie"),
                reason=row.get("reason", ""),
                agreement=row.get("agreement", "1/1"),
                text_a=text_a,
                text_b=text_b,
                col_a=row.get("col_a", ""),
                col_b=row.get("col_b", ""),
                swapped=swapped,
                display_text_a=display_a,
                display_text_b=display_b,
            )
        )

    if prioritize_splits:
        # Split-jury verdicts are the most informative to validate, so surface
        # them first; within each group, interleave across samples.
        splits = [c for c in comps if _is_split_jury(c.agreement)]
        unanimous = [c for c in comps if not _is_split_jury(c.agreement)]
        ordered = _interleave_by_sample(splits) + _interleave_by_sample(unanimous)
    else:
        ordered = _interleave_by_sample(comps)

    if n is not None and n < len(ordered):
        ordered = ordered[:n]

    # Re-assign comparison IDs after reordering. dataclasses.replace copies
    # every field, so new fields can't silently be dropped here.
    return [replace(c, comparison_id=i) for i, c in enumerate(ordered)]
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def compute_agreement(
    annotations: list[dict[str, Any]],
    comparisons: list[ValidationComparison],
) -> AgreementStats:
    """Tally how often human annotations match the judge's verdicts."""
    lookup = {c.comparison_id: c for c in comparisons}
    stats = AgreementStats()

    for ann in annotations:
        comp = lookup.get(ann.get("comparison_id"))
        if not comp:
            continue

        # Map the human's displayed choice back to canonical A/B order.
        human = ann["winner"]
        if comp.swapped and human in ("A", "B"):
            human = "B" if human == "A" else "A"

        stats.total += 1
        if human == comp.winner:
            stats.agree += 1
        elif "tie" in (human, comp.winner):
            stats.soft_disagree += 1
        else:
            stats.hard_disagree += 1

    return stats
+
|
| 212 |
+
|
| 213 |
+
def compute_human_elo(
    annotations: list[dict[str, Any]],
    comparisons: list[ValidationComparison],
) -> Any:
    """Compute ELO leaderboard from human annotations.

    Returns a ``Leaderboard`` from ``elo.py``, or None if no annotations.
    """
    from ocr_bench.elo import ComparisonResult, compute_elo

    lookup = {c.comparison_id: c for c in comparisons}
    models: set[str] = set()
    results: list[ComparisonResult] = []

    for ann in annotations:
        comp = lookup.get(ann.get("comparison_id"))
        if not comp:
            continue

        # Undo the display swap so the vote refers to canonical model A/B.
        vote = ann["winner"]
        if comp.swapped and vote in ("A", "B"):
            vote = "B" if vote == "A" else "A"

        models.update((comp.model_a, comp.model_b))
        results.append(
            ComparisonResult(
                sample_idx=comp.sample_idx,
                model_a=comp.model_a,
                model_b=comp.model_b,
                winner=vote,
            )
        )

    if not results:
        return None

    return compute_elo(results, sorted(models))
+
|
| 256 |
+
|
| 257 |
+
def save_annotations(
    path: str,
    metadata: dict[str, Any],
    annotations: list[dict[str, Any]],
) -> None:
    """Write annotations to ``path`` as JSON via an atomic replace."""
    payload = {"metadata": metadata, "annotations": annotations}
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(payload, f, indent=2)
    # os.replace is atomic, so readers never observe a partially written file.
    os.replace(tmp_path, path)
+
|
| 269 |
+
|
| 270 |
+
def load_annotations(path: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Read annotations JSON from ``path``; returns (metadata, annotations).

    A missing file yields empty results rather than raising.
    """
    if not os.path.exists(path):
        return {}, []
    with open(path) as f:
        payload = json.load(f)
    return payload.get("metadata", {}), payload.get("annotations", [])
+
|
| 278 |
+
|
| 279 |
+
def _agreement_banner(stats: AgreementStats) -> str:
    """Render an agreement summary line, with a confidence note when earned."""
    if not stats.total:
        return ""

    pieces = [f"Agree: {stats.agree}"]
    if stats.soft_disagree:
        pieces.append(f"Soft: {stats.soft_disagree}")
    if stats.hard_disagree:
        pieces.append(f"**Hard: {stats.hard_disagree}**")
    pieces.append(f"(of {stats.total})")

    note = ""
    # Only draw a conclusion once enough annotations have been collected.
    if stats.total >= MIN_ANNOTATIONS_FOR_CONFIDENCE:
        rate = stats.hard_disagree_rate
        if rate == 0:
            note = (
                f" -- No hard disagreements after {stats.total} annotations. "
                "Judge rankings reliable for this domain."
            )
        elif rate <= 0.1:
            note = (
                f" -- Very few hard disagreements ({stats.hard_disagree}). "
                "Rankings likely trustworthy."
            )
        elif rate > 0.25:
            note = (
                f" -- Many hard disagreements ({stats.hard_disagree}/{stats.total}). "
                "Judge may not be calibrated for this content."
            )

    return f"Judge: {' | '.join(pieces)}{note}"
| 310 |
+
|
| 311 |
+
|
src/ocr_bench/viewer.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Results viewer — data loading and helpers for OCR bench results."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import TYPE_CHECKING, Any
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
logger = structlog.get_logger()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def load_results(repo_id: str) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Load leaderboard and comparisons from a Hub results dataset.

    Tries the default config first (new repos), then falls back to the
    named ``leaderboard`` config (old repos).

    Returns:
        (leaderboard_rows, comparison_rows)
    """
    try:
        board = load_dataset(repo_id, split="train")
    except Exception:
        # Older repos published the leaderboard under a named config.
        board = load_dataset(repo_id, name="leaderboard", split="train")
    leaderboard_rows = [dict(row) for row in board]

    try:
        comps = load_dataset(repo_id, name="comparisons", split="train")
    except Exception:
        logger.warning("no_comparisons_config", repo=repo_id)
        return leaderboard_rows, []

    return leaderboard_rows, [dict(row) for row in comps]
+
|
| 41 |
+
|
| 42 |
+
def _load_source_metadata(repo_id: str) -> dict[str, Any]:
    """Load metadata config from results repo to find the source dataset."""
    try:
        meta = load_dataset(repo_id, name="metadata", split="train")
        return dict(meta[0]) if len(meta) > 0 else {}
    except Exception as exc:
        logger.warning("could_not_load_metadata", repo=repo_id, error=str(exc))
        return {}
+
|
| 52 |
+
|
| 53 |
+
class ImageLoader:
    """Lazy image loader — fetches images from source dataset by sample_idx."""

    def __init__(self, source_dataset: str, from_prs: bool = False):
        self._source = source_dataset
        self._from_prs = from_prs
        self._cache: dict[int, Any] = {}  # sample_idx -> image object
        self._image_col: str | None = None  # detected image column name
        self._pr_config: str | None = None  # first PR config name (when from_prs)
        self._pr_revision: str | None = None  # revision ref for that PR config
        self._available = True  # set False once loading is known to fail
        self._init_done = False

    def _init_source(self) -> None:
        """Lazy init: discover image column and PR revision on first call."""
        if self._init_done:
            return
        self._init_done = True

        try:
            if self._from_prs:
                from ocr_bench.dataset import discover_pr_configs

                _, revisions = discover_pr_configs(self._source)
                if revisions:
                    # Cache the first PR config + revision here so get() does
                    # not have to repeat the discovery call on every cache miss.
                    self._pr_config = next(iter(revisions))
                    self._pr_revision = revisions[self._pr_config]

            # Probe for the image column by loading a single row.
            kwargs: dict[str, Any] = {"path": self._source, "split": "train[:1]"}
            if self._pr_revision:
                kwargs["name"] = self._pr_config
                kwargs["revision"] = self._pr_revision
            probe = load_dataset(**kwargs)
            for col in probe.column_names:
                if col == "image" or "image" in col.lower():
                    self._image_col = col
                    break
            if not self._image_col:
                logger.info("no_image_column_in_source", source=self._source)
                self._available = False
        except Exception as exc:
            logger.warning("image_loader_init_failed", source=self._source, error=str(exc))
            self._available = False

    def get(self, sample_idx: int) -> Image.Image | None:
        """Fetch image for a sample index. Returns None on failure."""
        self._init_source()
        if not self._available or self._image_col is None:
            return None
        if sample_idx in self._cache:
            return self._cache[sample_idx]
        try:
            kwargs: dict[str, Any] = {
                "path": self._source,
                "split": f"train[{sample_idx}:{sample_idx + 1}]",
            }
            if self._pr_revision:
                # Reuse the config/revision discovered at init time instead of
                # calling discover_pr_configs again (a Hub round-trip per miss).
                kwargs["name"] = self._pr_config
                kwargs["revision"] = self._pr_revision
            row = load_dataset(**kwargs)
            img = row[0][self._image_col]
            self._cache[sample_idx] = img
            return img
        except Exception as exc:
            logger.debug("image_load_failed", sample_idx=sample_idx, error=str(exc))
            return None
+
|
| 128 |
+
|
| 129 |
+
def _filter_comparisons(
|
| 130 |
+
comparisons: list[dict[str, Any]],
|
| 131 |
+
winner_filter: str,
|
| 132 |
+
model_filter: str,
|
| 133 |
+
) -> list[dict[str, Any]]:
|
| 134 |
+
"""Filter comparison rows by winner and model."""
|
| 135 |
+
filtered = comparisons
|
| 136 |
+
if winner_filter and winner_filter != "All":
|
| 137 |
+
filtered = [c for c in filtered if c.get("winner") == winner_filter]
|
| 138 |
+
if model_filter and model_filter != "All":
|
| 139 |
+
filtered = [
|
| 140 |
+
c
|
| 141 |
+
for c in filtered
|
| 142 |
+
if c.get("model_a") == model_filter or c.get("model_b") == model_filter
|
| 143 |
+
]
|
| 144 |
+
return filtered
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _winner_badge(winner: str) -> str:
|
| 148 |
+
"""Return a badge string for the winner."""
|
| 149 |
+
if winner == "A":
|
| 150 |
+
return "Winner: A"
|
| 151 |
+
elif winner == "B":
|
| 152 |
+
return "Winner: B"
|
| 153 |
+
else:
|
| 154 |
+
return "Tie"
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _model_label(model: str, col: str) -> str:
|
| 158 |
+
"""Format model name with optional column name. Avoids empty parens."""
|
| 159 |
+
if col:
|
| 160 |
+
return f"{model} ({col})"
|
| 161 |
+
return model
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def _build_pair_summary(comparisons: list[dict[str, Any]]) -> str:
|
| 165 |
+
"""Build a win/loss summary string for each model pair."""
|
| 166 |
+
from collections import Counter
|
| 167 |
+
|
| 168 |
+
pair_counts: dict[tuple[str, str], Counter[str]] = {}
|
| 169 |
+
for c in comparisons:
|
| 170 |
+
ma = c.get("model_a", "")
|
| 171 |
+
mb = c.get("model_b", "")
|
| 172 |
+
winner = c.get("winner", "tie")
|
| 173 |
+
key = (ma, mb) if ma <= mb else (mb, ma)
|
| 174 |
+
if key not in pair_counts:
|
| 175 |
+
pair_counts[key] = Counter()
|
| 176 |
+
# Track from perspective of first model in sorted pair
|
| 177 |
+
if winner == "A":
|
| 178 |
+
actual_winner = ma
|
| 179 |
+
elif winner == "B":
|
| 180 |
+
actual_winner = mb
|
| 181 |
+
else:
|
| 182 |
+
actual_winner = "tie"
|
| 183 |
+
|
| 184 |
+
if actual_winner == key[0]:
|
| 185 |
+
pair_counts[key]["W"] += 1
|
| 186 |
+
elif actual_winner == key[1]:
|
| 187 |
+
pair_counts[key]["L"] += 1
|
| 188 |
+
else:
|
| 189 |
+
pair_counts[key]["T"] += 1
|
| 190 |
+
|
| 191 |
+
if not pair_counts:
|
| 192 |
+
return ""
|
| 193 |
+
|
| 194 |
+
parts = []
|
| 195 |
+
for (ma, mb), counts in sorted(pair_counts.items()):
|
| 196 |
+
short_a = ma.split("/")[-1] if "/" in ma else ma
|
| 197 |
+
short_b = mb.split("/")[-1] if "/" in mb else mb
|
| 198 |
+
wins, losses, ties = counts["W"], counts["L"], counts["T"]
|
| 199 |
+
parts.append(f"**{short_a}** vs **{short_b}**: {wins}W {losses}L {ties}T")
|
| 200 |
+
return " | ".join(parts)
|
| 201 |
+
|
| 202 |
+
|
src/ocr_bench/web.py
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI + HTMX viewer — unified browse + validate for OCR bench results."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from datetime import UTC, datetime
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
import structlog
|
| 12 |
+
from fastapi import FastAPI, Form, Request
|
| 13 |
+
from fastapi.responses import HTMLResponse, RedirectResponse, StreamingResponse
|
| 14 |
+
from fastapi.staticfiles import StaticFiles
|
| 15 |
+
from fastapi.templating import Jinja2Templates
|
| 16 |
+
|
| 17 |
+
from ocr_bench.validate import (
|
| 18 |
+
ValidationComparison,
|
| 19 |
+
build_validation_comparisons,
|
| 20 |
+
compute_agreement,
|
| 21 |
+
compute_human_elo,
|
| 22 |
+
load_annotations,
|
| 23 |
+
save_annotations,
|
| 24 |
+
)
|
| 25 |
+
from ocr_bench.viewer import (
|
| 26 |
+
ImageLoader,
|
| 27 |
+
_filter_comparisons,
|
| 28 |
+
_load_source_metadata,
|
| 29 |
+
load_results,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
logger = structlog.get_logger()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _short_model(model: str) -> str:
|
| 36 |
+
"""Return just the model name after the org prefix."""
|
| 37 |
+
return model.split("/")[-1] if "/" in model else model
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _build_pair_summary_html(comparisons: list[dict[str, Any]]) -> str:
    """Build a compact HTML table of head-to-head records."""
    from collections import Counter

    records: dict[tuple[str, str], Counter[str]] = {}
    for row in comparisons:
        ma = row.get("model_a", "")
        mb = row.get("model_b", "")
        verdict = row.get("winner", "tie")
        # Canonical pair key: lexicographically smaller model first.
        pair = (ma, mb) if ma <= mb else (mb, ma)
        counter = records.setdefault(pair, Counter())
        # Resolve the positional verdict ('A'/'B') to an actual model name.
        victor = {"A": ma, "B": mb}.get(verdict, "tie")
        if victor == pair[0]:
            counter["W"] += 1
        elif victor == pair[1]:
            counter["L"] += 1
        else:
            counter["T"] += 1

    if not records:
        return ""

    body_rows = []
    for (ma, mb), counter in sorted(records.items()):
        body_rows.append(
            f"<tr><td>{_short_model(ma)}</td><td>{_short_model(mb)}</td>"
            f"<td class='num'>{counter['W']}</td><td class='num'>{counter['L']}</td>"
            f"<td class='num'>{counter['T']}</td></tr>"
        )
    return (
        '<table class="pair-table"><thead><tr>'
        "<th>Model A</th><th>Model B</th>"
        '<th class="num">W</th><th class="num">L</th><th class="num">T</th>'
        "</tr></thead><tbody>" + "".join(body_rows) + "</tbody></table>"
    )
|
| 85 |
+
|
| 86 |
+
PKG_DIR = Path(__file__).parent
|
| 87 |
+
TEMPLATES_DIR = PKG_DIR / "templates"
|
| 88 |
+
STATIC_DIR = PKG_DIR / "static"
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@dataclass
class ViewerState:
    """In-memory state for the single-user viewer."""

    repo_id: str  # HF dataset repo the results were loaded from
    leaderboard_rows: list[dict[str, Any]]  # published leaderboard rows
    comparison_rows: list[dict[str, Any]]  # raw judge comparison rows
    validation_comps: list[ValidationComparison]  # ordered comps for human validation
    models: list[str]  # sorted union of model names seen in comparisons
    img_loader: ImageLoader | None  # lazy source-image fetcher (None if no source known)
    save_path: str  # where human annotations JSON is persisted
    annotations: list[dict[str, Any]] = field(default_factory=list)  # human votes so far
    completed_ids: set[int] = field(default_factory=set)  # comparison_ids already voted on
    filtered_indices: list[int] = field(default_factory=list)  # nav order into validation_comps
| 106 |
+
|
| 107 |
+
def _build_filtered_indices(
    state: ViewerState,
    winner_filter: str = "All",
    model_filter: str = "All",
) -> list[int]:
    """Map nav indices to validation_comps indices, respecting filters."""
    kept = _filter_comparisons(state.comparison_rows, winner_filter, model_filter)
    # Identify surviving rows by (sample, model pair) and keep only the
    # validation comparisons whose key made it through the filter.
    surviving = {(r["sample_idx"], r["model_a"], r["model_b"]) for r in kept}
    indices: list[int] = []
    for idx, vc in enumerate(state.validation_comps):
        if (vc.sample_idx, vc.model_a, vc.model_b) in surviving:
            indices.append(idx)
    return indices
|
| 124 |
+
|
| 125 |
+
def create_app(
|
| 126 |
+
repo_id: str,
|
| 127 |
+
*,
|
| 128 |
+
output_path: str | None = None,
|
| 129 |
+
n_validate: int | None = None,
|
| 130 |
+
) -> FastAPI:
|
| 131 |
+
"""Create the FastAPI app with all routes.
|
| 132 |
+
|
| 133 |
+
Args:
|
| 134 |
+
repo_id: HF dataset repo with published judge results.
|
| 135 |
+
output_path: Path to save human annotations JSON.
|
| 136 |
+
n_validate: Max comparisons to include for validation (None = all).
|
| 137 |
+
"""
|
| 138 |
+
app = FastAPI(title=f"OCR Bench — {repo_id}")
|
| 139 |
+
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 140 |
+
templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
|
| 141 |
+
|
| 142 |
+
# --- Load data ---
|
| 143 |
+
leaderboard_rows, comparison_rows = load_results(repo_id)
|
| 144 |
+
|
| 145 |
+
metadata = _load_source_metadata(repo_id)
|
| 146 |
+
source_dataset = metadata.get("source_dataset", "")
|
| 147 |
+
from_prs = metadata.get("from_prs", False)
|
| 148 |
+
|
| 149 |
+
img_loader: ImageLoader | None = None
|
| 150 |
+
if source_dataset:
|
| 151 |
+
img_loader = ImageLoader(source_dataset, from_prs=from_prs)
|
| 152 |
+
|
| 153 |
+
validation_comps = build_validation_comparisons(
|
| 154 |
+
comparison_rows, n=n_validate, prioritize_splits=True
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
models = sorted(
|
| 158 |
+
{c.get("model_a", "") for c in comparison_rows}
|
| 159 |
+
| {c.get("model_b", "") for c in comparison_rows}
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
slug = repo_id.replace("/", "-")
|
| 163 |
+
save_path = output_path or f"human-eval-{slug}.json"
|
| 164 |
+
|
| 165 |
+
# Resume existing annotations
|
| 166 |
+
_, existing_annotations = load_annotations(save_path)
|
| 167 |
+
completed_ids = {ann["comparison_id"] for ann in existing_annotations}
|
| 168 |
+
|
| 169 |
+
state = ViewerState(
|
| 170 |
+
repo_id=repo_id,
|
| 171 |
+
leaderboard_rows=leaderboard_rows,
|
| 172 |
+
comparison_rows=comparison_rows,
|
| 173 |
+
validation_comps=validation_comps,
|
| 174 |
+
models=models,
|
| 175 |
+
img_loader=img_loader,
|
| 176 |
+
save_path=save_path,
|
| 177 |
+
annotations=existing_annotations,
|
| 178 |
+
completed_ids=completed_ids,
|
| 179 |
+
filtered_indices=list(range(len(validation_comps))),
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# Store state on app for access in routes
|
| 183 |
+
app.state.viewer = state
|
| 184 |
+
|
| 185 |
+
ann_metadata = {
|
| 186 |
+
"results_repo": repo_id,
|
| 187 |
+
"n_comparisons": len(validation_comps),
|
| 188 |
+
"models": models,
|
| 189 |
+
"started_at": datetime.now(UTC).isoformat(),
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
# --- Helpers ---
|
| 193 |
+
|
| 194 |
+
def _get_comp_context(
    nav_idx: int,
    *,
    revealed: bool = False,
    voted: bool = False,
    human_vote: str = "",
    winner_filter: str = "All",
    model_filter: str = "All",
) -> dict[str, Any]:
    """Build template context for a comparison card.

    ``nav_idx`` is a position within ``state.filtered_indices`` (the
    currently filtered view), not a raw comparison index.  Out-of-range
    indices yield a ``comp: None`` context so the template can render an
    empty card instead of erroring.

    Because each comparison may have been shown to the judge with A/B
    swapped (``comp.swapped``), both the model names and the judge verdict
    are re-mapped here into *display* order, while stored votes remain in
    canonical order.
    """
    indices = state.filtered_indices
    if nav_idx < 0 or nav_idx >= len(indices):
        return {"comp": None, "nav_idx": nav_idx, "nav_total": len(indices)}

    comp_idx = indices[nav_idx]
    comp = state.validation_comps[comp_idx]

    # Check if already voted — if so, force the revealed/voted display and
    # recover the stored human vote instead of trusting the caller's args.
    already_voted = comp.comparison_id in state.completed_ids
    if already_voted:
        voted = True
        revealed = True
        # Find the annotation to get human vote
        for ann in state.annotations:
            if ann["comparison_id"] == comp.comparison_id:
                human_vote = ann["winner"]
                break

    # Model names — short form for clean headers; swap so the headers match
    # the order the texts are actually displayed in.
    model_a_name = _short_model(comp.model_a)
    model_b_name = _short_model(comp.model_b)
    if comp.swapped:
        model_a_name, model_b_name = model_b_name, model_a_name

    # Judge verdict (canonical → display): mirror A/B when swapped.
    judge_winner = comp.winner
    if comp.swapped:
        if judge_winner == "A":
            judge_verdict = "B"
        elif judge_winner == "B":
            judge_verdict = "A"
        else:
            judge_verdict = "tie"
    else:
        judge_verdict = judge_winner

    # Agreement between the human vote and the judge verdict.
    agreement_word = ""
    agreement_class = ""
    if voted and human_vote:
        # Unswap human vote for comparison (human voted in display order;
        # comp.winner is in canonical order).
        unswapped_human = human_vote
        if comp.swapped:
            if human_vote == "A":
                unswapped_human = "B"
            elif human_vote == "B":
                unswapped_human = "A"

        # Exact match → agreed; one side said tie → soft disagree;
        # opposite winners → hard disagree.
        if unswapped_human == comp.winner:
            agreement_word = "agreed"
            agreement_class = "agreed"
        elif unswapped_human == "tie" or comp.winner == "tie":
            agreement_word = "soft disagree"
            agreement_class = "soft-disagree"
        else:
            agreement_word = "hard disagree"
            agreement_class = "hard-disagree"

    has_image = img_loader is not None

    return {
        "comp": comp,
        "comp_idx": comp_idx,
        "nav_idx": nav_idx,
        "nav_total": len(indices),
        "revealed": revealed,
        "voted": voted,
        "display_text_a": comp.display_text_a,
        "display_text_b": comp.display_text_b,
        "model_a_name": model_a_name,
        "model_b_name": model_b_name,
        "judge_verdict": judge_verdict,
        "human_vote": human_vote,
        "agreement_word": agreement_word,
        "agreement_class": agreement_class,
        "reason": comp.reason,
        "sample_idx": comp.sample_idx,
        "has_image": has_image,
        "winner_filter": winner_filter,
        "model_filter": model_filter,
    }
|
| 285 |
+
|
| 286 |
+
def _stats_context() -> dict[str, Any]:
    """Assemble the context dict consumed by the stats-panel template."""
    agg = compute_agreement(state.annotations, state.validation_comps)

    # With zero votes the rates are undefined — report 0 instead.
    def _as_pct(rate: float) -> int:
        return round(rate * 100) if agg.total else 0

    return {
        "vote_count": agg.total,
        "agreement_pct": _as_pct(agg.agreement_rate),
        "hard_disagree_rate": _as_pct(agg.hard_disagree_rate),
    }
|
| 294 |
+
|
| 295 |
+
def _nav_idx_for_comp_idx(comp_idx: int) -> int:
    """Map a raw comparison index to its position in the filtered list.

    Falls back to position 0 when the comparison is not in the current
    filtered view.
    """
    indices = state.filtered_indices
    if comp_idx in indices:
        return indices.index(comp_idx)
    return 0
|
| 301 |
+
|
| 302 |
+
# --- Routes ---
|
| 303 |
+
|
| 304 |
+
@app.get("/", response_class=RedirectResponse)
async def index():
    """Root entry point — redirect straight to the comparisons view."""
    return RedirectResponse(url="/comparisons", status_code=302)
|
| 307 |
+
|
| 308 |
+
@app.get("/leaderboard", response_class=HTMLResponse)
async def leaderboard(request: Request):
    """Render the leaderboard page, merging judge ELO with human ELO.

    Judge-derived rows come from the published results; a human ELO column
    is added when enough annotations exist to compute one.
    """
    # Build human ELO if we have annotations
    human_board = compute_human_elo(state.annotations, state.validation_comps)

    rows = []
    # Sort by judge ELO, best first.
    for row in sorted(state.leaderboard_rows, key=lambda r: r.get("elo", 0), reverse=True):
        model = row.get("model", "")
        # Drop the org prefix ("org/model" -> "model") for display.
        short = model.split("/")[-1] if "/" in model else model
        human_elo = None
        human_win_pct = None
        if human_board and model in human_board.elo:
            human_elo = round(human_board.elo[model])
            wp = human_board.win_pct(model)
            human_win_pct = f"{wp:.0f}" if wp is not None else None

        rows.append({
            "model": model,
            "model_short": short,
            "elo": round(row.get("elo", 0)),
            "elo_low": row.get("elo_low"),
            "elo_high": row.get("elo_high"),
            "wins": row.get("wins", 0),
            "losses": row.get("losses", 0),
            "ties": row.get("ties", 0),
            "win_pct": row.get("win_pct", 0),
            "human_elo": human_elo,
            "human_win_pct": human_win_pct,
        })

    # Confidence-interval columns are shown only when any row provides them.
    has_ci = any(r.get("elo_low") is not None for r in rows)
    return templates.TemplateResponse(request, "leaderboard.html", {
        "active_tab": "leaderboard",
        "repo_id": state.repo_id,
        "rows": rows,
        "has_ci": has_ci,
        "has_human_elo": human_board is not None,
    })
|
| 346 |
+
|
| 347 |
+
@app.get("/comparisons", response_class=HTMLResponse)
async def comparisons_page(request: Request):
    """Render the full comparisons page, reset to an unfiltered view."""
    # A fresh page load always starts from the complete, unfiltered list.
    state.filtered_indices = _build_filtered_indices(state)

    context: dict[str, Any] = {
        "active_tab": "comparisons",
        "models": state.models,
        "pair_summary": _build_pair_summary_html(state.comparison_rows),
        "winner_filter": "All",
        "model_filter": "All",
    }
    # First card in the filtered list, plus the live agreement stats.
    context.update(_get_comp_context(0))
    context.update(_stats_context())
    return templates.TemplateResponse(request, "comparisons.html", context)
|
| 362 |
+
|
| 363 |
+
@app.get("/comparisons/filter", response_class=HTMLResponse)
async def comparisons_filter(
    request: Request,
    winner: str = "All",
    model: str = "All",
):
    """Apply winner/model filters and render the first matching card."""
    # Rebuild the filtered view, then show its first entry.
    state.filtered_indices = _build_filtered_indices(state, winner, model)
    card_ctx = _get_comp_context(0, winner_filter=winner, model_filter=model)
    return templates.TemplateResponse(request, "comparison_card.html", card_ctx)
|
| 372 |
+
|
| 373 |
+
@app.get("/comparisons/{nav_idx}", response_class=HTMLResponse)
async def comparison_at(
    request: Request,
    nav_idx: int,
    winner: str = "All",
    model: str = "All",
):
    """Render the comparison card at a position in the filtered list."""
    # Clamp into [0, len-1] so stale or hand-edited links never error.
    upper = len(state.filtered_indices) - 1
    if nav_idx > upper:
        nav_idx = upper
    if nav_idx < 0:
        nav_idx = 0
    card_ctx = _get_comp_context(nav_idx, winner_filter=winner, model_filter=model)
    return templates.TemplateResponse(request, "comparison_card.html", card_ctx)
|
| 384 |
+
|
| 385 |
+
@app.post("/vote/{comp_idx}", response_class=HTMLResponse)
async def vote(request: Request, comp_idx: int, winner: str = Form(...)):
    """Record a human vote for a comparison and return the revealed card.

    ``winner`` arrives in *display* order ("A"/"B"/"tie"); it is unswapped
    to canonical order before being resolved to a model name and persisted.
    Re-posting a vote for an already-annotated comparison is a no-op for
    storage but still returns the revealed card.
    """
    if comp_idx < 0 or comp_idx >= len(state.validation_comps):
        return HTMLResponse("Invalid comparison", status_code=404)

    comp = state.validation_comps[comp_idx]

    # Idempotent: if already voted, just return revealed card
    if comp.comparison_id not in state.completed_ids:
        # Unswap for storage
        winner_unswapped = winner
        if comp.swapped:
            if winner == "A":
                winner_unswapped = "B"
            elif winner == "B":
                winner_unswapped = "A"

        # Resolve the canonical vote to the actual model id (or "tie").
        if winner_unswapped == "A":
            winner_model = comp.model_a
        elif winner_unswapped == "B":
            winner_model = comp.model_b
        else:
            winner_model = "tie"

        # NOTE: "winner" is stored in display order; "winner_model" carries
        # the canonical resolution alongside the "swapped" flag.
        ann = {
            "comparison_id": comp.comparison_id,
            "sample_idx": comp.sample_idx,
            "model_a": comp.model_a,
            "model_b": comp.model_b,
            "swapped": comp.swapped,
            "winner": winner,
            "winner_model": winner_model,
            "timestamp": datetime.now(UTC).isoformat(),
        }

        # Persist immediately so a crash never loses a recorded vote.
        state.annotations.append(ann)
        state.completed_ids.add(comp.comparison_id)
        save_annotations(state.save_path, ann_metadata, state.annotations)

    nav_idx = _nav_idx_for_comp_idx(comp_idx)
    # Read current filters from request query params (forwarded by htmx)
    winner_filter = request.query_params.get("winner", "All")
    model_filter = request.query_params.get("model", "All")

    ctx = _get_comp_context(
        nav_idx,
        revealed=True,
        voted=True,
        human_vote=winner,
        winner_filter=winner_filter,
        model_filter=model_filter,
    )
    # Auto-advance: tell template this was a fresh vote
    next_nav = nav_idx + 1 if nav_idx + 1 < len(state.filtered_indices) else None
    ctx["just_voted"] = True
    ctx["next_nav_idx"] = next_nav
    # Preserve active filters in the auto-advance URL ("?" vs "&" depends on
    # whether the winner filter already opened the query string).
    ctx["next_url"] = (
        f"/comparisons/{next_nav}"
        + (f"?winner={winner_filter}" if winner_filter != "All" else "")
        + (f"{'&' if winner_filter != 'All' else '?'}model={model_filter}" if model_filter != "All" else "")
        if next_nav is not None
        else None
    )
    response = templates.TemplateResponse(request, "comparison_card.html", ctx)
    # Let the page refresh the stats panel via htmx event listener.
    response.headers["HX-Trigger"] = "vote-recorded"
    return response
|
| 451 |
+
|
| 452 |
+
@app.get("/reveal/{comp_idx}", response_class=HTMLResponse)
async def reveal(request: Request, comp_idx: int):
    """Reveal model names and judge verdict without recording a vote."""
    if not (0 <= comp_idx < len(state.validation_comps)):
        return HTMLResponse("Invalid comparison", status_code=404)

    # Filters are forwarded by htmx as query parameters.
    params = request.query_params
    card_ctx = _get_comp_context(
        _nav_idx_for_comp_idx(comp_idx),
        revealed=True,
        voted=False,
        winner_filter=params.get("winner", "All"),
        model_filter=params.get("model", "All"),
    )
    return templates.TemplateResponse(request, "comparison_card.html", card_ctx)
|
| 469 |
+
|
| 470 |
+
@app.get("/stats", response_class=HTMLResponse)
async def stats(request: Request):
    """Render the agreement-stats panel (htmx partial)."""
    return templates.TemplateResponse(request, "stats_panel.html", _stats_context())
|
| 474 |
+
|
| 475 |
+
@app.get("/image/{sample_idx}")
async def image(sample_idx: int):
    """Stream the source page image for a sample as PNG."""
    if img_loader is None:
        return HTMLResponse("No images available", status_code=404)

    pil_img = img_loader.get(sample_idx)
    if pil_img is None:
        return HTMLResponse("Image not found", status_code=404)

    # Re-encode to PNG in an in-memory buffer and rewind before streaming.
    buffer = io.BytesIO()
    pil_img.save(buffer, format="PNG")
    buffer.seek(0)
    return StreamingResponse(buffer, media_type="image/png")
|
| 486 |
+
|
| 487 |
+
return app
|