"""Hub publishing — push comparisons, leaderboard, and metadata configs to HF Hub."""

from __future__ import annotations

import datetime
import json
from dataclasses import dataclass

import structlog
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi

from ocr_bench.elo import ComparisonResult, Leaderboard

logger = structlog.get_logger()


@dataclass
class EvalMetadata:
    """Metadata for an evaluation run, stored alongside results on Hub."""

    source_dataset: str
    judge_models: list[str]
    seed: int
    max_samples: int
    total_comparisons: int
    valid_comparisons: int
    from_prs: bool = False
    timestamp: str = ""

    def __post_init__(self):
        if not self.timestamp:
            self.timestamp = datetime.datetime.now(datetime.UTC).isoformat()


def load_existing_comparisons(repo_id: str) -> list[ComparisonResult]:
    """Load existing comparisons from a Hub results repo.

    The stored winner is already unswapped (canonical), so ``swapped=False``.
    Returns an empty list if the repo or config doesn't exist.
    """
    try:
        ds = load_dataset(repo_id, name="comparisons", split="train")
    except Exception as exc:
        logger.info("no_existing_comparisons", repo=repo_id, reason=str(exc))
        return []

    results = []
    for row in ds:
        results.append(
            ComparisonResult(
                sample_idx=row["sample_idx"],
                model_a=row["model_a"],
                model_b=row["model_b"],
                winner=row["winner"],
                reason=row.get("reason", ""),
                agreement=row.get("agreement", "1/1"),
                swapped=False,
                text_a=row.get("text_a", ""),
                text_b=row.get("text_b", ""),
                col_a=row.get("col_a", ""),
                col_b=row.get("col_b", ""),
            )
        )
    logger.info("loaded_existing_comparisons", repo=repo_id, n=len(results))
    return results


def load_existing_metadata(repo_id: str) -> list[dict]:
    """Load existing metadata rows from a Hub results repo.

    Returns an empty list if the repo or config doesn't exist.
    """
    try:
        ds = load_dataset(repo_id, name="metadata", split="train")
        return [dict(row) for row in ds]
    except Exception as exc:
        logger.info("no_existing_metadata", repo=repo_id, reason=str(exc))
        return []


def build_leaderboard_rows(board: Leaderboard) -> list[dict]:
    """Convert a Leaderboard into rows suitable for a Hub dataset."""
    rows = []
    for model, elo in board.ranked:
        total = board.wins[model] + board.losses[model] + board.ties[model]
        row = {
            "model": model,
            "elo": round(elo),
            "wins": board.wins[model],
            "losses": board.losses[model],
            "ties": board.ties[model],
            "win_pct": round(board.wins[model] / total * 100) if total > 0 else 0,
        }
        if board.elo_ci and model in board.elo_ci:
            lo, hi = board.elo_ci[model]
            row["elo_low"] = round(lo)
            row["elo_high"] = round(hi)
        rows.append(row)
    return rows


def build_metadata_row(metadata: EvalMetadata) -> dict:
    """Convert EvalMetadata into a single row for a Hub dataset."""
    return {
        "source_dataset": metadata.source_dataset,
        "judge_models": json.dumps(metadata.judge_models),
        "seed": metadata.seed,
        "max_samples": metadata.max_samples,
        "total_comparisons": metadata.total_comparisons,
        "valid_comparisons": metadata.valid_comparisons,
        "from_prs": metadata.from_prs,
        "timestamp": metadata.timestamp,
    }


def publish_results(
    repo_id: str,
    board: Leaderboard,
    metadata: EvalMetadata,
    existing_metadata: list[dict] | None = None,
) -> None:
    """Push evaluation results to Hub as a dataset with multiple configs.

    Configs:

    - (default): Leaderboard table — ``load_dataset("repo")`` returns this.
    - ``leaderboard``: Same table, named config (backward compat for viewer).
    - ``comparisons``: Full comparison log from the board (caller merges
      existing + new before ``compute_elo``, so ``board.comparison_log`` is
      already the complete set).
    - ``metadata``: Append-only run log. New row is appended to
      ``existing_metadata``.
    """
    # Comparisons
    if board.comparison_log:
        comp_ds = Dataset.from_list(board.comparison_log)
        comp_ds.push_to_hub(repo_id, config_name="comparisons")
        logger.info("published_comparisons", repo=repo_id, n=len(board.comparison_log))

    # Leaderboard — dual push: default config + named config
    rows = build_leaderboard_rows(board)
    lb_ds = Dataset.from_list(rows)
    lb_ds.push_to_hub(repo_id)
    lb_ds.push_to_hub(repo_id, config_name="leaderboard")
    logger.info("published_leaderboard", repo=repo_id, n=len(rows))

    # Metadata — append-only
    meta_row = build_metadata_row(metadata)
    all_meta = (existing_metadata or []) + [meta_row]
    Dataset.from_list(all_meta).push_to_hub(repo_id, config_name="metadata")
    logger.info("published_metadata", repo=repo_id, n=len(all_meta))

    # README — auto-generated dataset card with leaderboard
    readme = _build_readme(repo_id, rows, board, metadata)
    api = HfApi()
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info("published_readme", repo=repo_id)


def _build_readme(
    repo_id: str,
    rows: list[dict],
    board: Leaderboard,
    metadata: EvalMetadata,
) -> str:
    """Build a dataset card README with the leaderboard table."""
    has_ci = bool(board.elo_ci)
    source_short = metadata.source_dataset.split("/")[-1]
    judges = json.loads(
        metadata.judge_models
        if isinstance(metadata.judge_models, str)
        else json.dumps(metadata.judge_models)
    )
    judge_str = ", ".join(j.split("/")[-1] for j in judges) if judges else "N/A"
    n_comparisons = len(board.comparison_log)

    lines = [
        "---",
        "license: mit",
        "tags:",
        "  - ocr-bench",
        "  - leaderboard",
        "configs:",
        "  - config_name: default",
        "    data_files:",
        "      - split: train",
        "        path: data/train-*.parquet",
        "  - config_name: comparisons",
        "    data_files:",
        "      - split: train",
        "        path: comparisons/train-*.parquet",
        "  - config_name: leaderboard",
        "    data_files:",
        "      - split: train",
        "        path: leaderboard/train-*.parquet",
        "  - config_name: metadata",
        "    data_files:",
        "      - split: train",
        "        path: metadata/train-*.parquet",
        "---",
        "",
        f"# OCR Bench Results: {source_short}",
        "",
        "VLM-as-judge pairwise evaluation of OCR models. "
        "Rankings depend on document type — there is no single best OCR model.",
        "",
        "## Leaderboard",
        "",
    ]

    # Table header
    if has_ci:
        lines.append("| Rank | Model | ELO | 95% CI | Wins | Losses | Ties | Win% |")
        lines.append("|------|-------|-----|--------|------|--------|------|------|")
    else:
        lines.append("| Rank | Model | ELO | Wins | Losses | Ties | Win% |")
        lines.append("|------|-------|-----|------|--------|------|------|")

    for rank, row in enumerate(rows, 1):
        model = row["model"]
        elo = row["elo"]
        if has_ci:
            # Always emit the CI cell when the header has one, so the table
            # stays aligned even for a model without bootstrap bounds.
            ci = (
                f"{row['elo_low']}\u2013{row['elo_high']}"
                if "elo_low" in row
                else "N/A"
            )
            lines.append(
                f"| {rank} | {model} | {elo} | {ci} "
                f"| {row['wins']} | {row['losses']} | {row['ties']} "
                f"| {row['win_pct']}% |"
            )
        else:
            lines.append(
                f"| {rank} | {model} | {elo} "
                f"| {row['wins']} | {row['losses']} | {row['ties']} "
                f"| {row['win_pct']}% |"
            )

    lines += [
        "",
        "## Details",
        "",
        f"- **Source dataset**: [`{metadata.source_dataset}`]"
        f"(https://huggingface.co/datasets/{metadata.source_dataset})",
        f"- **Judge**: {judge_str}",
        f"- **Comparisons**: {n_comparisons}",
        "- **Method**: Bradley-Terry MLE with bootstrap 95% CIs",
        "",
        "## Configs",
        "",
        f"- `load_dataset(\"{repo_id}\")` — leaderboard table",
        f"- `load_dataset(\"{repo_id}\", name=\"comparisons\")` "
        "— full pairwise comparison log",
        f"- `load_dataset(\"{repo_id}\", name=\"metadata\")` "
        "— evaluation run history",
        "",
        "*Generated by [ocr-bench](https://github.com/davanstrien/ocr-bench)*",
    ]
    return "\n".join(lines) + "\n"
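
# Example usage (a minimal sketch: the repo ids, source dataset, judge model,
# and the exact signature of ``compute_elo`` are illustrative assumptions, not
# part of this module):
#
#     from ocr_bench.elo import compute_elo
#
#     repo = "user/ocr-bench-results"            # hypothetical results repo
#     existing = load_existing_comparisons(repo)
#     # ``new_comparisons`` is this run's freshly judged ComparisonResult list;
#     # old and new are merged before scoring, as publish_results expects.
#     board = compute_elo(existing + new_comparisons)
#     metadata = EvalMetadata(
#         source_dataset="user/ocr-documents",   # hypothetical source dataset
#         judge_models=["Qwen/Qwen2.5-VL-72B-Instruct"],
#         seed=42,
#         max_samples=100,
#         total_comparisons=len(board.comparison_log),
#         valid_comparisons=len(board.comparison_log),
#     )
#     publish_results(
#         repo,
#         board,
#         metadata,
#         existing_metadata=load_existing_metadata(repo),
#     )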