Spaces:
Running
Running
| """Hub publishing β push comparisons, leaderboard, and metadata configs to HF Hub.""" | |
| from __future__ import annotations | |
| import datetime | |
| import json | |
| from dataclasses import dataclass | |
| import structlog | |
| from datasets import Dataset, load_dataset | |
| from huggingface_hub import HfApi | |
| from ocr_bench.elo import ComparisonResult, Leaderboard | |
| logger = structlog.get_logger() | |
| class EvalMetadata: | |
| """Metadata for an evaluation run, stored alongside results on Hub.""" | |
| source_dataset: str | |
| judge_models: list[str] | |
| seed: int | |
| max_samples: int | |
| total_comparisons: int | |
| valid_comparisons: int | |
| from_prs: bool = False | |
| timestamp: str = "" | |
| def __post_init__(self): | |
| if not self.timestamp: | |
| self.timestamp = datetime.datetime.now(datetime.UTC).isoformat() | |
def load_existing_comparisons(repo_id: str) -> list[ComparisonResult]:
    """Fetch the stored comparison log for ``repo_id`` from the Hub.

    Stored winners are already canonical (unswapped), hence ``swapped=False``
    on every reconstructed result. If the repo or the ``comparisons`` config
    is missing, an empty list is returned instead of raising.
    """
    try:
        dataset = load_dataset(repo_id, name="comparisons", split="train")
    except Exception as exc:
        logger.info("no_existing_comparisons", repo=repo_id, reason=str(exc))
        return []
    comparisons = [
        ComparisonResult(
            sample_idx=record["sample_idx"],
            model_a=record["model_a"],
            model_b=record["model_b"],
            winner=record["winner"],
            reason=record.get("reason", ""),
            agreement=record.get("agreement", "1/1"),
            swapped=False,  # stored winner is already unswapped
            text_a=record.get("text_a", ""),
            text_b=record.get("text_b", ""),
            col_a=record.get("col_a", ""),
            col_b=record.get("col_b", ""),
        )
        for record in dataset
    ]
    logger.info("loaded_existing_comparisons", repo=repo_id, n=len(comparisons))
    return comparisons
def load_existing_metadata(repo_id: str) -> list[dict]:
    """Fetch stored run-metadata rows for ``repo_id`` from the Hub.

    If the repo or the ``metadata`` config is missing, returns an empty
    list instead of raising.
    """
    try:
        # Materialize inside the try: iteration failures also fall back to [].
        dataset = load_dataset(repo_id, name="metadata", split="train")
        return [dict(record) for record in dataset]
    except Exception as exc:
        logger.info("no_existing_metadata", repo=repo_id, reason=str(exc))
        return []
| def build_leaderboard_rows(board: Leaderboard) -> list[dict]: | |
| """Convert a Leaderboard into rows suitable for a Hub dataset.""" | |
| rows = [] | |
| for model, elo in board.ranked: | |
| total = board.wins[model] + board.losses[model] + board.ties[model] | |
| row = { | |
| "model": model, | |
| "elo": round(elo), | |
| "wins": board.wins[model], | |
| "losses": board.losses[model], | |
| "ties": board.ties[model], | |
| "win_pct": round(board.wins[model] / total * 100) if total > 0 else 0, | |
| } | |
| if board.elo_ci and model in board.elo_ci: | |
| lo, hi = board.elo_ci[model] | |
| row["elo_low"] = round(lo) | |
| row["elo_high"] = round(hi) | |
| rows.append(row) | |
| return rows | |
| def build_metadata_row(metadata: EvalMetadata) -> dict: | |
| """Convert EvalMetadata into a single row for a Hub dataset.""" | |
| return { | |
| "source_dataset": metadata.source_dataset, | |
| "judge_models": json.dumps(metadata.judge_models), | |
| "seed": metadata.seed, | |
| "max_samples": metadata.max_samples, | |
| "total_comparisons": metadata.total_comparisons, | |
| "valid_comparisons": metadata.valid_comparisons, | |
| "from_prs": metadata.from_prs, | |
| "timestamp": metadata.timestamp, | |
| } | |
def publish_results(
    repo_id: str,
    board: Leaderboard,
    metadata: EvalMetadata,
    existing_metadata: list[dict] | None = None,
) -> None:
    """Push an evaluation run to the Hub as a multi-config dataset.

    Configs written:
    - (default): leaderboard table, so ``load_dataset("repo")`` returns it.
    - ``leaderboard``: the same table under a named config (viewer compat).
    - ``comparisons``: the board's full comparison log. The caller merges
      existing + new comparisons before ``compute_elo``, so
      ``board.comparison_log`` is already the complete set.
    - ``metadata``: append-only run history; a new row is added to
      ``existing_metadata``.

    Also regenerates README.md so the repo page shows the leaderboard.
    """
    # Full comparison log — only pushed when there is something to push.
    log = board.comparison_log
    if log:
        Dataset.from_list(log).push_to_hub(repo_id, config_name="comparisons")
        logger.info("published_comparisons", repo=repo_id, n=len(log))

    # Leaderboard goes out twice: unnamed default config plus named config.
    table_rows = build_leaderboard_rows(board)
    table_ds = Dataset.from_list(table_rows)
    table_ds.push_to_hub(repo_id)
    table_ds.push_to_hub(repo_id, config_name="leaderboard")
    logger.info("published_leaderboard", repo=repo_id, n=len(table_rows))

    # Run history: append this run's row to whatever was already recorded.
    history = list(existing_metadata or [])
    history.append(build_metadata_row(metadata))
    Dataset.from_list(history).push_to_hub(repo_id, config_name="metadata")
    logger.info("published_metadata", repo=repo_id, n=len(history))

    # Auto-generated dataset card with the leaderboard table.
    card = _build_readme(repo_id, table_rows, board, metadata)
    HfApi().upload_file(
        path_or_fileobj=card.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info("published_readme", repo=repo_id)
| def _build_readme( | |
| repo_id: str, | |
| rows: list[dict], | |
| board: Leaderboard, | |
| metadata: EvalMetadata, | |
| ) -> str: | |
| """Build a dataset card README with the leaderboard table.""" | |
| has_ci = bool(board.elo_ci) | |
| source_short = metadata.source_dataset.split("/")[-1] | |
| judges = json.loads( | |
| metadata.judge_models | |
| if isinstance(metadata.judge_models, str) | |
| else json.dumps(metadata.judge_models) | |
| ) | |
| judge_str = ", ".join(j.split("/")[-1] for j in judges) if judges else "N/A" | |
| n_comparisons = len(board.comparison_log) | |
| lines = [ | |
| "---", | |
| "license: mit", | |
| "tags:", | |
| " - ocr-bench", | |
| " - leaderboard", | |
| "configs:", | |
| " - config_name: default", | |
| " data_files:", | |
| " - split: train", | |
| " path: data/train-*.parquet", | |
| " - config_name: comparisons", | |
| " data_files:", | |
| " - split: train", | |
| " path: comparisons/train-*.parquet", | |
| " - config_name: leaderboard", | |
| " data_files:", | |
| " - split: train", | |
| " path: leaderboard/train-*.parquet", | |
| " - config_name: metadata", | |
| " data_files:", | |
| " - split: train", | |
| " path: metadata/train-*.parquet", | |
| "---", | |
| "", | |
| f"# OCR Bench Results: {source_short}", | |
| "", | |
| "VLM-as-judge pairwise evaluation of OCR models. " | |
| "Rankings depend on document type β there is no single best OCR model.", | |
| "", | |
| "## Leaderboard", | |
| "", | |
| ] | |
| # Table header | |
| if has_ci: | |
| lines.append("| Rank | Model | ELO | 95% CI | Wins | Losses | Ties | Win% |") | |
| lines.append("|------|-------|-----|--------|------|--------|------|------|") | |
| else: | |
| lines.append("| Rank | Model | ELO | Wins | Losses | Ties | Win% |") | |
| lines.append("|------|-------|-----|------|--------|------|------|") | |
| for rank, row in enumerate(rows, 1): | |
| model = row["model"] | |
| elo = row["elo"] | |
| if has_ci and "elo_low" in row: | |
| ci = f"{row['elo_low']}\u2013{row['elo_high']}" | |
| lines.append( | |
| f"| {rank} | {model} | {elo} | {ci} " | |
| f"| {row['wins']} | {row['losses']} | {row['ties']} " | |
| f"| {row['win_pct']}% |" | |
| ) | |
| else: | |
| lines.append( | |
| f"| {rank} | {model} | {elo} " | |
| f"| {row['wins']} | {row['losses']} | {row['ties']} " | |
| f"| {row['win_pct']}% |" | |
| ) | |
| lines += [ | |
| "", | |
| "## Details", | |
| "", | |
| f"- **Source dataset**: [`{metadata.source_dataset}`]" | |
| f"(https://huggingface.co/datasets/{metadata.source_dataset})", | |
| f"- **Judge**: {judge_str}", | |
| f"- **Comparisons**: {n_comparisons}", | |
| "- **Method**: Bradley-Terry MLE with bootstrap 95% CIs", | |
| "", | |
| "## Configs", | |
| "", | |
| f"- `load_dataset(\"{repo_id}\")` β leaderboard table", | |
| f"- `load_dataset(\"{repo_id}\", name=\"comparisons\")` " | |
| "β full pairwise comparison log", | |
| f"- `load_dataset(\"{repo_id}\", name=\"metadata\")` " | |
| "β evaluation run history", | |
| "", | |
| "*Generated by [ocr-bench](https://github.com/davanstrien/ocr-bench)*", | |
| ] | |
| return "\n".join(lines) + "\n" | |