"""Hub publishing — push comparisons, leaderboard, and metadata configs to HF Hub."""
from __future__ import annotations
import datetime
import json
from dataclasses import dataclass
import structlog
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
from ocr_bench.elo import ComparisonResult, Leaderboard
logger = structlog.get_logger()
@dataclass
class EvalMetadata:
"""Metadata for an evaluation run, stored alongside results on Hub."""
source_dataset: str
judge_models: list[str]
seed: int
max_samples: int
total_comparisons: int
valid_comparisons: int
from_prs: bool = False
timestamp: str = ""
def __post_init__(self):
if not self.timestamp:
self.timestamp = datetime.datetime.now(datetime.UTC).isoformat()
def load_existing_comparisons(repo_id: str) -> list[ComparisonResult]:
    """Fetch previously stored comparisons from a Hub results repo.

    Stored winners are already canonical (unswapped), hence ``swapped=False``
    on every reconstructed result.

    Args:
        repo_id: Hub dataset repo to read the ``comparisons`` config from.

    Returns:
        All rows as ``ComparisonResult`` objects, or an empty list when the
        repo or config does not exist (or cannot be loaded).
    """
    try:
        dataset = load_dataset(repo_id, name="comparisons", split="train")
    except Exception as exc:  # repo/config absent — treat as "nothing yet"
        logger.info("no_existing_comparisons", repo=repo_id, reason=str(exc))
        return []
    loaded = [
        ComparisonResult(
            sample_idx=row["sample_idx"],
            model_a=row["model_a"],
            model_b=row["model_b"],
            winner=row["winner"],
            reason=row.get("reason", ""),
            agreement=row.get("agreement", "1/1"),
            swapped=False,  # stored winner is already in canonical order
            text_a=row.get("text_a", ""),
            text_b=row.get("text_b", ""),
            col_a=row.get("col_a", ""),
            col_b=row.get("col_b", ""),
        )
        for row in dataset
    ]
    logger.info("loaded_existing_comparisons", repo=repo_id, n=len(loaded))
    return loaded
def load_existing_metadata(repo_id: str) -> list[dict]:
    """Fetch previously stored metadata rows from a Hub results repo.

    Args:
        repo_id: Hub dataset repo to read the ``metadata`` config from.

    Returns:
        Rows of the ``metadata`` config as plain dicts, or an empty list when
        the repo or config does not exist (or cannot be loaded).
    """
    try:
        dataset = load_dataset(repo_id, name="metadata", split="train")
        return [dict(row) for row in dataset]
    except Exception as exc:  # repo/config absent — treat as "nothing yet"
        logger.info("no_existing_metadata", repo=repo_id, reason=str(exc))
        return []
def build_leaderboard_rows(board: Leaderboard) -> list[dict]:
"""Convert a Leaderboard into rows suitable for a Hub dataset."""
rows = []
for model, elo in board.ranked:
total = board.wins[model] + board.losses[model] + board.ties[model]
row = {
"model": model,
"elo": round(elo),
"wins": board.wins[model],
"losses": board.losses[model],
"ties": board.ties[model],
"win_pct": round(board.wins[model] / total * 100) if total > 0 else 0,
}
if board.elo_ci and model in board.elo_ci:
lo, hi = board.elo_ci[model]
row["elo_low"] = round(lo)
row["elo_high"] = round(hi)
rows.append(row)
return rows
def build_metadata_row(metadata: EvalMetadata) -> dict:
"""Convert EvalMetadata into a single row for a Hub dataset."""
return {
"source_dataset": metadata.source_dataset,
"judge_models": json.dumps(metadata.judge_models),
"seed": metadata.seed,
"max_samples": metadata.max_samples,
"total_comparisons": metadata.total_comparisons,
"valid_comparisons": metadata.valid_comparisons,
"from_prs": metadata.from_prs,
"timestamp": metadata.timestamp,
}
def publish_results(
    repo_id: str,
    board: Leaderboard,
    metadata: EvalMetadata,
    existing_metadata: list[dict] | None = None,
) -> None:
    """Push evaluation results to the Hub as a dataset with multiple configs.

    Configs written:
        - (default): leaderboard table — ``load_dataset("repo")`` returns this.
        - ``leaderboard``: same table under a named config (viewer back-compat).
        - ``comparisons``: full comparison log from the board (the caller merges
          existing + new before ``compute_elo``, so ``board.comparison_log`` is
          already the complete set).
        - ``metadata``: append-only run log; the new row is appended to
          ``existing_metadata``.

    A README dataset card with the rendered leaderboard is uploaded last.
    """
    # Comparisons: skip the push entirely when nothing was logged.
    log = board.comparison_log
    if log:
        Dataset.from_list(log).push_to_hub(repo_id, config_name="comparisons")
        logger.info("published_comparisons", repo=repo_id, n=len(log))

    # Leaderboard: pushed twice — once as the default config, once named.
    leaderboard_rows = build_leaderboard_rows(board)
    leaderboard_ds = Dataset.from_list(leaderboard_rows)
    leaderboard_ds.push_to_hub(repo_id)
    leaderboard_ds.push_to_hub(repo_id, config_name="leaderboard")
    logger.info("published_leaderboard", repo=repo_id, n=len(leaderboard_rows))

    # Metadata: append the current run to whatever history already exists.
    history = list(existing_metadata or [])
    history.append(build_metadata_row(metadata))
    Dataset.from_list(history).push_to_hub(repo_id, config_name="metadata")
    logger.info("published_metadata", repo=repo_id, n=len(history))

    # README: regenerate the dataset card with the fresh leaderboard.
    card = _build_readme(repo_id, leaderboard_rows, board, metadata)
    HfApi().upload_file(
        path_or_fileobj=card.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info("published_readme", repo=repo_id)
def _build_readme(
repo_id: str,
rows: list[dict],
board: Leaderboard,
metadata: EvalMetadata,
) -> str:
"""Build a dataset card README with the leaderboard table."""
has_ci = bool(board.elo_ci)
source_short = metadata.source_dataset.split("/")[-1]
judges = json.loads(
metadata.judge_models
if isinstance(metadata.judge_models, str)
else json.dumps(metadata.judge_models)
)
judge_str = ", ".join(j.split("/")[-1] for j in judges) if judges else "N/A"
n_comparisons = len(board.comparison_log)
lines = [
"---",
"license: mit",
"tags:",
" - ocr-bench",
" - leaderboard",
"configs:",
" - config_name: default",
" data_files:",
" - split: train",
" path: data/train-*.parquet",
" - config_name: comparisons",
" data_files:",
" - split: train",
" path: comparisons/train-*.parquet",
" - config_name: leaderboard",
" data_files:",
" - split: train",
" path: leaderboard/train-*.parquet",
" - config_name: metadata",
" data_files:",
" - split: train",
" path: metadata/train-*.parquet",
"---",
"",
f"# OCR Bench Results: {source_short}",
"",
"VLM-as-judge pairwise evaluation of OCR models. "
"Rankings depend on document type β there is no single best OCR model.",
"",
"## Leaderboard",
"",
]
# Table header
if has_ci:
lines.append("| Rank | Model | ELO | 95% CI | Wins | Losses | Ties | Win% |")
lines.append("|------|-------|-----|--------|------|--------|------|------|")
else:
lines.append("| Rank | Model | ELO | Wins | Losses | Ties | Win% |")
lines.append("|------|-------|-----|------|--------|------|------|")
for rank, row in enumerate(rows, 1):
model = row["model"]
elo = row["elo"]
if has_ci and "elo_low" in row:
ci = f"{row['elo_low']}\u2013{row['elo_high']}"
lines.append(
f"| {rank} | {model} | {elo} | {ci} "
f"| {row['wins']} | {row['losses']} | {row['ties']} "
f"| {row['win_pct']}% |"
)
else:
lines.append(
f"| {rank} | {model} | {elo} "
f"| {row['wins']} | {row['losses']} | {row['ties']} "
f"| {row['win_pct']}% |"
)
lines += [
"",
"## Details",
"",
f"- **Source dataset**: [`{metadata.source_dataset}`]"
f"(https://huggingface.co/datasets/{metadata.source_dataset})",
f"- **Judge**: {judge_str}",
f"- **Comparisons**: {n_comparisons}",
"- **Method**: Bradley-Terry MLE with bootstrap 95% CIs",
"",
"## Configs",
"",
f"- `load_dataset(\"{repo_id}\")` β leaderboard table",
f"- `load_dataset(\"{repo_id}\", name=\"comparisons\")` "
"β full pairwise comparison log",
f"- `load_dataset(\"{repo_id}\", name=\"metadata\")` "
"β evaluation run history",
"",
"*Generated by [ocr-bench](https://github.com/davanstrien/ocr-bench)*",
]
return "\n".join(lines) + "\n"
|