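# NOTE: web file-viewer chrome captured together with the source (not Python):
# uploader "Clemson-Computing-User", repo label "VANTAGE-Bench v1.0",
# commit 8b5161a, viewer actions (Raw / History / Blame / Contribute / Delete),
# file size 11.5 kB.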
"""Load and validate the static leaderboard JSON.
Public surface:
ModelRecord — frozen dataclass for one row of the leaderboard.
LeaderboardData — frozen dataclass wrapping the full dataset + index.
load_results_json — read + validate + build the dataset.
Validation is strict: any schema violation raises ValueError with a
single multi-line message listing every problem encountered. The app
should let this propagate so the HF Space build fails loudly when the
JSON is bad.
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from .config import (
ALLOWED_RESULT_TYPES,
ALLOWED_TYPES,
OPTIONAL_TASK_FIELDS,
REQUIRED_MODEL_FIELDS,
REQUIRED_SCORE_FIELDS,
SCHEMA_VERSION,
TYPE_DISPLAY,
)
# -- Dataclasses -----------------------------------------------------------
@dataclass(frozen=True)
class ModelRecord:
"""One model row, post-validation.
`scores` is a flat dict keyed by JSON field name (e.g. "overall",
"spatial", "2d_localization"). Optional task scores may be absent;
callers should use ``.score(field)`` which returns None when missing.
"""
# Required fields
id: str
name: str
organization: str
params: str
type: str # "open" or "closed"
result_type: str # "single" or "ensemble"
scores: dict[str, float]
param_value: float | None
param_bucket: str | None
type_display: str
# Optional fields with defaults
url: str | None = None
verified: bool = False
ensemble: bool = False
is_new: bool = False
date_evaluated: str = ""
model_url: str = ""
def score(self, field_name: str) -> float | None:
return self.scores.get(field_name)
@dataclass(frozen=True)
class LeaderboardData:
    """The full validated leaderboard dataset together with a by-id index.

    Built by ``load_results_json``; the three string fields are copied from
    the top level of the source JSON.
    """

    schema_version: str
    benchmark_version: str
    updated: str  # YYYY-MM-DD as in the source JSON
    models: list[ModelRecord]  # one record per entry of the JSON "models" list, in file order
    model_by_id: dict[str, ModelRecord]  # the same records, keyed by ModelRecord.id
# -- Internals -------------------------------------------------------------
# Size strings such as "7B" or " 1.5 B ": a decimal number followed by a
# capital "B", with optional surrounding whitespace. Group 1 is the number.
_PARAMS_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*B\s*$")
# Strict YYYY-MM-DD *shape* (calendar validity is not checked here).
# `\Z` is used instead of `$` because `$` also matches just before a trailing
# newline, which would let "2024-01-01\n" through; re.ASCII restricts `\d`
# to 0-9 so digits from other scripts are rejected as well.
_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}\Z", re.ASCII)
# Value of the "params" field that is treated as "no size available".
_EM_DASH = "—"
def _parse_param_value(params: str) -> float | None:
    """Extract the numeric parameter count from a size string.

    Returns the number captured by ``_PARAMS_RE`` as a float (the docstring
    of the bucket helper treats it as billions), or None when *params* is
    the em-dash placeholder or does not match the expected pattern.
    """
    # Explicit placeholder check first, mirroring the "undisclosed" case.
    if params.strip() == _EM_DASH:
        return None
    matched = _PARAMS_RE.match(params)
    if matched is None:
        return None
    return float(matched.group(1))
def _param_bucket(value: float | None) -> str | None:
"""Map a numeric param count (in billions) to a coarse bucket label.
Boundaries match util.config.PARAM_BUCKETS:
v < 10 → "<10B"
10 ≤ v ≤ 40 → "10B–40B"
v > 40 → ">40B"
None inputs return None (used to mark models with undisclosed sizes —
those models are visible only under the "All sizes" filter, never
under a numeric bucket).
"""
if value is None:
return None
if value < 10.0:
return "<10B"
if value <= 40.0:
return "10B–40B"
return ">40B"
def _id_label(model_dict: dict, index: int) -> str:
    """Build the prefix used in error messages to identify one model entry.

    The label always carries the list position; it additionally quotes the
    entry's "id" when that is a non-empty string, and says ``<missing>``
    otherwise (absent, empty, or not a string).
    """
    candidate = model_dict.get("id")
    has_usable_id = isinstance(candidate, str) and bool(candidate)
    shown = f'"{candidate}"' if has_usable_id else "<missing>"
    return f"models[{index}].id={shown}"
def _validate_score(
model_dict: dict, field: str, prefix: str, errors: list[str], required: bool
) -> float | None:
"""Validate one score field. Returns the float on success, None on failure
(or when an optional field is absent)."""
if field not in model_dict:
if required:
errors.append(f'{prefix}: missing required field "{field}"')
return None
v = model_dict[field]
if v is None:
errors.append(
f'{prefix}: field "{field}" is null; omit the field instead of using null'
)
return None
# bool is a subclass of int; reject it explicitly.
if isinstance(v, bool) or not isinstance(v, (int, float)):
errors.append(
f'{prefix}: field "{field}" must be numeric, got {type(v).__name__}'
)
return None
fv = float(v)
if not (0.0 <= fv <= 100.0):
errors.append(f'{prefix}: field "{field}" must be in [0, 100], got {fv}')
return None
return fv
def _validate_optional_bool(
    model_dict: dict, field_name: str, prefix: str, errors: list[str]
) -> bool:
    """Check an optional boolean flag on a model entry.

    An absent field is fine and counts as False. A present field must be a
    real bool; anything else appends a message to *errors* and also yields
    False. Otherwise the stored value is returned unchanged.
    """
    try:
        candidate = model_dict[field_name]
    except KeyError:
        return False
    if isinstance(candidate, bool):
        return candidate
    kind = type(candidate).__name__
    errors.append(f'{prefix}: field "{field_name}" must be a boolean, got {kind}')
    return False
def _fail(errors: list[str]) -> None:
    """Raise ValueError with one bulleted line per collected problem.

    Never returns normally: the call always ends in ``raise``.
    """
    header = "[VANTAGE-Bench leaderboard] Schema validation failed:"
    bullets = [f" - {problem}" for problem in errors]
    raise ValueError(header + "\n" + "\n".join(bullets))
# -- Public loader ---------------------------------------------------------
def _read_json_document(p: Path) -> object:
    """Read *p* as UTF-8 text and parse it as JSON.

    Raises:
        ValueError: if the file cannot be read or decoded, or does not parse.
    """
    try:
        raw_text = p.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError; catching it here
        # gives a non-UTF-8 file the same prefixed message as an unreadable one.
        raise ValueError(
            f"[VANTAGE-Bench leaderboard] Could not read {p}: {e}"
        ) from e
    try:
        return json.loads(raw_text)
    except ValueError as e:
        # json.JSONDecodeError subclasses ValueError; catching the base class
        # also wraps any other ValueError raised while decoding.
        raise ValueError(
            f"[VANTAGE-Bench leaderboard] JSON parse error in {p}: {e}"
        ) from e


def _validate_model_entry(
    m: object, i: int, seen_ids: set[str], errors: list[str]
) -> None:
    """Validate entry *i* of the "models" list, appending every problem to *errors*.

    *seen_ids* carries the ids met so far and is updated in place so that
    duplicates across entries can be reported.
    """
    if not isinstance(m, dict):
        errors.append(f"models[{i}] must be an object, got {type(m).__name__}")
        return
    prefix = _id_label(m, i)

    # Required string fields
    for fld in REQUIRED_MODEL_FIELDS:
        if fld not in m:
            errors.append(f'{prefix}: missing required field "{fld}"')
        elif not isinstance(m[fld], str) or not m[fld]:
            errors.append(
                f'{prefix}: field "{fld}" must be a non-empty string, got {m[fld]!r}'
            )

    # type / result_type must be in their allowed sets. Non-string values are
    # not re-reported here; the checks only fire for strings.
    if isinstance(m.get("type"), str) and m["type"] not in ALLOWED_TYPES:
        errors.append(
            f'{prefix}: field "type" must be one of {ALLOWED_TYPES}, got {m["type"]!r}'
        )
    rt = m.get("result_type")
    if isinstance(rt, str) and rt not in ALLOWED_RESULT_TYPES:
        errors.append(
            f'{prefix}: field "result_type" must be one of {ALLOWED_RESULT_TYPES}, got {rt!r}'
        )

    # Optional bool fields
    for flag in ("verified", "ensemble", "is_new"):
        _validate_optional_bool(m, flag, prefix, errors)

    # Uniqueness on id
    mid = m.get("id")
    if isinstance(mid, str) and mid:
        if mid in seen_ids:
            errors.append(f"{prefix}: duplicate id")
        seen_ids.add(mid)

    # url (optional legacy field — model name link)
    if "url" in m:
        url = m["url"]
        if url is None:
            errors.append(
                f'{prefix}: field "url" is null; omit the field instead of using null'
            )
        elif not isinstance(url, str) or not url:
            errors.append(
                f'{prefix}: field "url" must be a non-empty string when present, got {url!r}'
            )

    # model_url (optional — HF model card or project page link)
    if "model_url" in m:
        mu = m["model_url"]
        if not isinstance(mu, str):
            errors.append(
                f'{prefix}: field "model_url" must be a string, got {type(mu).__name__}'
            )

    # date_evaluated (optional — YYYY-MM-DD or empty string)
    if "date_evaluated" in m:
        de = m["date_evaluated"]
        if not isinstance(de, str):
            errors.append(
                f'{prefix}: field "date_evaluated" must be a string, got {type(de).__name__}'
            )
        elif de and not _DATE_RE.match(de):
            errors.append(
                f'{prefix}: field "date_evaluated" must be YYYY-MM-DD or empty, got {de!r}'
            )

    # Scores: required ones must be present, optional task ones may be absent.
    for fld in REQUIRED_SCORE_FIELDS:
        _validate_score(m, fld, prefix, errors, required=True)
    for fld in OPTIONAL_TASK_FIELDS:
        _validate_score(m, fld, prefix, errors, required=False)


def _build_record(m: dict) -> ModelRecord:
    """Turn one already-validated model object into a ModelRecord."""
    scores: dict[str, float] = {fld: float(m[fld]) for fld in REQUIRED_SCORE_FIELDS}
    scores.update(
        (fld, float(m[fld])) for fld in OPTIONAL_TASK_FIELDS if fld in m
    )
    params_str = m["params"]
    pv = _parse_param_value(params_str)
    return ModelRecord(
        id=m["id"],
        name=m["name"],
        organization=m["organization"],
        params=params_str,
        type=m["type"],
        result_type=m["result_type"],
        scores=scores,
        param_value=pv,
        param_bucket=_param_bucket(pv),
        type_display=TYPE_DISPLAY[m["type"]],
        url=m.get("url"),
        verified=bool(m.get("verified", False)),
        ensemble=bool(m.get("ensemble", False)),
        is_new=bool(m.get("is_new", False)),
        date_evaluated=m.get("date_evaluated", ""),
        model_url=m.get("model_url", ""),
    )


def load_results_json(path: str | Path) -> LeaderboardData:
    """Read, validate, and build the in-memory leaderboard dataset.

    Args:
        path: location of the leaderboard JSON file.

    Returns:
        A LeaderboardData holding one ModelRecord per entry of the JSON
        "models" list (in file order) plus an id -> record index.

    Raises:
        ValueError: if the file cannot be read or decoded, is not valid JSON,
            or violates the schema. Schema failures list every problem found
            in a single multi-line message.
    """
    p = Path(path)
    data = _read_json_document(p)
    if not isinstance(data, dict):
        raise ValueError(
            "[VANTAGE-Bench leaderboard] Schema validation failed:\n"
            " - top-level JSON must be an object"
        )

    errors: list[str] = []

    # -- Top-level fields ---------------------------------------------------
    sv = data.get("schema_version")
    if sv != SCHEMA_VERSION:
        errors.append(
            f'schema_version must equal "{SCHEMA_VERSION}", got {sv!r}'
        )
    bv = data.get("benchmark_version")
    if not isinstance(bv, str) or not bv:
        errors.append("benchmark_version must be a non-empty string")
    updated = data.get("updated")
    if not isinstance(updated, str) or not _DATE_RE.match(updated):
        errors.append(f"updated must be a YYYY-MM-DD string, got {updated!r}")
    models_raw = data.get("models")
    if not isinstance(models_raw, list) or not models_raw:
        # Without a usable list there is nothing to iterate: report what has
        # been collected so far and stop.
        errors.append("models must be a non-empty list")
        _fail(errors)
        raise AssertionError("unreachable: _fail always raises")

    # -- Per-model validation ----------------------------------------------
    seen_ids: set[str] = set()
    for i, m in enumerate(models_raw):
        _validate_model_entry(m, i, seen_ids, errors)
    if errors:
        _fail(errors)

    # -- Build records (guaranteed valid past this point) ------------------
    records = [_build_record(m) for m in models_raw]
    return LeaderboardData(
        schema_version=sv,
        benchmark_version=bv,
        updated=updated,
        models=records,
        model_by_id={r.id: r for r in records},
    )