"""Load and validate the static leaderboard JSON. Public surface: ModelRecord — frozen dataclass for one row of the leaderboard. LeaderboardData — frozen dataclass wrapping the full dataset + index. load_results_json — read + validate + build the dataset. Validation is strict: any schema violation raises ValueError with a single multi-line message listing every problem encountered. The app should let this propagate so the HF Space build fails loudly when the JSON is bad. """ from __future__ import annotations import json import re from dataclasses import dataclass from pathlib import Path from .config import ( ALLOWED_RESULT_TYPES, ALLOWED_TYPES, OPTIONAL_TASK_FIELDS, REQUIRED_MODEL_FIELDS, REQUIRED_SCORE_FIELDS, SCHEMA_VERSION, TYPE_DISPLAY, ) # -- Dataclasses ----------------------------------------------------------- @dataclass(frozen=True) class ModelRecord: """One model row, post-validation. `scores` is a flat dict keyed by JSON field name (e.g. "overall", "spatial", "2d_localization"). Optional task scores may be absent; callers should use ``.score(field)`` which returns None when missing. """ # Required fields id: str name: str organization: str params: str type: str # "open" or "closed" result_type: str # "single" or "ensemble" scores: dict[str, float] param_value: float | None param_bucket: str | None type_display: str # Optional fields with defaults url: str | None = None verified: bool = False ensemble: bool = False is_new: bool = False date_evaluated: str = "" model_url: str = "" def score(self, field_name: str) -> float | None: return self.scores.get(field_name) @dataclass(frozen=True) class LeaderboardData: schema_version: str benchmark_version: str updated: str # YYYY-MM-DD as in the source JSON models: list[ModelRecord] model_by_id: dict[str, ModelRecord] # -- Internals ------------------------------------------------------------- _PARAMS_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*B\s*$") _DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$") _EM_DASH = "—" def _parse_param_value(params: str) -> float | None: """Return the numeric param count in billions, or None if undisclosed/unparseable.""" if params.strip() == _EM_DASH: return None m = _PARAMS_RE.match(params) return float(m.group(1)) if m else None def _param_bucket(value: float | None) -> str | None: """Map a numeric param count (in billions) to a coarse bucket label. Boundaries match util.config.PARAM_BUCKETS: v < 10 → "<10B" 10 ≤ v ≤ 40 → "10B–40B" v > 40 → ">40B" None inputs return None (used to mark models with undisclosed sizes — those models are visible only under the "All sizes" filter, never under a numeric bucket). """ if value is None: return None if value < 10.0: return "<10B" if value <= 40.0: return "10B–40B" return ">40B" def _id_label(model_dict: dict, index: int) -> str: raw = model_dict.get("id") if isinstance(raw, str) and raw: return f'models[{index}].id="{raw}"' return f"models[{index}].id=" def _validate_score( model_dict: dict, field: str, prefix: str, errors: list[str], required: bool ) -> float | None: """Validate one score field. Returns the float on success, None on failure (or when an optional field is absent).""" if field not in model_dict: if required: errors.append(f'{prefix}: missing required field "{field}"') return None v = model_dict[field] if v is None: errors.append( f'{prefix}: field "{field}" is null; omit the field instead of using null' ) return None # bool is a subclass of int; reject it explicitly. if isinstance(v, bool) or not isinstance(v, (int, float)): errors.append( f'{prefix}: field "{field}" must be numeric, got {type(v).__name__}' ) return None fv = float(v) if not (0.0 <= fv <= 100.0): errors.append(f'{prefix}: field "{field}" must be in [0, 100], got {fv}') return None return fv def _validate_optional_bool( model_dict: dict, field_name: str, prefix: str, errors: list[str] ) -> bool: """Validate an optional bool field. Returns the value if valid, False if absent.""" if field_name not in model_dict: return False v = model_dict[field_name] if not isinstance(v, bool): errors.append( f'{prefix}: field "{field_name}" must be a boolean, got {type(v).__name__}' ) return False return v def _fail(errors: list[str]) -> None: msg = "[VANTAGE-Bench leaderboard] Schema validation failed:\n" + "\n".join( f" - {e}" for e in errors ) raise ValueError(msg) # -- Public loader --------------------------------------------------------- def load_results_json(path: str | Path) -> LeaderboardData: """Read, validate, and build the in-memory leaderboard dataset. Hard-fails with ValueError listing every problem if validation does not pass. """ p = Path(path) try: raw_text = p.read_text(encoding="utf-8") except OSError as e: raise ValueError( f"[VANTAGE-Bench leaderboard] Could not read {p}: {e}" ) from e try: data = json.loads(raw_text) except json.JSONDecodeError as e: raise ValueError( f"[VANTAGE-Bench leaderboard] JSON parse error in {p}: {e}" ) from e if not isinstance(data, dict): raise ValueError( "[VANTAGE-Bench leaderboard] Schema validation failed:\n" " - top-level JSON must be an object" ) errors: list[str] = [] # -- Top-level fields --------------------------------------------------- sv = data.get("schema_version") if sv != SCHEMA_VERSION: errors.append( f'schema_version must equal "{SCHEMA_VERSION}", got {sv!r}' ) bv = data.get("benchmark_version") if not isinstance(bv, str) or not bv: errors.append("benchmark_version must be a non-empty string") updated = data.get("updated") if not isinstance(updated, str) or not _DATE_RE.match(updated): errors.append(f"updated must be a YYYY-MM-DD string, got {updated!r}") models_raw = data.get("models") if not isinstance(models_raw, list) or not models_raw: errors.append("models must be a non-empty list") _fail(errors) return # unreachable; satisfies type checkers # -- Per-model validation ---------------------------------------------- seen_ids: set[str] = set() for i, m in enumerate(models_raw): if not isinstance(m, dict): errors.append( f"models[{i}] must be an object, got {type(m).__name__}" ) continue prefix = _id_label(m, i) # Required string fields for fld in REQUIRED_MODEL_FIELDS: if fld not in m: errors.append(f'{prefix}: missing required field "{fld}"') elif not isinstance(m[fld], str) or not m[fld]: errors.append( f'{prefix}: field "{fld}" must be a non-empty string, got {m[fld]!r}' ) # type must be in allowed set if isinstance(m.get("type"), str) and m["type"] not in ALLOWED_TYPES: errors.append( f'{prefix}: field "type" must be one of {ALLOWED_TYPES}, got {m["type"]!r}' ) # result_type must be in allowed set rt = m.get("result_type") if isinstance(rt, str) and rt not in ALLOWED_RESULT_TYPES: errors.append( f'{prefix}: field "result_type" must be one of {ALLOWED_RESULT_TYPES}, got {rt!r}' ) # Optional bool fields _validate_optional_bool(m, "verified", prefix, errors) _validate_optional_bool(m, "ensemble", prefix, errors) _validate_optional_bool(m, "is_new", prefix, errors) # Uniqueness on id mid = m.get("id") if isinstance(mid, str) and mid: if mid in seen_ids: errors.append(f"{prefix}: duplicate id") seen_ids.add(mid) # url (optional legacy field — model name link) if "url" in m: url = m["url"] if url is None: errors.append( f'{prefix}: field "url" is null; omit the field instead of using null' ) elif not isinstance(url, str) or not url: errors.append( f'{prefix}: field "url" must be a non-empty string when present, got {url!r}' ) # model_url (optional — HF model card or project page link) if "model_url" in m: mu = m["model_url"] if not isinstance(mu, str): errors.append( f'{prefix}: field "model_url" must be a string, got {type(mu).__name__}' ) # date_evaluated (optional — YYYY-MM-DD or empty string) if "date_evaluated" in m: de = m["date_evaluated"] if not isinstance(de, str): errors.append( f'{prefix}: field "date_evaluated" must be a string, got {type(de).__name__}' ) elif de and not _DATE_RE.match(de): errors.append( f'{prefix}: field "date_evaluated" must be YYYY-MM-DD or empty, got {de!r}' ) # Required score fields for fld in REQUIRED_SCORE_FIELDS: _validate_score(m, fld, prefix, errors, required=True) # Optional task score fields for fld in OPTIONAL_TASK_FIELDS: _validate_score(m, fld, prefix, errors, required=False) if errors: _fail(errors) # -- Build records (guaranteed valid past this point) ------------------ records: list[ModelRecord] = [] for m in models_raw: scores: dict[str, float] = {} for fld in REQUIRED_SCORE_FIELDS: scores[fld] = float(m[fld]) for fld in OPTIONAL_TASK_FIELDS: if fld in m: scores[fld] = float(m[fld]) params_str = m["params"] pv = _parse_param_value(params_str) pb = _param_bucket(pv) records.append( ModelRecord( id=m["id"], name=m["name"], organization=m["organization"], params=params_str, type=m["type"], result_type=m["result_type"], scores=scores, param_value=pv, param_bucket=pb, type_display=TYPE_DISPLAY[m["type"]], url=m.get("url"), verified=bool(m.get("verified", False)), ensemble=bool(m.get("ensemble", False)), is_new=bool(m.get("is_new", False)), date_evaluated=m.get("date_evaluated", ""), model_url=m.get("model_url", ""), ) ) return LeaderboardData( schema_version=sv, benchmark_version=bv, updated=updated, models=records, model_by_id={r.id: r for r in records}, )