| """Load and validate the static leaderboard JSON. |
| |
| Public surface: |
| ModelRecord — frozen dataclass for one row of the leaderboard. |
| LeaderboardData — frozen dataclass wrapping the full dataset + index. |
| load_results_json — read + validate + build the dataset. |
| |
| Validation is strict: any schema violation raises ValueError with a |
| single multi-line message listing every problem encountered. The app |
| should let this propagate so the HF Space build fails loudly when the |
| JSON is bad. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import re |
| from dataclasses import dataclass |
| from pathlib import Path |
|
|
| from .config import ( |
| ALLOWED_RESULT_TYPES, |
| ALLOWED_TYPES, |
| OPTIONAL_TASK_FIELDS, |
| REQUIRED_MODEL_FIELDS, |
| REQUIRED_SCORE_FIELDS, |
| SCHEMA_VERSION, |
| TYPE_DISPLAY, |
| ) |
|
|
|
|
| |
|
|
|
|
| @dataclass(frozen=True) |
| class ModelRecord: |
| """One model row, post-validation. |
| |
| `scores` is a flat dict keyed by JSON field name (e.g. "overall", |
| "spatial", "2d_localization"). Optional task scores may be absent; |
| callers should use ``.score(field)`` which returns None when missing. |
| """ |
|
|
| |
| id: str |
| name: str |
| organization: str |
| params: str |
| type: str |
| result_type: str |
| scores: dict[str, float] |
| param_value: float | None |
| param_bucket: str | None |
| type_display: str |
|
|
| |
| url: str | None = None |
| verified: bool = False |
| ensemble: bool = False |
| is_new: bool = False |
| date_evaluated: str = "" |
| model_url: str = "" |
|
|
| def score(self, field_name: str) -> float | None: |
| return self.scores.get(field_name) |
|
|
|
|
@dataclass(frozen=True)
class LeaderboardData:
    """The validated leaderboard: header metadata, records, and an id index.

    ``frozen=True`` only blocks re-binding the attributes; the ``models``
    list and ``model_by_id`` dict themselves remain ordinary mutable
    containers.
    """

    # Header values taken from the top level of the JSON document.
    schema_version: str
    benchmark_version: str
    updated: str
    # One record per model entry.
    models: list[ModelRecord]
    # Lookup of the same records keyed by their ``id``.
    model_by_id: dict[str, ModelRecord]
|
|
|
|
| |
|
|
|
|
| _PARAMS_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*B\s*$") |
| _DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$") |
| _EM_DASH = "—" |
|
|
|
|
def _parse_param_value(params: str) -> float | None:
    """Extract the parameter count (in billions) from a params string.

    Returns None when the string is just the em-dash placeholder or when
    it does not match the ``<number>B`` pattern.
    """
    stripped = params.strip()
    if stripped == _EM_DASH:
        return None

    match = _PARAMS_RE.match(params)
    if match is None:
        return None
    return float(match.group(1))
|
|
|
|
| def _param_bucket(value: float | None) -> str | None: |
| """Map a numeric param count (in billions) to a coarse bucket label. |
| |
| Boundaries match util.config.PARAM_BUCKETS: |
| v < 10 → "<10B" |
| 10 ≤ v ≤ 40 → "10B–40B" |
| v > 40 → ">40B" |
| |
| None inputs return None (used to mark models with undisclosed sizes — |
| those models are visible only under the "All sizes" filter, never |
| under a numeric bucket). |
| """ |
| if value is None: |
| return None |
| if value < 10.0: |
| return "<10B" |
| if value <= 40.0: |
| return "10B–40B" |
| return ">40B" |
|
|
|
|
| def _id_label(model_dict: dict, index: int) -> str: |
| raw = model_dict.get("id") |
| if isinstance(raw, str) and raw: |
| return f'models[{index}].id="{raw}"' |
| return f"models[{index}].id=<missing>" |
|
|
|
|
| def _validate_score( |
| model_dict: dict, field: str, prefix: str, errors: list[str], required: bool |
| ) -> float | None: |
| """Validate one score field. Returns the float on success, None on failure |
| (or when an optional field is absent).""" |
| if field not in model_dict: |
| if required: |
| errors.append(f'{prefix}: missing required field "{field}"') |
| return None |
| v = model_dict[field] |
| if v is None: |
| errors.append( |
| f'{prefix}: field "{field}" is null; omit the field instead of using null' |
| ) |
| return None |
| |
| if isinstance(v, bool) or not isinstance(v, (int, float)): |
| errors.append( |
| f'{prefix}: field "{field}" must be numeric, got {type(v).__name__}' |
| ) |
| return None |
| fv = float(v) |
| if not (0.0 <= fv <= 100.0): |
| errors.append(f'{prefix}: field "{field}" must be in [0, 100], got {fv}') |
| return None |
| return fv |
|
|
|
|
| def _validate_optional_bool( |
| model_dict: dict, field_name: str, prefix: str, errors: list[str] |
| ) -> bool: |
| """Validate an optional bool field. Returns the value if valid, False if absent.""" |
| if field_name not in model_dict: |
| return False |
| v = model_dict[field_name] |
| if not isinstance(v, bool): |
| errors.append( |
| f'{prefix}: field "{field_name}" must be a boolean, got {type(v).__name__}' |
| ) |
| return False |
| return v |
|
|
|
|
| def _fail(errors: list[str]) -> None: |
| msg = "[VANTAGE-Bench leaderboard] Schema validation failed:\n" + "\n".join( |
| f" - {e}" for e in errors |
| ) |
| raise ValueError(msg) |
|
|
|
|
| |
|
|
|
|
def _read_json_object(p: Path) -> dict:
    """Read *p* as UTF-8 JSON and return its top-level object.

    Raises:
        ValueError: if the file cannot be read or decoded, is not valid
            JSON, or its top-level value is not an object.
    """
    try:
        raw_text = p.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError; catching it here gives a
        # non-UTF-8 file the same path-bearing message as an I/O failure.
        raise ValueError(
            f"[VANTAGE-Bench leaderboard] Could not read {p}: {e}"
        ) from e

    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"[VANTAGE-Bench leaderboard] JSON parse error in {p}: {e}"
        ) from e

    if not isinstance(data, dict):
        raise ValueError(
            "[VANTAGE-Bench leaderboard] Schema validation failed:\n"
            " - top-level JSON must be an object"
        )
    return data


def _validate_model(
    m: dict, prefix: str, seen_ids: set[str], errors: list[str]
) -> None:
    """Append every schema problem found in one model object to *errors*.

    *prefix* labels each message. *seen_ids* is shared across calls and is
    updated here so that repeated ids can be reported.
    """
    # Required metadata fields: present and a non-empty string.
    for fld in REQUIRED_MODEL_FIELDS:
        if fld not in m:
            errors.append(f'{prefix}: missing required field "{fld}"')
        elif not isinstance(m[fld], str) or not m[fld]:
            errors.append(
                f'{prefix}: field "{fld}" must be a non-empty string, got {m[fld]!r}'
            )

    # Enumerated fields are membership-checked only when they are strings.
    model_type = m.get("type")
    if isinstance(model_type, str) and model_type not in ALLOWED_TYPES:
        errors.append(
            f'{prefix}: field "type" must be one of {ALLOWED_TYPES}, got {model_type!r}'
        )

    rt = m.get("result_type")
    if isinstance(rt, str) and rt not in ALLOWED_RESULT_TYPES:
        errors.append(
            f'{prefix}: field "result_type" must be one of {ALLOWED_RESULT_TYPES}, got {rt!r}'
        )

    # Optional boolean flags.
    for flag in ("verified", "ensemble", "is_new"):
        _validate_optional_bool(m, flag, prefix, errors)

    # Ids must be unique across the whole list.
    mid = m.get("id")
    if isinstance(mid, str) and mid:
        if mid in seen_ids:
            errors.append(f"{prefix}: duplicate id")
        seen_ids.add(mid)

    # Optional url: when present it must be a non-empty string, never null.
    if "url" in m:
        url = m["url"]
        if url is None:
            errors.append(
                f'{prefix}: field "url" is null; omit the field instead of using null'
            )
        elif not isinstance(url, str) or not url:
            errors.append(
                f'{prefix}: field "url" must be a non-empty string when present, got {url!r}'
            )

    # Optional model_url: any string (including empty) is accepted.
    if "model_url" in m:
        mu = m["model_url"]
        if not isinstance(mu, str):
            errors.append(
                f'{prefix}: field "model_url" must be a string, got {type(mu).__name__}'
            )

    # Optional date_evaluated: empty string or YYYY-MM-DD.
    if "date_evaluated" in m:
        de = m["date_evaluated"]
        if not isinstance(de, str):
            errors.append(
                f'{prefix}: field "date_evaluated" must be a string, got {type(de).__name__}'
            )
        # fullmatch, not match: "$" also matches just before a trailing
        # newline, so match() would let "2024-01-01\n" through.
        elif de and not _DATE_RE.fullmatch(de):
            errors.append(
                f'{prefix}: field "date_evaluated" must be YYYY-MM-DD or empty, got {de!r}'
            )

    for fld in REQUIRED_SCORE_FIELDS:
        _validate_score(m, fld, prefix, errors, required=True)

    for fld in OPTIONAL_TASK_FIELDS:
        _validate_score(m, fld, prefix, errors, required=False)


def _build_record(m: dict) -> ModelRecord:
    """Convert one already-validated model object into a ModelRecord."""
    scores: dict[str, float] = {fld: float(m[fld]) for fld in REQUIRED_SCORE_FIELDS}
    # Optional task scores are copied only when the JSON actually has them.
    scores.update({fld: float(m[fld]) for fld in OPTIONAL_TASK_FIELDS if fld in m})

    params_str = m["params"]
    pv = _parse_param_value(params_str)

    return ModelRecord(
        id=m["id"],
        name=m["name"],
        organization=m["organization"],
        params=params_str,
        type=m["type"],
        result_type=m["result_type"],
        scores=scores,
        param_value=pv,
        param_bucket=_param_bucket(pv),
        type_display=TYPE_DISPLAY[m["type"]],
        url=m.get("url"),
        verified=bool(m.get("verified", False)),
        ensemble=bool(m.get("ensemble", False)),
        is_new=bool(m.get("is_new", False)),
        date_evaluated=m.get("date_evaluated", ""),
        model_url=m.get("model_url", ""),
    )


def load_results_json(path: str | Path) -> LeaderboardData:
    """Read, validate, and build the in-memory leaderboard dataset.

    Args:
        path: Location of the leaderboard JSON file.

    Returns:
        A LeaderboardData holding the header values, one ModelRecord per
        entry of ``models`` (in file order), and an id -> record index.

    Raises:
        ValueError: if the file cannot be read or parsed, or if validation
            fails. Schema problems are collected and reported together in
            a single multi-line message.
    """
    data = _read_json_object(Path(path))

    errors: list[str] = []

    # Top-level header fields.
    sv = data.get("schema_version")
    if sv != SCHEMA_VERSION:
        errors.append(
            f'schema_version must equal "{SCHEMA_VERSION}", got {sv!r}'
        )

    bv = data.get("benchmark_version")
    if not isinstance(bv, str) or not bv:
        errors.append("benchmark_version must be a non-empty string")

    updated = data.get("updated")
    # fullmatch rejects a trailing newline that match() + "$" would accept.
    if not isinstance(updated, str) or not _DATE_RE.fullmatch(updated):
        errors.append(f"updated must be a YYYY-MM-DD string, got {updated!r}")

    # Per-model validation runs only when ``models`` is a usable list;
    # otherwise that single problem is reported alongside the header errors.
    models_raw = data.get("models")
    if isinstance(models_raw, list) and models_raw:
        seen_ids: set[str] = set()
        for i, m in enumerate(models_raw):
            if not isinstance(m, dict):
                errors.append(
                    f"models[{i}] must be an object, got {type(m).__name__}"
                )
                continue
            _validate_model(m, _id_label(m, i), seen_ids, errors)
    else:
        errors.append("models must be a non-empty list")

    if errors:
        _fail(errors)

    # Everything below relies on the checks above having passed.
    records = [_build_record(m) for m in models_raw]

    return LeaderboardData(
        schema_version=sv,
        benchmark_version=bv,
        updated=updated,
        models=records,
        model_by_id={r.id: r for r in records},
    )
|
|