Spaces:

clemson-computing
/

VANTAGE-Bench-Leaderboard

Running

App Files Files Community

VANTAGE-Bench-Leaderboard / util /data.py

Clemson-Computing-User

VANTAGE-Bench v1.0

8b5161a about 1 month ago

Raw

History Blame Contribute Delete

11.5 kB

	"""Load and validate the static leaderboard JSON.

	Public surface:
	ModelRecord — frozen dataclass for one row of the leaderboard.
	LeaderboardData — frozen dataclass wrapping the full dataset + index.
	load_results_json — read + validate + build the dataset.

	Validation is strict: any schema violation raises ValueError with a
	single multi-line message listing every problem encountered. The app
	should let this propagate so the HF Space build fails loudly when the
	JSON is bad.
	"""

	from __future__ import annotations

	import json
	import re
	from dataclasses import dataclass
	from pathlib import Path

	from .config import (
	ALLOWED_RESULT_TYPES,
	ALLOWED_TYPES,
	OPTIONAL_TASK_FIELDS,
	REQUIRED_MODEL_FIELDS,
	REQUIRED_SCORE_FIELDS,
	SCHEMA_VERSION,
	TYPE_DISPLAY,
	)


	# -- Dataclasses -----------------------------------------------------------


	@dataclass(frozen=True)
	class ModelRecord:
	"""One model row, post-validation.

	`scores` is a flat dict keyed by JSON field name (e.g. "overall",
	"spatial", "2d_localization"). Optional task scores may be absent;
	callers should use ``.score(field)`` which returns None when missing.
	"""

	# Required fields
	id: str
	name: str
	organization: str
	params: str
	type: str # "open" or "closed"
	result_type: str # "single" or "ensemble"
	scores: dict[str, float]
	param_value: float \| None
	param_bucket: str \| None
	type_display: str

	# Optional fields with defaults
	url: str \| None = None
	verified: bool = False
	ensemble: bool = False
	is_new: bool = False
	date_evaluated: str = ""
	model_url: str = ""

	def score(self, field_name: str) -> float \| None:
	return self.scores.get(field_name)


	@dataclass(frozen=True)
	class LeaderboardData:
	    """Immutable container for the whole validated leaderboard.

	    Holds the top-level metadata copied from the source JSON together with
	    the model rows, both as an ordered list and as an id-keyed lookup.
	    """

	    # Top-level metadata
	    schema_version: str
	    benchmark_version: str
	    updated: str  # YYYY-MM-DD as in the source JSON

	    # Model rows, plus the same rows indexed by ``ModelRecord.id``
	    models: list[ModelRecord]
	    model_by_id: dict[str, ModelRecord]


	# -- Internals -------------------------------------------------------------


	_PARAMS_RE = re.compile(r"^\s(\d+(?:\.\d+)?)\sB\s*$")
	_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
	_EM_DASH = "—"


	def _parse_param_value(params: str) -> float \| None:
	"""Return the numeric param count in billions, or None if undisclosed/unparseable."""
	if params.strip() == _EM_DASH:
	return None
	m = _PARAMS_RE.match(params)
	return float(m.group(1)) if m else None


	def _param_bucket(value: float \| None) -> str \| None:
	"""Map a numeric param count (in billions) to a coarse bucket label.

	Boundaries match util.config.PARAM_BUCKETS:
	v < 10 → "<10B"
	10 ≤ v ≤ 40 → "10B–40B"
	v > 40 → ">40B"

	None inputs return None (used to mark models with undisclosed sizes —
	those models are visible only under the "All sizes" filter, never
	under a numeric bucket).
	"""
	if value is None:
	return None
	if value < 10.0:
	return "<10B"
	if value <= 40.0:
	return "10B–40B"
	return ">40B"


	def _id_label(model_dict: dict, index: int) -> str:
	    """Build the prefix used to identify a model entry in error messages.

	    The label always carries the list position; it also quotes the entry's
	    ``id`` when that is a non-empty string, and says ``<missing>`` otherwise.
	    """
	    candidate = model_dict.get("id")
	    has_usable_id = isinstance(candidate, str) and bool(candidate)
	    if not has_usable_id:
	        return f"models[{index}].id=<missing>"
	    return f'models[{index}].id="{candidate}"'


	def _validate_score(
	model_dict: dict, field: str, prefix: str, errors: list[str], required: bool
	) -> float \| None:
	"""Validate one score field. Returns the float on success, None on failure
	(or when an optional field is absent)."""
	if field not in model_dict:
	if required:
	errors.append(f'{prefix}: missing required field "{field}"')
	return None
	v = model_dict[field]
	if v is None:
	errors.append(
	f'{prefix}: field "{field}" is null; omit the field instead of using null'
	)
	return None
	# bool is a subclass of int; reject it explicitly.
	if isinstance(v, bool) or not isinstance(v, (int, float)):
	errors.append(
	f'{prefix}: field "{field}" must be numeric, got {type(v).__name__}'
	)
	return None
	fv = float(v)
	if not (0.0 <= fv <= 100.0):
	errors.append(f'{prefix}: field "{field}" must be in [0, 100], got {fv}')
	return None
	return fv


	def _validate_optional_bool(
	    model_dict: dict, field_name: str, prefix: str, errors: list[str]
	) -> bool:
	    """Check an optional boolean field on a model object.

	    Returns the stored value when it is a real ``bool``. Returns False when
	    the field is absent, and also when it is present with a non-bool value,
	    in which case a message is appended to ``errors``.
	    """
	    absent = object()
	    value = model_dict.get(field_name, absent)
	    if value is absent:
	        return False
	    if isinstance(value, bool):
	        return value
	    kind = type(value).__name__
	    errors.append(f'{prefix}: field "{field_name}" must be a boolean, got {kind}')
	    return False


	def _fail(errors: list[str]) -> None:
	    """Raise ValueError carrying every collected problem, one per line.

	    Never returns normally: the message is a fixed header followed by each
	    entry of ``errors`` rendered as a dash-prefixed bullet.
	    """
	    header = "[VANTAGE-Bench leaderboard] Schema validation failed:\n"
	    bullets = [f" - {problem}" for problem in errors]
	    raise ValueError(header + "\n".join(bullets))


	# -- Public loader ---------------------------------------------------------


	def _validate_model_entry(
	    m: dict, prefix: str, seen_ids: set[str], errors: list[str]
	) -> None:
	    """Append every schema problem found in one model object to ``errors``.

	    ``seen_ids`` is updated with the entry's id so later entries can be
	    checked for duplicates.
	    """
	    # Required string fields
	    for fld in REQUIRED_MODEL_FIELDS:
	        if fld not in m:
	            errors.append(f'{prefix}: missing required field "{fld}"')
	        elif not isinstance(m[fld], str) or not m[fld]:
	            errors.append(
	                f'{prefix}: field "{fld}" must be a non-empty string, got {m[fld]!r}'
	            )

	    # type must be in allowed set
	    if isinstance(m.get("type"), str) and m["type"] not in ALLOWED_TYPES:
	        errors.append(
	            f'{prefix}: field "type" must be one of {ALLOWED_TYPES}, got {m["type"]!r}'
	        )

	    # result_type must be in allowed set
	    rt = m.get("result_type")
	    if isinstance(rt, str) and rt not in ALLOWED_RESULT_TYPES:
	        errors.append(
	            f'{prefix}: field "result_type" must be one of {ALLOWED_RESULT_TYPES}, got {rt!r}'
	        )

	    # Optional bool fields
	    _validate_optional_bool(m, "verified", prefix, errors)
	    _validate_optional_bool(m, "ensemble", prefix, errors)
	    _validate_optional_bool(m, "is_new", prefix, errors)

	    # Uniqueness on id
	    mid = m.get("id")
	    if isinstance(mid, str) and mid:
	        if mid in seen_ids:
	            errors.append(f"{prefix}: duplicate id")
	        seen_ids.add(mid)

	    # url (optional legacy field — model name link)
	    if "url" in m:
	        url = m["url"]
	        if url is None:
	            errors.append(
	                f'{prefix}: field "url" is null; omit the field instead of using null'
	            )
	        elif not isinstance(url, str) or not url:
	            errors.append(
	                f'{prefix}: field "url" must be a non-empty string when present, got {url!r}'
	            )

	    # model_url (optional — HF model card or project page link)
	    if "model_url" in m:
	        mu = m["model_url"]
	        if not isinstance(mu, str):
	            errors.append(
	                f'{prefix}: field "model_url" must be a string, got {type(mu).__name__}'
	            )

	    # date_evaluated (optional — YYYY-MM-DD or empty string)
	    if "date_evaluated" in m:
	        de = m["date_evaluated"]
	        if not isinstance(de, str):
	            errors.append(
	                f'{prefix}: field "date_evaluated" must be a string, got {type(de).__name__}'
	            )
	        # fullmatch: ``$`` with match() would also accept a trailing newline.
	        elif de and not _DATE_RE.fullmatch(de):
	            errors.append(
	                f'{prefix}: field "date_evaluated" must be YYYY-MM-DD or empty, got {de!r}'
	            )

	    # Required score fields
	    for fld in REQUIRED_SCORE_FIELDS:
	        _validate_score(m, fld, prefix, errors, required=True)

	    # Optional task score fields
	    for fld in OPTIONAL_TASK_FIELDS:
	        _validate_score(m, fld, prefix, errors, required=False)


	def _build_model_record(m: dict) -> ModelRecord:
	    """Turn one already-validated model object into a ModelRecord."""
	    scores: dict[str, float] = {}
	    for fld in REQUIRED_SCORE_FIELDS:
	        scores[fld] = float(m[fld])
	    for fld in OPTIONAL_TASK_FIELDS:
	        if fld in m:
	            scores[fld] = float(m[fld])

	    # NOTE(review): assumes "params" is listed in REQUIRED_MODEL_FIELDS and is
	    # therefore a validated non-empty string by now — confirm in util.config.
	    params_str = m["params"]
	    pv = _parse_param_value(params_str)
	    pb = _param_bucket(pv)

	    return ModelRecord(
	        id=m["id"],
	        name=m["name"],
	        organization=m["organization"],
	        params=params_str,
	        type=m["type"],
	        result_type=m["result_type"],
	        scores=scores,
	        param_value=pv,
	        param_bucket=pb,
	        type_display=TYPE_DISPLAY[m["type"]],
	        url=m.get("url"),
	        verified=bool(m.get("verified", False)),
	        ensemble=bool(m.get("ensemble", False)),
	        is_new=bool(m.get("is_new", False)),
	        date_evaluated=m.get("date_evaluated", ""),
	        model_url=m.get("model_url", ""),
	    )


	def load_results_json(path: str | Path) -> LeaderboardData:
	    """Read, validate, and build the in-memory leaderboard dataset.

	    Hard-fails with ValueError listing every problem if validation does not pass.

	    Args:
	        path: Location of the leaderboard JSON file.

	    Returns:
	        A LeaderboardData holding the top-level metadata, the model records
	        in file order, and an id-keyed index of the same records.

	    Raises:
	        ValueError: If the file cannot be read or decoded as UTF-8, is not
	            valid JSON, or violates the schema. Schema violations are all
	            collected and reported together in one multi-line message.
	    """
	    p = Path(path)
	    try:
	        raw_text = p.read_text(encoding="utf-8")
	    except (OSError, UnicodeDecodeError) as e:
	        # UnicodeDecodeError is already a ValueError, but catching it here
	        # gives a non-UTF-8 file the same branded message as an unreadable one.
	        raise ValueError(
	            f"[VANTAGE-Bench leaderboard] Could not read {p}: {e}"
	        ) from e

	    try:
	        data = json.loads(raw_text)
	    except json.JSONDecodeError as e:
	        raise ValueError(
	            f"[VANTAGE-Bench leaderboard] JSON parse error in {p}: {e}"
	        ) from e

	    if not isinstance(data, dict):
	        raise ValueError(
	            "[VANTAGE-Bench leaderboard] Schema validation failed:\n"
	            " - top-level JSON must be an object"
	        )

	    errors: list[str] = []

	    # -- Top-level fields ---------------------------------------------------

	    sv = data.get("schema_version")
	    if sv != SCHEMA_VERSION:
	        errors.append(
	            f'schema_version must equal "{SCHEMA_VERSION}", got {sv!r}'
	        )

	    bv = data.get("benchmark_version")
	    if not isinstance(bv, str) or not bv:
	        errors.append("benchmark_version must be a non-empty string")

	    updated = data.get("updated")
	    # fullmatch: ``$`` with match() would also accept a trailing newline.
	    if not isinstance(updated, str) or not _DATE_RE.fullmatch(updated):
	        errors.append(f"updated must be a YYYY-MM-DD string, got {updated!r}")

	    models_raw = data.get("models")
	    if not isinstance(models_raw, list) or not models_raw:
	        # Without a model list there is nothing further to validate, so
	        # report what has been collected so far and stop.
	        errors.append("models must be a non-empty list")
	        _fail(errors)
	        raise AssertionError("unreachable: _fail always raises")

	    # -- Per-model validation ----------------------------------------------

	    seen_ids: set[str] = set()

	    for i, m in enumerate(models_raw):
	        if not isinstance(m, dict):
	            errors.append(
	                f"models[{i}] must be an object, got {type(m).__name__}"
	            )
	            continue
	        _validate_model_entry(m, _id_label(m, i), seen_ids, errors)

	    if errors:
	        _fail(errors)

	    # -- Build records (guaranteed valid past this point) ------------------

	    records = [_build_model_record(m) for m in models_raw]

	    return LeaderboardData(
	        schema_version=sv,
	        benchmark_version=bv,
	        updated=updated,
	        models=records,
	        model_by_id={r.id: r for r in records},
	    )