"""Data loading and subset configuration for the TabArena leaderboard.
This module owns everything about *where* leaderboard artifacts live and *how*
they are read. Layout (Gradio components) lives in ``views.py`` and ``pages.py``;
user-facing copy lives in ``website_texts.py``.
Performance note: the website optimizes for fast first paint. CSVs are tiny and
cached (:func:`load_leaderboard_csv`); the large per-subset PNGs are only
unzipped on demand (:meth:`LBContainer.image_path`) and only for the subset the
user is currently viewing.
"""
from __future__ import annotations
import re
import zipfile
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
import pandas as pd
# Root of the TabArena leaderboard artifacts, located next to this module.
DATA_DIR = Path(__file__).parent.joinpath("data")
# BeyondArena artifacts are kept under a separate root (see
# scripts/run_generate_beyondarena_website_artifacts.py in the tabarena repo).
BEYOND_DATA_DIR = Path(__file__).parent.joinpath("data_beyondarena")
# --------------------------------------------------------------------------- #
# Subset axes
#
# A leaderboard "subset" is one cell of a 4-axis grid. Two axes are *view
# modifiers* surfaced as toggles (imputation, splits); two are *content subsets*
# surfaced as tab bars (tasks, datasets). Keeping the axis definitions here (as
# data, not as if/elif chains) means adding or reordering a subset is a one-line
# edit. The first value of each axis is its default.
# --------------------------------------------------------------------------- #
# axis -> {value: human label}. Insertion order = display order; first = default.
# Task axis: subset key -> human label. These keys are the values accepted by
# ``Subset.tasks``.
TASK_LABELS = {
    "all": "All Tasks",
    "classification": "Classification",
    "regression": "Regression",
    "binary": "Binary",
    "multiclass": "Multiclass",
}
# Dataset axis: subset key -> human label. These keys are the values accepted
# by ``Subset.datasets``.
DATASET_LABELS = {
    "all": "All Datasets",
    "small": "Small",
    "medium": "Medium",
}
# Short labels used as column headers in the cross-subset overview.
TASK_SHORT = {
    "all": "Overall",
    "classification": "Class.",
    "regression": "Regr.",
    "binary": "Binary",
    "multiclass": "Multi.",
}
# Short dataset labels. Unlike DATASET_LABELS there is no "all" entry here.
# NOTE(review): presumably "all" is covered by TASK_SHORT["all"] in the
# overview; confirm against the caller.
DATASET_SHORT = {
    "small": "Small",
    "medium": "Medium",
}
# Optional extra sentence per dataset subset; ``subset_blurb`` appends it when
# the subset's dataset key has an entry here.
# NOTE(review): "tabpfn" has no counterpart in DATASET_LABELS; confirm whether
# this entry is still reachable.
DATASET_SIZE_NOTE = {
    "small": "Small datasets contain between 500 and 10,000 samples.",
    "medium": "Medium datasets contain between 10,000 and 250,000 samples.",
    "tabpfn": (
        "TabPFNv2-compatible datasets contain at most 10,000 samples, "
        "500 features, and 10 classes."
    ),
}
@dataclass(frozen=True)
class Subset:
    """A single leaderboard grid cell, addressed by its four axis values.

    The axes are imputation x splits x tasks x datasets. Instances are frozen,
    so they are hashable and can be compared by value.
    """

    imputation: str = "yes"  # "yes" | "no"
    splits: str = "all"  # "all" | "lite"
    tasks: str = "all"  # one of the TASK_LABELS keys
    datasets: str = "all"  # one of the DATASET_LABELS keys

    @property
    def rel_path(self) -> str:
        """Slash-separated directory of this cell, one segment per axis."""
        segments = (
            f"imputation_{self.imputation}",
            f"splits_{self.splits}",
            f"tasks_{self.tasks}",
            f"datasets_{self.datasets}",
        )
        return "/".join(segments)
@lru_cache(maxsize=None)
def load_leaderboard_csv(path: str) -> pd.DataFrame:
    """Load a ``website_leaderboard.csv`` and expose its ``1#`` column as ``#``.

    Results are memoized per ``path``, so repeated calls hand back the very
    same DataFrame object; copy it before mutating.
    """
    column_fixes = {"1#": "#"}
    return pd.read_csv(path).rename(columns=column_fixes)
def unzip_png(base_dir: Path, img_name: str) -> str:
    """Resolve ``base_dir/img_name`` to a ``.png`` path, extracting it if needed.

    When the PNG is not on disk yet, the sibling ``.png.zip`` archive is
    extracted into the same directory first. The path is returned as a string.

    NOTE(review): both file names are derived with ``Path.with_suffix``, which
    replaces an existing suffix, so an ``img_name`` containing a dot loses the
    part after its last dot.
    """
    stem = Path(base_dir) / img_name
    png_file = stem.with_suffix(".png")
    if not png_file.exists():
        archive = stem.with_suffix(".png.zip")
        with zipfile.ZipFile(archive, "r") as bundle:
            # Extracts every member of the archive next to the expected PNG.
            bundle.extractall(png_file.parent)
    return str(png_file)
@dataclass
class LBContainer:
    """Loads the artifacts for a single subset under a given data root.

    On construction the subset directory (``data_root / subset.rel_path``) is
    scanned for a marker file named ``n_datasets_<value>``. When one is found,
    its value replaces whatever was passed as ``n_datasets``.
    """

    data_root: Path
    subset: Subset
    name: str
    n_datasets: int | None = None
    blurb: str | None = None
    base_path: Path = field(init=False)

    def __post_init__(self) -> None:
        """Resolve ``base_path`` and pick up ``n_datasets`` from the marker file."""
        self.base_path = Path(self.data_root) / self.subset.rel_path
        for fname in self._listdir():
            match = re.match(r"n_datasets_(.+)", fname)
            if match:
                raw = match.group(1)
                # The field is declared as an int, so store a real int when the
                # marker suffix is purely numeric. Any other suffix is kept
                # verbatim, as before, instead of raising.
                if raw.isascii() and raw.isdigit():
                    self.n_datasets = int(raw)
                else:
                    self.n_datasets = raw
                break

    def _listdir(self) -> list[str]:
        """Return the entry names in ``base_path``, or ``[]`` if it does not exist."""
        try:
            return [p.name for p in self.base_path.iterdir()]
        except FileNotFoundError:
            return []

    def load_df(self) -> pd.DataFrame:
        """Return a copy of this subset's ``website_leaderboard.csv`` table.

        The path is resolved before being handed to ``load_leaderboard_csv``;
        the copy keeps callers from mutating the shared loaded frame.
        """
        csv_path = (self.base_path / "website_leaderboard.csv").resolve()
        return load_leaderboard_csv(str(csv_path)).copy()

    def image_path(self, img_name: str) -> str:
        """Return the path to ``img_name``.png, unzipping it on first access."""
        return unzip_png(self.base_path, img_name)
def subset_name(subset: Subset) -> str:
    """Build the figure-label name: tasks | datasets | splits | imputation."""
    if subset.imputation == "yes":
        imputation_part = "with imputation"
    else:
        imputation_part = "no imputation"
    if subset.splits == "all":
        splits_part = "all repeats"
    else:
        splits_part = "Lite"
    parts = (
        TASK_LABELS[subset.tasks],
        DATASET_LABELS[subset.datasets],
        splits_part,
        imputation_part,
    )
    return " | ".join(parts)
def subset_blurb(subset: Subset, n_datasets: int | None) -> str:
    """One-line description of the subset shown above its figures.

    ``n_datasets`` is interpolated as-is. When ``DATASET_SIZE_NOTE`` has an
    entry for ``subset.datasets``, that note is appended on a new line.
    Raises ``KeyError`` if ``subset.tasks`` or ``subset.datasets`` is not a
    key of ``TASK_LABELS`` / ``DATASET_LABELS``.
    """
    datasets_name = DATASET_LABELS[subset.datasets].lower()
    blurb = (
        f"Leaderboard for {n_datasets} datasets "
        f"({datasets_name}, {TASK_LABELS[subset.tasks].lower()}) "
    )
    if subset.splits == "lite":
        blurb += "for one split (1st fold, 1st repeat) "
    blurb += "including all "
    if subset.imputation == "yes":
        blurb += "(imputed) "
    blurb += "models."
    note = DATASET_SIZE_NOTE.get(subset.datasets)
    if note:
        # The line break must be an escape sequence: a raw newline inside a
        # single-quoted f-string is a syntax error.
        blurb += f"\n{note}"
    return blurb
# --------------------------------------------------------------------------- #
# BeyondArena subsets
#
# BeyondArena diverges from TabArena: there is no imputation/splits/tasks/datasets
# grid. Instead a single axis of subset dimensions (split regime, size bucket,
# feature dimensionality/type) is surfaced as one tab bar, and every leaderboard
# is always computed on the recommended `core` protocol (`["core", ]`; the
# "full" subset is `core` with no extra filter). The artifacts are produced by
# scripts/run_generate_beyondarena_website_artifacts.py in the tabarena repo, whose
# `BEYOND_SUBSETS` keys must match the labels below.
# --------------------------------------------------------------------------- #
# label -> human name. Insertion order = tab-bar order; first = default. Groups are
# only used to draw section separators in the tab bar / copy.
# The keys are also the values accepted by ``BeyondSubset.subset`` and therefore
# the directory names under ``subsets/``.
BEYOND_SUBSET_LABELS = {
    "full": "Full",
    # Split regime.
    "random": "IID",
    "temporal": "Temporal",
    "grouped": "Grouped",
    # Size bucket (by training rows).
    "tiny": "Tiny",
    "small": "Small",
    "medium": "Medium",
    "large": "Large",
    # Feature dimensionality / type.
    "low-dim": "Low-dim",
    "high-dim": "High-dim",
    "text": "Text",
    "high-cardinality": "High-cardinality",
}
# One-line description shown above each subset's figures. Kept in sync with the
# BeyondArena subset predicates (see BeyondArenaContext.SUBSET_PREDICATES).
# Subset key -> sentence appended to the blurb by ``beyond_subset_blurb``.
# The dashes below are real em dashes (U+2014); they had previously been
# corrupted into a stray Thai character by an encoding round-trip.
BEYOND_SUBSET_NOTE = {
    "full": "All BeyondArena datasets, on the recommended core protocol.",
    "random": "IID (randomly split) tasks only.",
    "temporal": "Temporally split tasks only — train on the past, test on the future.",
    "grouped": "Group-wise split tasks only — disjoint groups between train and test.",
    "tiny": "Tiny datasets contain at most 1,000 training rows.",
    "small": "Small datasets contain between 1,001 and 10,000 training rows.",
    "medium": "Medium datasets contain between 10,001 and 100,000 training rows.",
    "large": "Large datasets contain between 100,001 and 1,000,000 training rows.",
    "low-dim": "Low-dimensional datasets have at most 100 columns after preprocessing.",
    "high-dim": "High-dimensional datasets have more than 100 columns after preprocessing.",
    "text": "Datasets that contain one or more text columns.",
    "high-cardinality": "Datasets that contain one or more high-cardinality categorical columns.",
}
@dataclass(frozen=True)
class BeyondSubset:
    """A BeyondArena leaderboard cell: one subset dimension, always on core."""

    subset: str = "full"  # one of the BEYOND_SUBSET_LABELS keys

    @property
    def rel_path(self) -> str:
        """Slash-separated directory of this subset (``subsets/<key>``)."""
        return "/".join(("subsets", f"{self.subset}"))
def beyond_subset_name(subset: BeyondSubset) -> str:
    """Human-readable name for a BeyondArena subset, used in figure labels.

    The result is ``"<label> · core"``. Raises ``KeyError`` if ``subset.subset``
    is not a key of ``BEYOND_SUBSET_LABELS``.
    """
    # The separator is a middle dot (U+00B7); it had been corrupted into two
    # Thai characters by an encoding round-trip.
    return f"{BEYOND_SUBSET_LABELS[subset.subset]} · core"
def beyond_subset_blurb(subset: BeyondSubset, n_datasets: int | None) -> str:
    """One-line description of a BeyondArena subset shown above its figures.

    ``n_datasets`` is interpolated as-is. When ``BEYOND_SUBSET_NOTE`` has an
    entry for ``subset.subset``, that note is appended on a new line. Raises
    ``KeyError`` if ``subset.subset`` is not a key of ``BEYOND_SUBSET_LABELS``.
    """
    human = BEYOND_SUBSET_LABELS[subset.subset].lower()
    blurb = (
        f"Leaderboard for {n_datasets} BeyondArena datasets ({human}), evaluated on the "
        "recommended core protocol."
    )
    note = BEYOND_SUBSET_NOTE.get(subset.subset)
    if note:
        # The line break must be an escape sequence: a raw newline inside a
        # single-quoted f-string is a syntax error.
        blurb += f"\n{note}"
    return blurb