"""Data loading and subset configuration for the TabArena leaderboard.

This module owns everything about *where* leaderboard artifacts live and *how*
they are read. Layout (Gradio components) lives in ``views.py`` and
``pages.py``; user-facing copy lives in ``website_texts.py``.

Performance note: the website optimizes for fast first paint. CSVs are tiny and
cached (:func:`load_leaderboard_csv`); the large per-subset PNGs are only
unzipped on demand (:meth:`LBContainer.image_path`) and only for the subset the
user is currently viewing.
"""

from __future__ import annotations

import re
import zipfile
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path

import pandas as pd

# Root directory of the TabArena artifacts, resolved next to this module.
DATA_DIR = Path(__file__).parent / "data"
# BeyondArena artifacts live under their own root (see
# scripts/run_generate_beyondarena_website_artifacts.py in the tabarena repo).
BEYOND_DATA_DIR = Path(__file__).parent / "data_beyondarena"

# --------------------------------------------------------------------------- #
# Subset axes
#
# A leaderboard "subset" is one cell of a 4-axis grid. Two axes are *view
# modifiers* surfaced as toggles (imputation, splits); two are *content subsets*
# surfaced as tab bars (tasks, datasets). Keeping the axis definitions here (as
# data, not as if/elif chains) means adding or reordering a subset is a one-line
# edit. The first value of each axis is its default.
# --------------------------------------------------------------------------- #

# axis -> {value: human label}. Insertion order = display order; first = default.
TASK_LABELS = {
    "all": "All Tasks",
    "classification": "Classification",
    "regression": "Regression",
    "binary": "Binary",
    "multiclass": "Multiclass",
}
DATASET_LABELS = {
    "all": "All Datasets",
    "small": "Small",
    "medium": "Medium",
}

# Short labels used as column headers in the cross-subset overview.
TASK_SHORT = {
    "all": "Overall",
    "classification": "Class.",
    "regression": "Regr.",
    "binary": "Binary",
    "multiclass": "Multi.",
}
DATASET_SHORT = {
    "small": "Small",
    "medium": "Medium",
}

# Optional size note appended to a subset's blurb, keyed by dataset-axis value.
# NOTE(review): "tabpfn" has no entry in DATASET_LABELS, so ``subset_blurb``
# would raise KeyError before ever reaching this note -- confirm whether the
# entry is stale or a label is missing.
DATASET_SIZE_NOTE = {
    "small": "Small datasets contain between 500 and 10,000 samples.",
    "medium": "Medium datasets contain between 10,000 and 250,000 samples.",
    "tabpfn": (
        "TabPFNv2-compatible datasets contain at most 10,000 samples, "
        "500 features, and 10 classes."
    ),
}


@dataclass(frozen=True)
class Subset:
    """One cell of the leaderboard grid (imputation x splits x tasks x datasets)."""

    imputation: str = "yes"  # "yes" | "no"
    splits: str = "all"  # "all" | "lite"
    tasks: str = "all"  # see TASK_LABELS
    datasets: str = "all"  # see DATASET_LABELS

    @property
    def rel_path(self) -> str:
        """Directory of this subset's artifacts, relative to the data root."""
        return (
            f"imputation_{self.imputation}/"
            f"splits_{self.splits}/"
            f"tasks_{self.tasks}/"
            f"datasets_{self.datasets}"
        )


@lru_cache(maxsize=None)
def load_leaderboard_csv(path: str) -> pd.DataFrame:
    """Read a ``website_leaderboard.csv`` (cached; files are tiny and immutable).

    The cache is keyed on the exact ``path`` string, and every caller of the
    same path receives the *same* DataFrame object, so callers must copy it
    before mutating. The ``"1#"`` column, if present, is renamed to ``"#"``.
    """
    df = pd.read_csv(path)
    return df.rename(columns={"1#": "#"})


def unzip_png(base_dir: Path, img_name: str) -> str:
    """Return the path to ``base_dir/img_name``.png, unzipping the ``.png.zip`` on first access.

    Args:
        base_dir: Directory holding ``<img_name>.png`` and/or ``<img_name>.png.zip``.
        img_name: Image name without extension. ``Path.with_suffix`` is used, so
            anything after a dot in ``img_name`` is replaced rather than kept.

    Returns:
        The PNG path as a string. After an extraction the path is returned
        without checking that the archive actually contained that file.

    Raises:
        FileNotFoundError: If neither the PNG nor its ``.png.zip`` exists.
    """
    base = Path(base_dir) / img_name
    img_path = base.with_suffix(".png")
    if img_path.exists():
        # Already extracted (or shipped unzipped): nothing to do.
        return str(img_path)
    with zipfile.ZipFile(base.with_suffix(".png.zip"), "r") as zipf:
        # Extract next to the archive so the next call hits the fast path above.
        zipf.extractall(img_path.parent)
    return str(img_path)


@dataclass
class LBContainer:
    """Loads the artifacts for a single subset under a given data root."""

    data_root: Path
    subset: Subset
    name: str
    n_datasets: int | None = None
    blurb: str | None = None
    base_path: Path = field(init=False)

    def __post_init__(self) -> None:
        """Resolve ``base_path`` and read the dataset count from a marker file.

        The count is encoded in the name of an ``n_datasets_<N>`` entry inside
        the subset directory. When such an entry exists it overrides any
        ``n_datasets`` passed to the constructor; otherwise the passed value
        (default ``None``) is kept.
        """
        self.base_path = Path(self.data_root) / self.subset.rel_path
        for fname in self._listdir():
            # Match digits only and convert, so the field honours its
            # ``int | None`` annotation instead of silently holding a str.
            match = re.match(r"n_datasets_(\d+)", fname)
            if match:
                self.n_datasets = int(match.group(1))
                break

    def _listdir(self) -> list[str]:
        """Return the entry names in ``base_path``, or ``[]`` if it does not exist."""
        try:
            return [p.name for p in self.base_path.iterdir()]
        except FileNotFoundError:
            return []

    def load_df(self) -> pd.DataFrame:
        """Return this subset's leaderboard as a fresh, caller-owned DataFrame."""
        csv_path = (self.base_path / "website_leaderboard.csv").resolve()
        # Copy so that callers mutating the frame cannot poison the shared cache.
        return load_leaderboard_csv(str(csv_path)).copy()

    def image_path(self, img_name: str) -> str:
        """Return the path to ``img_name``.png, unzipping it on first access."""
        return unzip_png(self.base_path, img_name)


def subset_name(subset: Subset) -> str:
    """Human-readable name for a subset, used in figure labels."""
    impute = "with imputation" if subset.imputation == "yes" else "no imputation"
    split = "all repeats" if subset.splits == "all" else "Lite"
    return (
        f"{TASK_LABELS[subset.tasks]} | {DATASET_LABELS[subset.datasets]} "
        f"| {split} | {impute}"
    )


def subset_blurb(subset: Subset, n_datasets: int | None) -> str:
    """One-line description of the subset shown above its figures.

    When ``n_datasets`` is ``None`` (no count available) the number is omitted
    instead of rendering the literal text "None". If the dataset axis has a
    size note, it is appended on a second line.
    """
    datasets_name = DATASET_LABELS[subset.datasets].lower()
    count = "" if n_datasets is None else f"{n_datasets} "
    blurb = (
        f"Leaderboard for {count}datasets "
        f"({datasets_name}, {TASK_LABELS[subset.tasks].lower()}) "
    )
    if subset.splits == "lite":
        blurb += "for one split (1st fold, 1st repeat) "
    blurb += "including all "
    if subset.imputation == "yes":
        blurb += "(imputed) "
    blurb += "models."
    note = DATASET_SIZE_NOTE.get(subset.datasets)
    if note:
        blurb += f"\n{note}"
    return blurb


# --------------------------------------------------------------------------- #
# BeyondArena subsets
#
# BeyondArena diverges from TabArena: there is no imputation/splits/tasks/datasets
# grid. Instead a single axis of subset dimensions (split regime, size bucket,
# feature dimensionality/type) is surfaced as one tab bar, and every leaderboard
# is always computed on the recommended `core` protocol (`["core", ]`; the
# "full" subset is `core` with no extra filter). The artifacts are produced by
# scripts/run_generate_beyondarena_website_artifacts.py in the tabarena repo, whose
# `BEYOND_SUBSETS` keys must match the labels below.
# --------------------------------------------------------------------------- #

# label -> human name. Insertion order = tab-bar order; first = default. Groups are
# only used to draw section separators in the tab bar / copy.
BEYOND_SUBSET_LABELS = {
    "full": "Full",
    "random": "IID",
    "temporal": "Temporal",
    "grouped": "Grouped",
    "tiny": "Tiny",
    "small": "Small",
    "medium": "Medium",
    "large": "Large",
    "low-dim": "Low-dim",
    "high-dim": "High-dim",
    "text": "Text",
    "high-cardinality": "High-cardinality",
}

# One-line description shown above each subset's figures. Kept in sync with the
# BeyondArena subset predicates (see BeyondArenaContext.SUBSET_PREDICATES).
BEYOND_SUBSET_NOTE = {
    "full": "All BeyondArena datasets, on the recommended core protocol.",
    "random": "IID (randomly split) tasks only.",
    "temporal": "Temporally split tasks only — train on the past, test on the future.",
    "grouped": "Group-wise split tasks only — disjoint groups between train and test.",
    "tiny": "Tiny datasets contain at most 1,000 training rows.",
    "small": "Small datasets contain between 1,001 and 10,000 training rows.",
    "medium": "Medium datasets contain between 10,001 and 100,000 training rows.",
    "large": "Large datasets contain between 100,001 and 1,000,000 training rows.",
    "low-dim": "Low-dimensional datasets have at most 100 columns after preprocessing.",
    "high-dim": "High-dimensional datasets have more than 100 columns after preprocessing.",
    "text": "Datasets that contain one or more text columns.",
    "high-cardinality": "Datasets that contain one or more high-cardinality categorical columns.",
}


@dataclass(frozen=True)
class BeyondSubset:
    """One cell of the BeyondArena leaderboard — a single subset dimension, always on core."""

    subset: str = "full"  # see BEYOND_SUBSET_LABELS

    @property
    def rel_path(self) -> str:
        """Directory of this subset's artifacts, relative to the data root."""
        return f"subsets/{self.subset}"


def beyond_subset_name(subset: BeyondSubset) -> str:
    """Human-readable name for a BeyondArena subset, used in figure labels."""
    return f"{BEYOND_SUBSET_LABELS[subset.subset]} · core"


def beyond_subset_blurb(subset: BeyondSubset, n_datasets: int | None) -> str:
    """One-line description of a BeyondArena subset shown above its figures.

    When ``n_datasets`` is ``None`` (no count available) the number is omitted
    instead of rendering the literal text "None". If the subset has an entry in
    ``BEYOND_SUBSET_NOTE``, it is appended on a second line.
    """
    human = BEYOND_SUBSET_LABELS[subset.subset].lower()
    count = "" if n_datasets is None else f"{n_datasets} "
    blurb = (
        f"Leaderboard for {count}BeyondArena datasets ({human}), evaluated on the "
        "recommended core protocol."
    )
    note = BEYOND_SUBSET_NOTE.get(subset.subset)
    if note:
        blurb += f"\n{note}"
    return blurb