from __future__ import annotations
import importlib
import math
import traceback
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Callable
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing, fetch_openml, load_breast_cancer, load_diabetes, load_digits, load_iris, load_wine, make_classification
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
AdaBoostClassifier,
AdaBoostRegressor,
ExtraTreesClassifier,
ExtraTreesRegressor,
GradientBoostingClassifier,
GradientBoostingRegressor,
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge, LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
accuracy_score,
f1_score,
mean_absolute_error,
mean_squared_error,
r2_score,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, SVR
APP_TITLE = "tabBench"
TABFM_MODEL_ID = "google/tabfm-1.0.0-pytorch"
RANDOM_STATE = 42
CANDY_DATA_URL = "https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv"
GOOGLE_COLORS = ["#4285F4", "#DB4437", "#F4B400", "#0F9D58", "#A142F4", "#00ACC1"]
METRIC_CHOICES = ["accuracy", "f1_weighted", "roc_auc", "rmse", "mae", "r2", "seconds"]
TABFM_PRESETS = {
"Fast": {"n_estimators": 1, "max_num_rows": 256, "max_num_features": 64, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 256},
"Balanced": {"n_estimators": 4, "max_num_rows": 512, "max_num_features": 128, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 512},
"Default": {"n_estimators": 32, "max_num_rows": None, "max_num_features": 500, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 1000},
"Ensemble": {"n_estimators": 32, "max_num_rows": None, "max_num_features": 500, "batch_size": 1, "enable_nnls": True, "n_feature_crosses": "sqrt", "n_svd_features": "sqrt", "max_eval_rows": 1000},
}
@dataclass(frozen=True)
class DatasetSpec:
name: str
task: str
target: str
source: str
rows: int
description: str
loader: Callable[[int, int], pd.DataFrame]
def _add_categorical_noise(df: pd.DataFrame, rng: np.random.Generator, prefix: str) -> pd.DataFrame:
df = df.copy()
df[f"{prefix}_segment"] = rng.choice(["A", "B", "C", "D"], len(df), p=[0.35, 0.25, 0.25, 0.15])
df[f"{prefix}_region"] = rng.choice(["north", "south", "east", "west"], len(df))
return df
def sample_df(df: pd.DataFrame, limit: int, seed: int) -> pd.DataFrame:
return df.sample(min(limit, len(df)), random_state=seed).reset_index(drop=True)
def find_first_data_file(root: str | Path, suffixes: tuple[str, ...]) -> Path:
root = Path(root)
for suffix in suffixes:
matches = sorted(root.rglob(f"*{suffix}"))
if matches:
return matches[0]
raise FileNotFoundError(f"No data file with suffixes {suffixes} found in {root}")
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df.columns = (
pd.Index(df.columns)
.astype(str)
.str.strip()
.str.lower()
.str.replace(r"[^0-9a-z]+", "_", regex=True)
.str.strip("_")
)
return df
def choose_col(df: pd.DataFrame, candidates: list[str], contains: list[str] | None = None) -> str:
normalized = {c.lower(): c for c in df.columns}
for candidate in candidates:
key = candidate.lower()
if key in normalized:
return normalized[key]
if contains:
for col in df.columns:
lower = str(col).lower()
if all(part in lower for part in contains):
return col
raise KeyError(f"None of {candidates} found in columns.")
def kaggle_csv(dataset_id: str, preferred_names: tuple[str, ...] = ()) -> pd.DataFrame:
import kagglehub
path = Path(kagglehub.dataset_download(dataset_id))
csvs = sorted(path.rglob("*.csv"))
if preferred_names:
for preferred in preferred_names:
for csv in csvs:
if preferred.lower() in csv.name.lower():
return pd.read_csv(csv)
if not csvs:
raise FileNotFoundError(f"No CSV files found in Kaggle dataset {dataset_id}.")
return pd.read_csv(csvs[0])
def select_numeric_features(df: pd.DataFrame, target: str, max_features: int = 12) -> pd.DataFrame:
numeric = df.select_dtypes(include=np.number).columns.tolist()
cols = [c for c in numeric if c != target][:max_features]
return df[[*cols, target]].dropna(subset=[target])
@lru_cache(maxsize=1)
def openml_titanic() -> pd.DataFrame:
data = fetch_openml(data_id=40945, as_frame=True, parser="auto")
df = data.frame.copy()
keep = [c for c in ["pclass", "sex", "age", "sibsp", "parch", "fare", "embarked", "survived"] if c in df.columns]
return df[keep].dropna(subset=["survived"])
@lru_cache(maxsize=1)
def openml_ames_housing() -> pd.DataFrame:
data = fetch_openml(data_id=42165, as_frame=True, parser="auto")
df = data.frame.copy()
target = "SalePrice" if "SalePrice" in df.columns else data.target_names[0]
useful = [
"OverallQual",
"GrLivArea",
"GarageCars",
"GarageArea",
"TotalBsmtSF",
"FullBath",
"YearBuilt",
"Neighborhood",
"HouseStyle",
target,
]
cols = [c for c in useful if c in df.columns]
return df[cols].rename(columns={target: "sale_price"}).dropna(subset=["sale_price"])
@lru_cache(maxsize=1)
def openml_adult_income() -> pd.DataFrame:
data = fetch_openml(data_id=1590, as_frame=True, parser="auto")
df = data.frame.copy()
if "class" in df.columns:
df = df.rename(columns={"class": "income_gt_50k"})
elif "income" in df.columns:
df = df.rename(columns={"income": "income_gt_50k"})
return df.dropna(subset=["income_gt_50k"])
@lru_cache(maxsize=1)
def sklearn_california_housing() -> pd.DataFrame:
data = fetch_california_housing(as_frame=True)
df = data.frame.rename(columns={"MedHouseVal": "median_house_value"})
return df
@lru_cache(maxsize=1)
def fivethirtyeight_candy() -> pd.DataFrame:
df = pd.read_csv(CANDY_DATA_URL)
return df.drop(columns=[c for c in ["competitorname"] if c in df.columns]).dropna(subset=["winpercent"])
@lru_cache(maxsize=1)
def kaggle_credit_card_fraud() -> pd.DataFrame:
import kagglehub
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_path = find_first_data_file(path, (".csv",))
df = pd.read_csv(csv_path)
if "Class" in df.columns:
df = df.rename(columns={"Class": "fraud"})
return df.dropna(subset=["fraud"])
@lru_cache(maxsize=1)
def kaggle_epirecipes() -> pd.DataFrame:
import kagglehub
path = kagglehub.dataset_download("hugodarwood/epirecipes")
try:
json_path = find_first_data_file(path, (".json",))
df = pd.read_json(json_path)
except FileNotFoundError:
csv_path = find_first_data_file(path, (".csv",))
df = pd.read_csv(csv_path)
if "rating" not in df.columns:
raise ValueError("Epicurious dataset does not include a rating column.")
preferred = [
"calories",
"protein",
"fat",
"sodium",
"dessert",
"dinner",
"breakfast",
"healthy",
"vegetarian",
"vegan",
"cakeweek",
"rating",
]
cols = [c for c in preferred if c in df.columns]
return df[cols].dropna(subset=["rating"])
@lru_cache(maxsize=1)
def kaggle_calcofi() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("sohier/calcofi", ("bottle",)))
target = choose_col(df, ["t_deg_c", "temperature"], ["t", "deg"])
salinity = choose_col(df, ["salnty", "salinity"], ["sal"])
cols = [c for c in [salinity, "depthm", "o2ml_l", "sio3um", "no3um", "po4um", target] if c in df.columns]
return df[cols].rename(columns={target: "water_temperature", salinity: "salinity"}).dropna(subset=["water_temperature"])
@lru_cache(maxsize=1)
def kaggle_szeged_weather() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("budincsevity/szeged-weather"))
target = choose_col(df, ["apparent_temperature_c", "apparent_temperature"], ["apparent", "temperature"])
cols = [c for c in ["temperature_c", "humidity", "wind_speed_km_h", "wind_bearing_degrees", "visibility_km", "pressure_millibars", "summary", "precip_type", target] if c in df.columns]
return df[cols].rename(columns={target: "apparent_temperature"}).dropna(subset=["apparent_temperature"])
@lru_cache(maxsize=1)
def kaggle_weather_ww2() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("smid80/weatherww2", ("summary",)))
target = choose_col(df, ["maxtemp", "max_temp", "max"], ["max"])
cols = [c for c in ["mintemp", "meantemp", "precip", "snowfall", "yr", "mo", "da", target] if c in df.columns]
return df[cols].rename(columns={target: "max_temperature"}).dropna(subset=["max_temperature"])
@lru_cache(maxsize=1)
def kaggle_montreal_bike_lanes() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("pablomonleon/montreal-bike-lanes"))
numeric = df.select_dtypes(include=np.number)
if numeric.shape[1] < 2:
raise ValueError("Montreal bike lanes dataset needs at least two numeric count columns.")
target = numeric.columns[-1]
cols = numeric.columns[: min(8, len(numeric.columns))].tolist()
if target not in cols:
cols.append(target)
return df[cols].rename(columns={target: "rider_count"}).dropna(subset=["rider_count"])
@lru_cache(maxsize=1)
def kaggle_nyc_bike_crossings() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("new-york-city/nyc-east-river-bicycle-crossings"))
numeric = df.select_dtypes(include=np.number)
target = choose_col(numeric, ["total", "total_bicycle_count"], ["total"])
cols = [c for c in numeric.columns[:8] if c != target]
return df[[*cols, target]].rename(columns={target: "total_bike_crossings"}).dropna(subset=["total_bike_crossings"])
@lru_cache(maxsize=1)
def kaggle_uk_road_safety() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("bluehorseshoe/uk-2016-road-safety-data", ("accident",)))
target = choose_col(df, ["number_of_casualties", "casualties"], ["casual"])
preferred = [
"number_of_vehicles",
"day_of_week",
"speed_limit",
"light_conditions",
"weather_conditions",
"road_surface_conditions",
"urban_or_rural_area",
target,
]
cols = [c for c in preferred if c in df.columns]
return df[cols].rename(columns={target: "casualty_count"}).dropna(subset=["casualty_count"])
@lru_cache(maxsize=1)
def kaggle_kcbs_bbq() -> pd.DataFrame:
df = normalize_columns(kaggle_csv("jaysobel/kcbs-bbq"))
numeric = df.select_dtypes(include=np.number)
if "place" in df.columns:
target = "place"
elif "rank" in df.columns:
target = "rank"
else:
target = numeric.columns[0]
out = df.copy()
out["first_place"] = pd.to_numeric(out[target], errors="coerce").eq(1).astype(int)
feature_cols = [c for c in numeric.columns if c != target][:10]
categorical_cols = [c for c in out.columns if c not in numeric.columns and c != target][:4]
return out[[*feature_cols, *categorical_cols, "first_place"]].dropna(subset=["first_place"])
def load_iris_df(limit: int, seed: int) -> pd.DataFrame:
data = load_iris(as_frame=True)
df = data.frame.rename(columns={"target": "species"})
df["species"] = df["species"].map(dict(enumerate(data.target_names)))
return df.sample(min(limit, len(df)), random_state=seed)
def load_wine_df(limit: int, seed: int) -> pd.DataFrame:
data = load_wine(as_frame=True)
df = data.frame.rename(columns={"target": "wine_class"})
return df.sample(min(limit, len(df)), random_state=seed)
def load_breast_cancer_df(limit: int, seed: int) -> pd.DataFrame:
data = load_breast_cancer(as_frame=True)
df = data.frame.rename(columns={"target": "diagnosis"})
df["diagnosis"] = df["diagnosis"].map({0: "malignant", 1: "benign"})
return df.sample(min(limit, len(df)), random_state=seed)
def load_digits_df(limit: int, seed: int) -> pd.DataFrame:
data = load_digits(as_frame=True)
df = data.frame.rename(columns={"target": "digit"})
return df.sample(min(limit, len(df)), random_state=seed)
def load_diabetes_df(limit: int, seed: int) -> pd.DataFrame:
data = load_diabetes(as_frame=True)
df = data.frame.rename(columns={"target": "disease_progression"})
return df.sample(min(limit, len(df)), random_state=seed)
def load_california_housing_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(sklearn_california_housing(), limit, seed)
except Exception:
return load_synthetic_housing_df(limit, seed).rename(columns={"sale_price": "median_house_value"})
def load_ames_housing_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(openml_ames_housing(), limit, seed)
except Exception:
return load_synthetic_housing_df(limit, seed)
def load_synthetic_housing_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 12000)
bedrooms = rng.integers(1, 7, n)
sqft = rng.normal(1750, 650, n).clip(450, 5200)
age = rng.integers(0, 90, n)
zipcode = rng.choice(["94016", "98101", "10011", "60614", "78704", "30309"], n)
price = 120000 + sqft * rng.normal(230, 20, n) + bedrooms * 18000 - age * 1400
price += pd.Series(zipcode).map({"94016": 260000, "98101": 140000, "10011": 210000, "60614": 80000, "78704": 110000, "30309": 70000}).to_numpy()
price += rng.normal(0, 45000, n)
return pd.DataFrame(
{
"sqft": sqft.round(0),
"bedrooms": bedrooms,
"home_age": age,
"zipcode": zipcode,
"has_garage": rng.choice(["yes", "no"], n, p=[0.72, 0.28]),
"sale_price": price.round(0),
}
)
def load_titanic_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 891)
sex = rng.choice(["female", "male"], n, p=[0.38, 0.62])
pclass = rng.choice([1, 2, 3], n, p=[0.24, 0.21, 0.55])
age = rng.normal(30, 14, n).clip(0.5, 78)
fare = np.exp(rng.normal(3.1, 0.85, n)) * (4 - pclass)
embarked = rng.choice(["S", "C", "Q"], n, p=[0.72, 0.19, 0.09])
logit = 1.6 * (sex == "female") + 0.9 * (pclass == 1) + 0.25 * (pclass == 2) - 0.025 * age + 0.01 * fare - 1.1
survived = rng.binomial(1, 1 / (1 + np.exp(-logit)))
return pd.DataFrame(
{
"pclass": pclass,
"sex": sex,
"age": age.round(1),
"sibsp": rng.integers(0, 5, n),
"parch": rng.integers(0, 4, n),
"fare": fare.round(2),
"embarked": embarked,
"survived": survived,
}
)
def load_titanic_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(openml_titanic(), limit, seed)
except Exception:
return load_titanic_proxy_df(limit, seed)
def load_credit_fraud_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 50000)
x, y = make_classification(
n_samples=n,
n_features=18,
n_informative=8,
n_redundant=4,
weights=[0.985, 0.015],
class_sep=1.6,
random_state=seed,
)
df = pd.DataFrame(x, columns=[f"v{i}" for i in range(1, 19)])
df["amount"] = np.exp(rng.normal(3.2, 1.0, n)).round(2)
df["merchant_category"] = rng.choice(["travel", "grocery", "electronics", "fuel", "cash"], n)
df["fraud"] = y
return df
def load_credit_fraud_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_credit_card_fraud(), limit, seed)
except Exception:
return load_credit_fraud_proxy_df(limit, seed)
def load_epirecipes_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 20000)
calories = rng.gamma(4, 120, n)
protein = rng.gamma(2, 12, n)
fat = rng.gamma(2.5, 9, n)
sodium = rng.gamma(2.4, 180, n)
course = rng.choice(["main", "dessert", "side", "salad", "breakfast"], n)
cuisine = rng.choice(["american", "italian", "mexican", "asian", "mediterranean"], n)
rating = 2.8 + 0.12 * (course == "dessert") + 0.18 * (cuisine == "italian") - 0.00035 * sodium + rng.normal(0, 0.65, n)
return pd.DataFrame(
{
"calories": calories.round(0),
"protein": protein.round(1),
"fat": fat.round(1),
"sodium": sodium.round(0),
"course": course,
"cuisine": cuisine,
"make_again": rng.choice(["yes", "no"], n, p=[0.66, 0.34]),
"rating": rating.clip(0, 5).round(2),
}
)
def load_epirecipes_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_epirecipes(), limit, seed)
except Exception:
return load_epirecipes_proxy_df(limit, seed)
def load_epirecipes_cakeweek_df(limit: int, seed: int) -> pd.DataFrame:
try:
df = kaggle_epirecipes().copy()
if "cakeweek" not in df.columns:
raise KeyError("cakeweek")
df["cakeweek"] = pd.to_numeric(df["cakeweek"], errors="coerce").fillna(0).astype(int)
return sample_df(df, limit, seed)
except Exception:
df = load_epirecipes_proxy_df(limit, seed).copy()
df["cakeweek"] = ((df["course"] == "dessert") & (df["rating"] >= df["rating"].median())).astype(int)
return df
def load_candy_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 1200)
chocolate = rng.binomial(1, 0.45, n)
fruity = rng.binomial(1, 0.38, n)
caramel = rng.binomial(1, 0.24, n)
pricepercent = rng.beta(2, 4, n)
sugarpercent = rng.beta(3, 2, n)
winpercent = 35 + 18 * chocolate + 8 * caramel + 9 * sugarpercent - 10 * pricepercent + rng.normal(0, 8, n)
return pd.DataFrame(
{
"chocolate": chocolate,
"fruity": fruity,
"caramel": caramel,
"peanutyalmondy": rng.binomial(1, 0.2, n),
"nougat": rng.binomial(1, 0.14, n),
"crispedricewafer": rng.binomial(1, 0.16, n),
"hard": rng.binomial(1, 0.28, n),
"bar": rng.binomial(1, 0.36, n),
"sugarpercent": sugarpercent.round(3),
"pricepercent": pricepercent.round(3),
"winpercent": winpercent.clip(5, 95).round(2),
}
)
def load_candy_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(fivethirtyeight_candy(), limit, seed)
except Exception:
return load_candy_proxy_df(limit, seed)
def load_candy_chocolate_df(limit: int, seed: int) -> pd.DataFrame:
df = load_candy_df(limit, seed).copy()
if "chocolate" not in df.columns:
raise gr.Error("Candy dataset does not include the chocolate target.")
return df
def load_adult_income_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 30000)
education_num = rng.integers(6, 17, n)
hours = rng.normal(40, 12, n).clip(1, 80)
age = rng.normal(39, 13, n).clip(18, 75)
occupation = rng.choice(["tech", "sales", "ops", "admin", "service", "exec"], n)
logit = -6 + 0.16 * age + 0.36 * education_num + 0.035 * hours + 0.9 * (occupation == "exec") + 0.55 * (occupation == "tech")
income = rng.binomial(1, 1 / (1 + np.exp(-logit)))
return pd.DataFrame(
{
"age": age.round(0),
"education_num": education_num,
"hours_per_week": hours.round(0),
"occupation": occupation,
"marital_status": rng.choice(["single", "married", "divorced"], n),
"income_gt_50k": income,
}
)
def load_adult_income_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(openml_adult_income(), limit, seed)
except Exception:
return load_adult_income_proxy_df(limit, seed)
def load_bike_demand_proxy_df(limit: int, seed: int) -> pd.DataFrame:
rng = np.random.default_rng(seed)
n = min(limit, 15000)
hour = rng.integers(0, 24, n)
temp = rng.normal(21, 9, n).clip(-5, 40)
workingday = rng.binomial(1, 0.69, n)
weather = rng.choice(["clear", "mist", "rain", "storm"], n, p=[0.55, 0.28, 0.14, 0.03])
commute_peak = ((hour >= 7) & (hour <= 9)) | ((hour >= 16) & (hour <= 18))
count = 80 + 115 * commute_peak + 5.5 * temp + 45 * workingday - 75 * (weather == "rain") - 130 * (weather == "storm")
count += rng.normal(0, 45, n)
return pd.DataFrame({"hour": hour, "temp": temp.round(1), "workingday": workingday, "weather": weather, "rental_count": count.clip(0).round(0)})
def load_calcofi_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_calcofi(), limit, seed)
except Exception:
rng = np.random.default_rng(seed)
n = min(limit, 12000)
salinity = rng.normal(33.5, 0.7, n)
depth = rng.gamma(2.0, 40.0, n)
temp = 23 - 0.38 * depth / 10 - 1.7 * (salinity - 33.5) + rng.normal(0, 1.8, n)
return pd.DataFrame({"salinity": salinity, "depthm": depth, "water_temperature": temp})
def load_szeged_weather_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_szeged_weather(), limit, seed)
except Exception:
rng = np.random.default_rng(seed)
n = min(limit, 20000)
humidity = rng.beta(4, 2, n)
temp = rng.normal(12, 10, n)
apparent = temp - 5 * humidity + rng.normal(0, 2.5, n)
return pd.DataFrame({"temperature_c": temp, "humidity": humidity, "wind_speed_km_h": rng.gamma(2, 4, n), "apparent_temperature": apparent})
def load_weather_ww2_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_weather_ww2(), limit, seed)
except Exception:
rng = np.random.default_rng(seed)
n = min(limit, 15000)
mintemp = rng.normal(15, 9, n)
return pd.DataFrame({"mintemp": mintemp, "precip": rng.gamma(1.5, 2, n), "max_temperature": mintemp + rng.normal(8, 3, n)})
def load_montreal_bike_lanes_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_montreal_bike_lanes(), limit, seed)
except Exception:
return load_bike_demand_proxy_df(limit, seed).rename(columns={"rental_count": "rider_count"})
def load_nyc_bike_crossings_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_nyc_bike_crossings(), limit, seed)
except Exception:
df = load_bike_demand_proxy_df(limit, seed)
df["brooklyn_bridge"] = (df["rental_count"] * 0.32).round()
df["manhattan_bridge"] = (df["rental_count"] * 0.27).round()
df["total_bike_crossings"] = df["rental_count"]
return df.drop(columns=["rental_count"])
def load_uk_road_safety_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_uk_road_safety(), limit, seed)
except Exception:
rng = np.random.default_rng(seed)
n = min(limit, 30000)
vehicles = rng.integers(1, 5, n)
speed = rng.choice([20, 30, 40, 50, 60, 70], n)
casualties = rng.poisson(0.4 + vehicles * 0.28 + (speed > 50) * 0.25, n)
return pd.DataFrame({"number_of_vehicles": vehicles, "speed_limit": speed, "light_conditions": rng.choice(["daylight", "dark"], n), "casualty_count": casualties})
def load_kcbs_bbq_df(limit: int, seed: int) -> pd.DataFrame:
try:
return sample_df(kaggle_kcbs_bbq(), limit, seed)
except Exception:
rng = np.random.default_rng(seed)
n = min(limit, 8000)
score = rng.normal(165, 12, n)
first = rng.binomial(1, 1 / (1 + np.exp(-(score - 184) / 5)))
return pd.DataFrame({"score": score, "contest_size": rng.integers(10, 80, n), "category": rng.choice(["chicken", "ribs", "pork", "brisket"], n), "first_place": first})
DATASETS: list[DatasetSpec] = [
DatasetSpec("Titanic Survival", "classification", "survived", "OpenML data_id=40945", 1309, "Mixed categorical/numeric binary classification.", load_titanic_df),
DatasetSpec("Ames Housing Prices", "regression", "sale_price", "OpenML data_id=42165", 1460, "Ames real-estate regression with neighborhood and quality features.", load_ames_housing_df),
DatasetSpec("California Housing", "regression", "median_house_value", "sklearn California housing", 20640, "Block-level California housing value regression.", load_california_housing_df),
DatasetSpec("Credit Card Fraud", "classification", "fraud", "KaggleHub mlg-ulb/creditcardfraud", 284807, "Large imbalanced binary fraud task.", load_credit_fraud_df),
DatasetSpec("Epicurious Recipes", "regression", "rating", "KaggleHub hugodarwood/epirecipes", 20000, "Recipe nutrition and tags to rating.", load_epirecipes_df),
DatasetSpec("Halloween Candy", "regression", "winpercent", "FiveThirtyEight GitHub CSV", 85, "Candy attributes to popularity score.", load_candy_df),
DatasetSpec("Candy Chocolate", "classification", "chocolate", "FiveThirtyEight GitHub CSV", 85, "Predict whether a candy is chocolate from other candy attributes.", load_candy_chocolate_df),
DatasetSpec("Epicurious Cakeweek", "classification", "cakeweek", "KaggleHub hugodarwood/epirecipes", 20000, "Predict cakeweek recipes from nutrition and recipe tags.", load_epirecipes_cakeweek_df),
DatasetSpec("CalCOFI Ocean Temperature", "regression", "water_temperature", "KaggleHub sohier/calcofi", 864863, "Predict ocean water temperature from salinity and chemistry readings.", load_calcofi_df),
DatasetSpec("Szeged Apparent Temperature", "regression", "apparent_temperature", "KaggleHub budincsevity/szeged-weather", 96453, "Predict apparent temperature from humidity, wind, pressure, and weather.", load_szeged_weather_df),
DatasetSpec("WW2 Max Temperature", "regression", "max_temperature", "KaggleHub smid80/weatherww2", 119040, "Predict daily maximum temperature from minimum temperature and weather fields.", load_weather_ww2_df),
DatasetSpec("Montreal Bike Lane Counts", "regression", "rider_count", "KaggleHub pablomonleon/montreal-bike-lanes", 319, "Predict rider counts on one Montreal bike path from other paths.", load_montreal_bike_lanes_df),
DatasetSpec("NYC Bike Crossings", "regression", "total_bike_crossings", "KaggleHub new-york-city/nyc-east-river-bicycle-crossings", 210, "Predict total East River bicycle crossings from bridge counts.", load_nyc_bike_crossings_df),
DatasetSpec("UK Road Casualties", "regression", "casualty_count", "KaggleHub bluehorseshoe/uk-2016-road-safety-data", 136621, "Predict accident casualty count from road safety fields.", load_uk_road_safety_df),
DatasetSpec("KCBS BBQ First Place", "classification", "first_place", "KaggleHub jaysobel/kcbs-bbq", 1559, "Predict whether a BBQ competition team wins first place.", load_kcbs_bbq_df),
DatasetSpec("Adult Income", "classification", "income_gt_50k", "OpenML data_id=1590", 48842, "Demographic and work attributes to income bucket.", load_adult_income_df),
DatasetSpec("Bike Demand", "regression", "rental_count", "Kaggle-style proxy", 15000, "Weather and time features to rental demand.", load_bike_demand_proxy_df),
DatasetSpec("Iris", "classification", "species", "sklearn", 150, "Classic multi-class flower classification.", load_iris_df),
DatasetSpec("Wine", "classification", "wine_class", "sklearn", 178, "Chemical analysis to cultivar class.", load_wine_df),
DatasetSpec("Breast Cancer", "classification", "diagnosis", "sklearn", 569, "Diagnostic measurements to benign/malignant label.", load_breast_cancer_df),
DatasetSpec("Digits", "classification", "digit", "sklearn", 1797, "Pixel features to handwritten digit class.", load_digits_df),
DatasetSpec("Diabetes", "regression", "disease_progression", "sklearn", 442, "Clinical variables to disease progression.", load_diabetes_df),
]
def dataset_names() -> list[str]:
return [d.name for d in DATASETS]
def get_spec(name: str) -> DatasetSpec:
return next(d for d in DATASETS if d.name == name)
def get_dataset(name: str, sample_size: int, seed: int) -> pd.DataFrame:
spec = get_spec(name)
return spec.loader(sample_size, seed).reset_index(drop=True)
def split_xy(df: pd.DataFrame, target: str) -> tuple[pd.DataFrame, pd.Series]:
cleaned = df.dropna(axis=1, how="all").copy()
if target not in cleaned.columns:
raise gr.Error(f"Target column '{target}' was not found.")
y = cleaned[target]
x = cleaned.drop(columns=[target])
if x.empty:
raise gr.Error("Dataset must include at least one feature column.")
return x, y
def coerce_numeric_target(y: pd.Series) -> pd.Series:
if y.dtype.kind in "ifu":
return pd.to_numeric(y, errors="coerce")
cleaned = y.astype("string").str.strip().str.replace(",", "", regex=False)
return pd.to_numeric(cleaned, errors="coerce")
def prepare_xy(df: pd.DataFrame, target: str, task: str | None) -> tuple[pd.DataFrame, pd.Series, str]:
x, raw_y = split_xy(df, target)
inferred_task = task or infer_task(raw_y)
y = raw_y.copy()
if inferred_task == "regression":
y = coerce_numeric_target(y)
valid_target = y.notna() & np.isfinite(y.to_numpy(dtype=float))
if valid_target.sum() < 2:
raise gr.Error("Regression target must contain at least two numeric values.")
dropped = len(y) - int(valid_target.sum())
x = x.loc[valid_target].reset_index(drop=True)
y = y.loc[valid_target].reset_index(drop=True)
if dropped and x.empty:
raise gr.Error("No usable rows remain after dropping non-numeric regression targets.")
else:
valid_target = raw_y.notna()
if valid_target.sum() < 2:
raise gr.Error("Classification target must contain at least two non-empty values.")
x = x.loc[valid_target].reset_index(drop=True)
y = raw_y.loc[valid_target].reset_index(drop=True)
if x.empty:
raise gr.Error("Dataset must include at least one feature column.")
return x, y, inferred_task
def infer_task(y: pd.Series) -> str:
if y.dtype.kind in "ifu" and y.nunique(dropna=True) > 20:
return "regression"
numeric_y = coerce_numeric_target(y)
non_missing = y.notna().sum()
numeric_non_missing = numeric_y.notna().sum()
if non_missing and numeric_non_missing / non_missing >= 0.9 and numeric_y.nunique(dropna=True) > 20:
return "regression"
return "classification"
def make_preprocessor(x: pd.DataFrame, scale_numeric: bool = False) -> ColumnTransformer:
numeric_cols = x.select_dtypes(include=np.number).columns.tolist()
categorical_cols = [c for c in x.columns if c not in numeric_cols]
numeric_steps: list[tuple[str, object]] = [("impute", SimpleImputer(strategy="median"))]
if scale_numeric:
numeric_steps.append(("scale", StandardScaler()))
transformers: list[tuple[str, object, list[str]]] = []
if numeric_cols:
transformers.append(("num", Pipeline(numeric_steps), numeric_cols))
if categorical_cols:
transformers.append(
(
"cat",
Pipeline(
[
("impute", SimpleImputer(strategy="most_frequent")),
("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=32)),
]
),
categorical_cols,
)
)
return ColumnTransformer(transformers=transformers, remainder="drop", verbose_feature_names_out=False)
def available_baselines(task: str) -> dict[str, object]:
if task == "classification":
models: dict[str, object] = {
"Logistic": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", LogisticRegression(max_iter=800))]),
"RandomForest": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", RandomForestClassifier(n_estimators=80, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]),
"ExtraTrees": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", ExtraTreesClassifier(n_estimators=120, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]),
"GradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", GradientBoostingClassifier(n_estimators=100, learning_rate=0.06, random_state=RANDOM_STATE))]),
"HistGradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", HistGradientBoostingClassifier(max_iter=120, random_state=RANDOM_STATE))]),
"AdaBoost": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", AdaBoostClassifier(n_estimators=80, learning_rate=0.08, random_state=RANDOM_STATE))]),
"NaiveBayes": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", GaussianNB())]),
"KNN": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", KNeighborsClassifier(n_neighbors=7))]),
"SVC": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", SVC(C=1.0, probability=True, random_state=RANDOM_STATE))]),
"Dummy": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", DummyClassifier(strategy="most_frequent"))]),
}
else:
models = {
"LinearRegression": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", LinearRegression())]),
"Ridge": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", Ridge(alpha=1.0))]),
"BayesianRidge": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", BayesianRidge())]),
"RandomForest": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", RandomForestRegressor(n_estimators=80, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]),
"ExtraTrees": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", ExtraTreesRegressor(n_estimators=120, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]),
"GradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", GradientBoostingRegressor(n_estimators=100, learning_rate=0.06, random_state=RANDOM_STATE))]),
"HistGradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", HistGradientBoostingRegressor(max_iter=120, random_state=RANDOM_STATE))]),
"AdaBoost": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", AdaBoostRegressor(n_estimators=80, learning_rate=0.08, random_state=RANDOM_STATE))]),
"KNN": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", KNeighborsRegressor(n_neighbors=7))]),
"SVR": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", SVR(C=1.0))]),
"Dummy": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", DummyRegressor(strategy="median"))]),
}
if importlib.util.find_spec("xgboost"):
from xgboost import XGBClassifier, XGBRegressor
if task == "classification":
models["XGBoost"] = Pipeline(
[
("prep", make_preprocessor(pd.DataFrame())),
("model", XGBClassifier(n_estimators=80, max_depth=4, learning_rate=0.08, eval_metric="logloss", random_state=RANDOM_STATE)),
]
)
else:
models["XGBoost"] = Pipeline(
[
("prep", make_preprocessor(pd.DataFrame())),
("model", XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.08, random_state=RANDOM_STATE)),
]
)
if importlib.util.find_spec("lightgbm"):
try:
from lightgbm import LGBMClassifier, LGBMRegressor
if task == "classification":
models["LightGBM"] = Pipeline(
[
("prep", make_preprocessor(pd.DataFrame())),
("model", LGBMClassifier(n_estimators=120, learning_rate=0.06, random_state=RANDOM_STATE, verbose=-1)),
]
)
else:
models["LightGBM"] = Pipeline(
[
("prep", make_preprocessor(pd.DataFrame())),
("model", LGBMRegressor(n_estimators=120, learning_rate=0.06, random_state=RANDOM_STATE, verbose=-1)),
]
)
except Exception:
pass
return models
def rebuild_pipeline(model: Pipeline, x_train: pd.DataFrame) -> Pipeline:
pipe = clone(model)
wants_scale = pipe.steps[0][1].transformers and "scale" in str(pipe.steps[0][1].transformers[0][1])
pipe.steps[0] = ("prep", make_preprocessor(x_train, scale_numeric=wants_scale))
return pipe
@lru_cache(maxsize=2)
def load_tabfm_model(task: str):
from tabfm import tabfm_v1_0_0_pytorch
model_type = "classification" if task == "classification" else "regression"
return tabfm_v1_0_0_pytorch.load(model_type=model_type)
def resolve_tabfm_params(
preset: str,
n_estimators: int,
max_num_rows: int,
max_num_features: int,
batch_size: int,
enable_nnls: bool,
n_feature_crosses: str,
n_svd_features: str,
max_eval_rows: int,
) -> dict[str, object]:
if preset in TABFM_PRESETS:
return dict(TABFM_PRESETS[preset])
resolved_rows = None if max_num_rows <= 0 else int(max_num_rows)
resolved = {
"n_estimators": int(n_estimators),
"max_num_rows": resolved_rows,
"max_num_features": int(max_num_features),
"batch_size": int(batch_size),
"enable_nnls": bool(enable_nnls),
"n_feature_crosses": 0 if n_feature_crosses == "0" else n_feature_crosses,
"n_svd_features": 0 if n_svd_features == "0" else n_svd_features,
"max_eval_rows": None if max_eval_rows <= 0 else int(max_eval_rows),
}
if resolved["enable_nnls"] and resolved["max_num_rows"] is not None:
resolved["max_num_rows"] = None
return resolved
def run_tabfm(task: str, x_train: pd.DataFrame, x_test: pd.DataFrame, y_train: pd.Series, tabfm_params: dict[str, object]):
from tabfm import TabFMClassifier, TabFMRegressor
foundation_model = load_tabfm_model(task)
estimator = (
TabFMClassifier(model=foundation_model, **tabfm_params)
if task == "classification"
else TabFMRegressor(model=foundation_model, **tabfm_params)
)
estimator.fit(x_train, y_train.to_numpy())
pred = estimator.predict(x_test)
proba = estimator.predict_proba(x_test) if task == "classification" and hasattr(estimator, "predict_proba") else None
return pred, proba
def clean_tabfm_features(x_train: pd.DataFrame, x_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
cleaned_train = pd.DataFrame(index=x_train.index)
cleaned_test = pd.DataFrame(index=x_test.index)
for column in x_train.columns:
train_series = x_train[column].replace([np.inf, -np.inf], np.nan)
test_series = x_test[column].replace([np.inf, -np.inf], np.nan)
if train_series.isna().all() and test_series.isna().all():
continue
if pd.api.types.is_bool_dtype(train_series):
cleaned_train[column] = train_series.astype("float64").fillna(0.0)
cleaned_test[column] = test_series.astype("float64").fillna(0.0)
elif pd.api.types.is_numeric_dtype(train_series):
train_numeric = pd.to_numeric(train_series, errors="coerce")
test_numeric = pd.to_numeric(test_series, errors="coerce")
fill_value = train_numeric.median()
if pd.isna(fill_value):
fill_value = 0.0
cleaned_train[column] = train_numeric.fillna(fill_value)
cleaned_test[column] = test_numeric.fillna(fill_value)
elif pd.api.types.is_datetime64_any_dtype(train_series):
cleaned_train[column] = train_series.dt.strftime("%Y-%m-%d %H:%M:%S").fillna("__missing__")
cleaned_test[column] = test_series.dt.strftime("%Y-%m-%d %H:%M:%S").fillna("__missing__")
else:
train_text = train_series.astype("string").str.strip()
test_text = test_series.astype("string").str.strip()
train_numeric = coerce_numeric_target(train_series)
test_numeric = coerce_numeric_target(test_series)
non_missing = train_text.notna() & (train_text != "")
numeric_ratio = train_numeric.notna().sum() / non_missing.sum() if non_missing.sum() else 0
if numeric_ratio >= 0.9:
fill_value = train_numeric.median()
if pd.isna(fill_value):
fill_value = 0.0
cleaned_train[column] = train_numeric.fillna(fill_value)
cleaned_test[column] = test_numeric.fillna(fill_value)
else:
cleaned_train[column] = train_text.fillna("__missing__").replace("", "__missing__")
cleaned_test[column] = test_text.fillna("__missing__").replace("", "__missing__")
if cleaned_train.empty:
raise gr.Error("TabFM needs at least one non-empty feature column.")
return cleaned_train.reset_index(drop=True), cleaned_test.reset_index(drop=True)
def tabfm_failure_note(exc: Exception) -> str:
detail = f"{type(exc).__name__}: {exc}"
print("TabFM failed:\n" + traceback.format_exc())
env_errors = (ImportError, ModuleNotFoundError, OSError)
if isinstance(exc, env_errors):
return f"TabFM did not run because the runtime could not load it: `{detail}`. On Spaces, keep Python 3.11 and allow the GitHub dependency plus model download for `{TABFM_MODEL_ID}`."
return f"TabFM failed while processing this dataset: `{detail}`."
def score_predictions(task: str, y_true: pd.Series, pred, proba=None) -> dict[str, float]:
if task == "classification":
metrics = {
"accuracy": accuracy_score(y_true, pred),
"f1_weighted": f1_score(y_true, pred, average="weighted", zero_division=0),
}
if proba is not None and len(np.unique(y_true)) == 2:
try:
metrics["roc_auc"] = roc_auc_score(y_true, proba[:, 1])
except Exception:
metrics["roc_auc"] = np.nan
else:
metrics["roc_auc"] = np.nan
metrics["rank_score"] = np.nanmean([metrics["accuracy"], metrics["f1_weighted"], metrics["roc_auc"]])
return metrics
rmse = math.sqrt(mean_squared_error(y_true, pred))
mae = mean_absolute_error(y_true, pred)
r2 = r2_score(y_true, pred)
return {"rmse": rmse, "mae": mae, "r2": r2, "rank_score": -rmse}
def benchmark_frame(
df: pd.DataFrame,
target: str,
task: str | None,
sample_size: int,
test_size: float,
seed: int,
selected_models: list[str],
include_tabfm: bool,
tabfm_params: dict[str, object] | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, str]:
df = df.sample(min(sample_size, len(df)), random_state=seed).reset_index(drop=True)
x, y, task = prepare_xy(df, target, task)
if task == "classification" and y.nunique(dropna=True) < 2:
raise gr.Error("Classification needs at least two target classes.")
stratify = y if task == "classification" and y.value_counts().min() >= 2 else None
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, stratify=stratify)
rows: list[dict[str, object]] = []
notes: list[str] = []
selected_models = selected_models or []
models = available_baselines(task)
for name, model in models.items():
if name not in selected_models:
continue
start = time.perf_counter()
try:
pipe = rebuild_pipeline(model, x_train)
pipe.fit(x_train, y_train)
pred = pipe.predict(x_test)
proba = pipe.predict_proba(x_test) if task == "classification" and hasattr(pipe, "predict_proba") else None
metrics = score_predictions(task, y_test, pred, proba)
rows.append({"model": name, "status": "ok", "seconds": time.perf_counter() - start, **metrics})
except Exception as exc:
rows.append({"model": name, "status": f"failed: {exc}", "seconds": time.perf_counter() - start})
for name in selected_models:
if name not in models:
rows.append({"model": name, "status": f"not compatible with {task} or unavailable", "seconds": 0.0, "rank_score": np.nan})
if include_tabfm:
start = time.perf_counter()
try:
tabfm_params = tabfm_params or TABFM_PRESETS["Fast"]
tabfm_display_params = dict(tabfm_params)
tabfm_model_params = dict(tabfm_params)
tabfm_eval_rows = tabfm_model_params.pop("max_eval_rows", None)
if tabfm_eval_rows is not None and len(x_test) > int(tabfm_eval_rows):
eval_idx = x_test.sample(int(tabfm_eval_rows), random_state=seed).index
x_eval = x_test.loc[eval_idx]
y_eval = y_test.loc[eval_idx]
status = f"ok ({len(x_eval):,}/{len(x_test):,} test rows)"
else:
x_eval = x_test
y_eval = y_test
status = "ok"
x_tab_train, x_tab_eval = clean_tabfm_features(x_train, x_eval)
pred, proba = run_tabfm(task, x_tab_train, x_tab_eval, y_train, tabfm_model_params)
metrics = score_predictions(task, y_eval, pred, proba)
rows.append({"model": "TabFM", "status": status, "seconds": time.perf_counter() - start, **metrics})
except Exception as exc:
rows.append({"model": "TabFM", "status": f"unavailable: {exc}", "seconds": time.perf_counter() - start})
notes.append(tabfm_failure_note(exc))
else:
rows.append({"model": "TabFM", "status": "skipped - enable Run TabFM live", "seconds": 0.0, "rank_score": np.nan})
notes.append("TabFM is listed as skipped because **Run TabFM live** is off. Enable it to benchmark TabFM; the first run may download large model weights.")
results = pd.DataFrame(rows)
metric_cols = [c for c in ["accuracy", "f1_weighted", "roc_auc", "rmse", "mae", "r2", "rank_score", "seconds"] if c in results.columns]
if not results.empty and "rank_score" in results.columns:
results = results.sort_values("rank_score", ascending=False, na_position="last").reset_index(drop=True)
results.insert(0, "rank", np.arange(1, len(results) + 1))
preview = pd.concat([x_test.reset_index(drop=True).head(12), y_test.reset_index(drop=True).head(12).rename(target)], axis=1)
summary = (
f"**Task:** {task} \n"
f"**Rows used:** {len(df):,} | **Train:** {len(x_train):,} | **Test:** {len(x_test):,} | **Features:** {x.shape[1]:,} \n"
f"**Primary rank:** {'higher accuracy/F1/AUC' if task == 'classification' else 'lower RMSE'}"
)
if notes:
summary += "\n\n" + "\n".join(f"- {note}" for note in notes)
if include_tabfm:
summary += f"\n\n**TabFM params:** `{tabfm_display_params}`"
return results[["rank", "model", "status", *metric_cols]], preview, summary
def metric_chart(results: pd.DataFrame, selected_metrics: list[str] | None = None, chart_style: str = "Line") -> go.Figure:
if results is None or results.empty:
return go.Figure()
selected_metrics = selected_metrics or METRIC_CHOICES
metric_cols = [c for c in selected_metrics if c in results.columns and results[c].notna().any()]
if not metric_cols:
metric_cols = [c for c in METRIC_CHOICES if c in results.columns and results[c].notna().any()]
if not metric_cols:
return go.Figure()
clean = results.sort_values("rank") if "rank" in results.columns else results.copy()
clean = clean.dropna(subset=metric_cols, how="all")
if clean.empty:
return go.Figure()
x_labels = clean["model"].astype(str).tolist()
if chart_style == "Radar":
normalized = clean[["model", *metric_cols]].copy()
for metric in metric_cols:
values = pd.to_numeric(normalized[metric], errors="coerce")
lo, hi = values.min(), values.max()
if pd.isna(lo) or pd.isna(hi) or hi == lo:
normalized[metric] = 0.5
elif metric in {"rmse", "mae", "seconds"}:
normalized[metric] = 1 - ((values - lo) / (hi - lo))
else:
normalized[metric] = (values - lo) / (hi - lo)
fig = go.Figure()
theta = metric_cols + [metric_cols[0]]
for idx, row in normalized.iterrows():
values = [row[m] for m in metric_cols] + [row[metric_cols[0]]]
fig.add_trace(
go.Scatterpolar(
r=values,
theta=theta,
fill="toself",
name=str(row["model"]),
line=dict(color=GOOGLE_COLORS[idx % len(GOOGLE_COLORS)], width=2),
opacity=0.78,
)
)
fig.update_layout(
template="plotly_white",
height=420,
margin=dict(l=35, r=35, t=35, b=25),
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
legend_title_text="Model",
title="Normalized metric shape (higher is better)",
)
return fig
fig = make_subplots(
rows=len(metric_cols),
cols=1,
shared_xaxes=True,
vertical_spacing=0.08,
subplot_titles=[metric.replace("_", " ").upper() for metric in metric_cols],
)
for idx, metric in enumerate(metric_cols, start=1):
fig.add_trace(
go.Scatter(
x=x_labels,
y=clean[metric],
mode="lines+markers",
name=metric,
line=dict(color=GOOGLE_COLORS[(idx - 1) % len(GOOGLE_COLORS)], width=3, shape="spline"),
marker=dict(size=9, line=dict(color="white", width=1.5)),
hovertemplate=f"%{{x}}
{metric}: %{{y:.4f}}
A clean arena for benchmarking google/tabfm-1.0.0-pytorch against practical tabular baselines across small, classic, imbalanced, and user-uploaded datasets.