from __future__ import annotations import importlib import math import traceback import time from dataclasses import dataclass from functools import lru_cache from pathlib import Path from typing import Callable import gradio as gr import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots from sklearn.base import clone from sklearn.compose import ColumnTransformer from sklearn.datasets import fetch_california_housing, fetch_openml, load_breast_cancer, load_diabetes, load_digits, load_iris, load_wine, make_classification from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.ensemble import ( AdaBoostClassifier, AdaBoostRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor, ) from sklearn.impute import SimpleImputer from sklearn.linear_model import BayesianRidge, LinearRegression, LogisticRegression, Ridge from sklearn.metrics import ( accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, ) from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.svm import SVC, SVR APP_TITLE = "tabBench" TABFM_MODEL_ID = "google/tabfm-1.0.0-pytorch" RANDOM_STATE = 42 CANDY_DATA_URL = "https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv" GOOGLE_COLORS = ["#4285F4", "#DB4437", "#F4B400", "#0F9D58", "#A142F4", "#00ACC1"] METRIC_CHOICES = ["accuracy", "f1_weighted", "roc_auc", "rmse", "mae", "r2", "seconds"] TABFM_PRESETS = { "Fast": {"n_estimators": 1, "max_num_rows": 256, "max_num_features": 64, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 256}, "Balanced": {"n_estimators": 4, "max_num_rows": 512, "max_num_features": 128, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 512}, "Default": {"n_estimators": 32, "max_num_rows": None, "max_num_features": 500, "batch_size": 1, "enable_nnls": False, "n_feature_crosses": 0, "n_svd_features": 0, "max_eval_rows": 1000}, "Ensemble": {"n_estimators": 32, "max_num_rows": None, "max_num_features": 500, "batch_size": 1, "enable_nnls": True, "n_feature_crosses": "sqrt", "n_svd_features": "sqrt", "max_eval_rows": 1000}, } @dataclass(frozen=True) class DatasetSpec: name: str task: str target: str source: str rows: int description: str loader: Callable[[int, int], pd.DataFrame] def _add_categorical_noise(df: pd.DataFrame, rng: np.random.Generator, prefix: str) -> pd.DataFrame: df = df.copy() df[f"{prefix}_segment"] = rng.choice(["A", "B", "C", "D"], len(df), p=[0.35, 0.25, 0.25, 0.15]) df[f"{prefix}_region"] = rng.choice(["north", "south", "east", "west"], len(df)) return df def sample_df(df: pd.DataFrame, limit: int, seed: int) -> pd.DataFrame: return df.sample(min(limit, len(df)), random_state=seed).reset_index(drop=True) def find_first_data_file(root: str | Path, suffixes: tuple[str, ...]) -> Path: root = Path(root) for suffix in suffixes: matches = sorted(root.rglob(f"*{suffix}")) if matches: return matches[0] raise FileNotFoundError(f"No data file with suffixes {suffixes} found in {root}") def normalize_columns(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df.columns = ( pd.Index(df.columns) .astype(str) .str.strip() .str.lower() .str.replace(r"[^0-9a-z]+", "_", regex=True) .str.strip("_") ) return df def choose_col(df: pd.DataFrame, candidates: list[str], contains: list[str] | None = None) -> str: normalized = {c.lower(): c for c in df.columns} for candidate in candidates: key = candidate.lower() if key in normalized: return normalized[key] if contains: for col in df.columns: lower = str(col).lower() if all(part in lower for part in contains): return col raise KeyError(f"None of {candidates} found in columns.") def kaggle_csv(dataset_id: str, preferred_names: tuple[str, ...] = ()) -> pd.DataFrame: import kagglehub path = Path(kagglehub.dataset_download(dataset_id)) csvs = sorted(path.rglob("*.csv")) if preferred_names: for preferred in preferred_names: for csv in csvs: if preferred.lower() in csv.name.lower(): return pd.read_csv(csv) if not csvs: raise FileNotFoundError(f"No CSV files found in Kaggle dataset {dataset_id}.") return pd.read_csv(csvs[0]) def select_numeric_features(df: pd.DataFrame, target: str, max_features: int = 12) -> pd.DataFrame: numeric = df.select_dtypes(include=np.number).columns.tolist() cols = [c for c in numeric if c != target][:max_features] return df[[*cols, target]].dropna(subset=[target]) @lru_cache(maxsize=1) def openml_titanic() -> pd.DataFrame: data = fetch_openml(data_id=40945, as_frame=True, parser="auto") df = data.frame.copy() keep = [c for c in ["pclass", "sex", "age", "sibsp", "parch", "fare", "embarked", "survived"] if c in df.columns] return df[keep].dropna(subset=["survived"]) @lru_cache(maxsize=1) def openml_ames_housing() -> pd.DataFrame: data = fetch_openml(data_id=42165, as_frame=True, parser="auto") df = data.frame.copy() target = "SalePrice" if "SalePrice" in df.columns else data.target_names[0] useful = [ "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "FullBath", "YearBuilt", "Neighborhood", "HouseStyle", target, ] cols = [c for c in useful if c in df.columns] return df[cols].rename(columns={target: "sale_price"}).dropna(subset=["sale_price"]) @lru_cache(maxsize=1) def openml_adult_income() -> pd.DataFrame: data = fetch_openml(data_id=1590, as_frame=True, parser="auto") df = data.frame.copy() if "class" in df.columns: df = df.rename(columns={"class": "income_gt_50k"}) elif "income" in df.columns: df = df.rename(columns={"income": "income_gt_50k"}) return df.dropna(subset=["income_gt_50k"]) @lru_cache(maxsize=1) def sklearn_california_housing() -> pd.DataFrame: data = fetch_california_housing(as_frame=True) df = data.frame.rename(columns={"MedHouseVal": "median_house_value"}) return df @lru_cache(maxsize=1) def fivethirtyeight_candy() -> pd.DataFrame: df = pd.read_csv(CANDY_DATA_URL) return df.drop(columns=[c for c in ["competitorname"] if c in df.columns]).dropna(subset=["winpercent"]) @lru_cache(maxsize=1) def kaggle_credit_card_fraud() -> pd.DataFrame: import kagglehub path = kagglehub.dataset_download("mlg-ulb/creditcardfraud") csv_path = find_first_data_file(path, (".csv",)) df = pd.read_csv(csv_path) if "Class" in df.columns: df = df.rename(columns={"Class": "fraud"}) return df.dropna(subset=["fraud"]) @lru_cache(maxsize=1) def kaggle_epirecipes() -> pd.DataFrame: import kagglehub path = kagglehub.dataset_download("hugodarwood/epirecipes") try: json_path = find_first_data_file(path, (".json",)) df = pd.read_json(json_path) except FileNotFoundError: csv_path = find_first_data_file(path, (".csv",)) df = pd.read_csv(csv_path) if "rating" not in df.columns: raise ValueError("Epicurious dataset does not include a rating column.") preferred = [ "calories", "protein", "fat", "sodium", "dessert", "dinner", "breakfast", "healthy", "vegetarian", "vegan", "cakeweek", "rating", ] cols = [c for c in preferred if c in df.columns] return df[cols].dropna(subset=["rating"]) @lru_cache(maxsize=1) def kaggle_calcofi() -> pd.DataFrame: df = normalize_columns(kaggle_csv("sohier/calcofi", ("bottle",))) target = choose_col(df, ["t_deg_c", "temperature"], ["t", "deg"]) salinity = choose_col(df, ["salnty", "salinity"], ["sal"]) cols = [c for c in [salinity, "depthm", "o2ml_l", "sio3um", "no3um", "po4um", target] if c in df.columns] return df[cols].rename(columns={target: "water_temperature", salinity: "salinity"}).dropna(subset=["water_temperature"]) @lru_cache(maxsize=1) def kaggle_szeged_weather() -> pd.DataFrame: df = normalize_columns(kaggle_csv("budincsevity/szeged-weather")) target = choose_col(df, ["apparent_temperature_c", "apparent_temperature"], ["apparent", "temperature"]) cols = [c for c in ["temperature_c", "humidity", "wind_speed_km_h", "wind_bearing_degrees", "visibility_km", "pressure_millibars", "summary", "precip_type", target] if c in df.columns] return df[cols].rename(columns={target: "apparent_temperature"}).dropna(subset=["apparent_temperature"]) @lru_cache(maxsize=1) def kaggle_weather_ww2() -> pd.DataFrame: df = normalize_columns(kaggle_csv("smid80/weatherww2", ("summary",))) target = choose_col(df, ["maxtemp", "max_temp", "max"], ["max"]) cols = [c for c in ["mintemp", "meantemp", "precip", "snowfall", "yr", "mo", "da", target] if c in df.columns] return df[cols].rename(columns={target: "max_temperature"}).dropna(subset=["max_temperature"]) @lru_cache(maxsize=1) def kaggle_montreal_bike_lanes() -> pd.DataFrame: df = normalize_columns(kaggle_csv("pablomonleon/montreal-bike-lanes")) numeric = df.select_dtypes(include=np.number) if numeric.shape[1] < 2: raise ValueError("Montreal bike lanes dataset needs at least two numeric count columns.") target = numeric.columns[-1] cols = numeric.columns[: min(8, len(numeric.columns))].tolist() if target not in cols: cols.append(target) return df[cols].rename(columns={target: "rider_count"}).dropna(subset=["rider_count"]) @lru_cache(maxsize=1) def kaggle_nyc_bike_crossings() -> pd.DataFrame: df = normalize_columns(kaggle_csv("new-york-city/nyc-east-river-bicycle-crossings")) numeric = df.select_dtypes(include=np.number) target = choose_col(numeric, ["total", "total_bicycle_count"], ["total"]) cols = [c for c in numeric.columns[:8] if c != target] return df[[*cols, target]].rename(columns={target: "total_bike_crossings"}).dropna(subset=["total_bike_crossings"]) @lru_cache(maxsize=1) def kaggle_uk_road_safety() -> pd.DataFrame: df = normalize_columns(kaggle_csv("bluehorseshoe/uk-2016-road-safety-data", ("accident",))) target = choose_col(df, ["number_of_casualties", "casualties"], ["casual"]) preferred = [ "number_of_vehicles", "day_of_week", "speed_limit", "light_conditions", "weather_conditions", "road_surface_conditions", "urban_or_rural_area", target, ] cols = [c for c in preferred if c in df.columns] return df[cols].rename(columns={target: "casualty_count"}).dropna(subset=["casualty_count"]) @lru_cache(maxsize=1) def kaggle_kcbs_bbq() -> pd.DataFrame: df = normalize_columns(kaggle_csv("jaysobel/kcbs-bbq")) numeric = df.select_dtypes(include=np.number) if "place" in df.columns: target = "place" elif "rank" in df.columns: target = "rank" else: target = numeric.columns[0] out = df.copy() out["first_place"] = pd.to_numeric(out[target], errors="coerce").eq(1).astype(int) feature_cols = [c for c in numeric.columns if c != target][:10] categorical_cols = [c for c in out.columns if c not in numeric.columns and c != target][:4] return out[[*feature_cols, *categorical_cols, "first_place"]].dropna(subset=["first_place"]) def load_iris_df(limit: int, seed: int) -> pd.DataFrame: data = load_iris(as_frame=True) df = data.frame.rename(columns={"target": "species"}) df["species"] = df["species"].map(dict(enumerate(data.target_names))) return df.sample(min(limit, len(df)), random_state=seed) def load_wine_df(limit: int, seed: int) -> pd.DataFrame: data = load_wine(as_frame=True) df = data.frame.rename(columns={"target": "wine_class"}) return df.sample(min(limit, len(df)), random_state=seed) def load_breast_cancer_df(limit: int, seed: int) -> pd.DataFrame: data = load_breast_cancer(as_frame=True) df = data.frame.rename(columns={"target": "diagnosis"}) df["diagnosis"] = df["diagnosis"].map({0: "malignant", 1: "benign"}) return df.sample(min(limit, len(df)), random_state=seed) def load_digits_df(limit: int, seed: int) -> pd.DataFrame: data = load_digits(as_frame=True) df = data.frame.rename(columns={"target": "digit"}) return df.sample(min(limit, len(df)), random_state=seed) def load_diabetes_df(limit: int, seed: int) -> pd.DataFrame: data = load_diabetes(as_frame=True) df = data.frame.rename(columns={"target": "disease_progression"}) return df.sample(min(limit, len(df)), random_state=seed) def load_california_housing_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(sklearn_california_housing(), limit, seed) except Exception: return load_synthetic_housing_df(limit, seed).rename(columns={"sale_price": "median_house_value"}) def load_ames_housing_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(openml_ames_housing(), limit, seed) except Exception: return load_synthetic_housing_df(limit, seed) def load_synthetic_housing_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 12000) bedrooms = rng.integers(1, 7, n) sqft = rng.normal(1750, 650, n).clip(450, 5200) age = rng.integers(0, 90, n) zipcode = rng.choice(["94016", "98101", "10011", "60614", "78704", "30309"], n) price = 120000 + sqft * rng.normal(230, 20, n) + bedrooms * 18000 - age * 1400 price += pd.Series(zipcode).map({"94016": 260000, "98101": 140000, "10011": 210000, "60614": 80000, "78704": 110000, "30309": 70000}).to_numpy() price += rng.normal(0, 45000, n) return pd.DataFrame( { "sqft": sqft.round(0), "bedrooms": bedrooms, "home_age": age, "zipcode": zipcode, "has_garage": rng.choice(["yes", "no"], n, p=[0.72, 0.28]), "sale_price": price.round(0), } ) def load_titanic_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 891) sex = rng.choice(["female", "male"], n, p=[0.38, 0.62]) pclass = rng.choice([1, 2, 3], n, p=[0.24, 0.21, 0.55]) age = rng.normal(30, 14, n).clip(0.5, 78) fare = np.exp(rng.normal(3.1, 0.85, n)) * (4 - pclass) embarked = rng.choice(["S", "C", "Q"], n, p=[0.72, 0.19, 0.09]) logit = 1.6 * (sex == "female") + 0.9 * (pclass == 1) + 0.25 * (pclass == 2) - 0.025 * age + 0.01 * fare - 1.1 survived = rng.binomial(1, 1 / (1 + np.exp(-logit))) return pd.DataFrame( { "pclass": pclass, "sex": sex, "age": age.round(1), "sibsp": rng.integers(0, 5, n), "parch": rng.integers(0, 4, n), "fare": fare.round(2), "embarked": embarked, "survived": survived, } ) def load_titanic_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(openml_titanic(), limit, seed) except Exception: return load_titanic_proxy_df(limit, seed) def load_credit_fraud_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 50000) x, y = make_classification( n_samples=n, n_features=18, n_informative=8, n_redundant=4, weights=[0.985, 0.015], class_sep=1.6, random_state=seed, ) df = pd.DataFrame(x, columns=[f"v{i}" for i in range(1, 19)]) df["amount"] = np.exp(rng.normal(3.2, 1.0, n)).round(2) df["merchant_category"] = rng.choice(["travel", "grocery", "electronics", "fuel", "cash"], n) df["fraud"] = y return df def load_credit_fraud_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_credit_card_fraud(), limit, seed) except Exception: return load_credit_fraud_proxy_df(limit, seed) def load_epirecipes_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 20000) calories = rng.gamma(4, 120, n) protein = rng.gamma(2, 12, n) fat = rng.gamma(2.5, 9, n) sodium = rng.gamma(2.4, 180, n) course = rng.choice(["main", "dessert", "side", "salad", "breakfast"], n) cuisine = rng.choice(["american", "italian", "mexican", "asian", "mediterranean"], n) rating = 2.8 + 0.12 * (course == "dessert") + 0.18 * (cuisine == "italian") - 0.00035 * sodium + rng.normal(0, 0.65, n) return pd.DataFrame( { "calories": calories.round(0), "protein": protein.round(1), "fat": fat.round(1), "sodium": sodium.round(0), "course": course, "cuisine": cuisine, "make_again": rng.choice(["yes", "no"], n, p=[0.66, 0.34]), "rating": rating.clip(0, 5).round(2), } ) def load_epirecipes_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_epirecipes(), limit, seed) except Exception: return load_epirecipes_proxy_df(limit, seed) def load_epirecipes_cakeweek_df(limit: int, seed: int) -> pd.DataFrame: try: df = kaggle_epirecipes().copy() if "cakeweek" not in df.columns: raise KeyError("cakeweek") df["cakeweek"] = pd.to_numeric(df["cakeweek"], errors="coerce").fillna(0).astype(int) return sample_df(df, limit, seed) except Exception: df = load_epirecipes_proxy_df(limit, seed).copy() df["cakeweek"] = ((df["course"] == "dessert") & (df["rating"] >= df["rating"].median())).astype(int) return df def load_candy_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 1200) chocolate = rng.binomial(1, 0.45, n) fruity = rng.binomial(1, 0.38, n) caramel = rng.binomial(1, 0.24, n) pricepercent = rng.beta(2, 4, n) sugarpercent = rng.beta(3, 2, n) winpercent = 35 + 18 * chocolate + 8 * caramel + 9 * sugarpercent - 10 * pricepercent + rng.normal(0, 8, n) return pd.DataFrame( { "chocolate": chocolate, "fruity": fruity, "caramel": caramel, "peanutyalmondy": rng.binomial(1, 0.2, n), "nougat": rng.binomial(1, 0.14, n), "crispedricewafer": rng.binomial(1, 0.16, n), "hard": rng.binomial(1, 0.28, n), "bar": rng.binomial(1, 0.36, n), "sugarpercent": sugarpercent.round(3), "pricepercent": pricepercent.round(3), "winpercent": winpercent.clip(5, 95).round(2), } ) def load_candy_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(fivethirtyeight_candy(), limit, seed) except Exception: return load_candy_proxy_df(limit, seed) def load_candy_chocolate_df(limit: int, seed: int) -> pd.DataFrame: df = load_candy_df(limit, seed).copy() if "chocolate" not in df.columns: raise gr.Error("Candy dataset does not include the chocolate target.") return df def load_adult_income_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 30000) education_num = rng.integers(6, 17, n) hours = rng.normal(40, 12, n).clip(1, 80) age = rng.normal(39, 13, n).clip(18, 75) occupation = rng.choice(["tech", "sales", "ops", "admin", "service", "exec"], n) logit = -6 + 0.16 * age + 0.36 * education_num + 0.035 * hours + 0.9 * (occupation == "exec") + 0.55 * (occupation == "tech") income = rng.binomial(1, 1 / (1 + np.exp(-logit))) return pd.DataFrame( { "age": age.round(0), "education_num": education_num, "hours_per_week": hours.round(0), "occupation": occupation, "marital_status": rng.choice(["single", "married", "divorced"], n), "income_gt_50k": income, } ) def load_adult_income_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(openml_adult_income(), limit, seed) except Exception: return load_adult_income_proxy_df(limit, seed) def load_bike_demand_proxy_df(limit: int, seed: int) -> pd.DataFrame: rng = np.random.default_rng(seed) n = min(limit, 15000) hour = rng.integers(0, 24, n) temp = rng.normal(21, 9, n).clip(-5, 40) workingday = rng.binomial(1, 0.69, n) weather = rng.choice(["clear", "mist", "rain", "storm"], n, p=[0.55, 0.28, 0.14, 0.03]) commute_peak = ((hour >= 7) & (hour <= 9)) | ((hour >= 16) & (hour <= 18)) count = 80 + 115 * commute_peak + 5.5 * temp + 45 * workingday - 75 * (weather == "rain") - 130 * (weather == "storm") count += rng.normal(0, 45, n) return pd.DataFrame({"hour": hour, "temp": temp.round(1), "workingday": workingday, "weather": weather, "rental_count": count.clip(0).round(0)}) def load_calcofi_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_calcofi(), limit, seed) except Exception: rng = np.random.default_rng(seed) n = min(limit, 12000) salinity = rng.normal(33.5, 0.7, n) depth = rng.gamma(2.0, 40.0, n) temp = 23 - 0.38 * depth / 10 - 1.7 * (salinity - 33.5) + rng.normal(0, 1.8, n) return pd.DataFrame({"salinity": salinity, "depthm": depth, "water_temperature": temp}) def load_szeged_weather_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_szeged_weather(), limit, seed) except Exception: rng = np.random.default_rng(seed) n = min(limit, 20000) humidity = rng.beta(4, 2, n) temp = rng.normal(12, 10, n) apparent = temp - 5 * humidity + rng.normal(0, 2.5, n) return pd.DataFrame({"temperature_c": temp, "humidity": humidity, "wind_speed_km_h": rng.gamma(2, 4, n), "apparent_temperature": apparent}) def load_weather_ww2_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_weather_ww2(), limit, seed) except Exception: rng = np.random.default_rng(seed) n = min(limit, 15000) mintemp = rng.normal(15, 9, n) return pd.DataFrame({"mintemp": mintemp, "precip": rng.gamma(1.5, 2, n), "max_temperature": mintemp + rng.normal(8, 3, n)}) def load_montreal_bike_lanes_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_montreal_bike_lanes(), limit, seed) except Exception: return load_bike_demand_proxy_df(limit, seed).rename(columns={"rental_count": "rider_count"}) def load_nyc_bike_crossings_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_nyc_bike_crossings(), limit, seed) except Exception: df = load_bike_demand_proxy_df(limit, seed) df["brooklyn_bridge"] = (df["rental_count"] * 0.32).round() df["manhattan_bridge"] = (df["rental_count"] * 0.27).round() df["total_bike_crossings"] = df["rental_count"] return df.drop(columns=["rental_count"]) def load_uk_road_safety_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_uk_road_safety(), limit, seed) except Exception: rng = np.random.default_rng(seed) n = min(limit, 30000) vehicles = rng.integers(1, 5, n) speed = rng.choice([20, 30, 40, 50, 60, 70], n) casualties = rng.poisson(0.4 + vehicles * 0.28 + (speed > 50) * 0.25, n) return pd.DataFrame({"number_of_vehicles": vehicles, "speed_limit": speed, "light_conditions": rng.choice(["daylight", "dark"], n), "casualty_count": casualties}) def load_kcbs_bbq_df(limit: int, seed: int) -> pd.DataFrame: try: return sample_df(kaggle_kcbs_bbq(), limit, seed) except Exception: rng = np.random.default_rng(seed) n = min(limit, 8000) score = rng.normal(165, 12, n) first = rng.binomial(1, 1 / (1 + np.exp(-(score - 184) / 5))) return pd.DataFrame({"score": score, "contest_size": rng.integers(10, 80, n), "category": rng.choice(["chicken", "ribs", "pork", "brisket"], n), "first_place": first}) DATASETS: list[DatasetSpec] = [ DatasetSpec("Titanic Survival", "classification", "survived", "OpenML data_id=40945", 1309, "Mixed categorical/numeric binary classification.", load_titanic_df), DatasetSpec("Ames Housing Prices", "regression", "sale_price", "OpenML data_id=42165", 1460, "Ames real-estate regression with neighborhood and quality features.", load_ames_housing_df), DatasetSpec("California Housing", "regression", "median_house_value", "sklearn California housing", 20640, "Block-level California housing value regression.", load_california_housing_df), DatasetSpec("Credit Card Fraud", "classification", "fraud", "KaggleHub mlg-ulb/creditcardfraud", 284807, "Large imbalanced binary fraud task.", load_credit_fraud_df), DatasetSpec("Epicurious Recipes", "regression", "rating", "KaggleHub hugodarwood/epirecipes", 20000, "Recipe nutrition and tags to rating.", load_epirecipes_df), DatasetSpec("Halloween Candy", "regression", "winpercent", "FiveThirtyEight GitHub CSV", 85, "Candy attributes to popularity score.", load_candy_df), DatasetSpec("Candy Chocolate", "classification", "chocolate", "FiveThirtyEight GitHub CSV", 85, "Predict whether a candy is chocolate from other candy attributes.", load_candy_chocolate_df), DatasetSpec("Epicurious Cakeweek", "classification", "cakeweek", "KaggleHub hugodarwood/epirecipes", 20000, "Predict cakeweek recipes from nutrition and recipe tags.", load_epirecipes_cakeweek_df), DatasetSpec("CalCOFI Ocean Temperature", "regression", "water_temperature", "KaggleHub sohier/calcofi", 864863, "Predict ocean water temperature from salinity and chemistry readings.", load_calcofi_df), DatasetSpec("Szeged Apparent Temperature", "regression", "apparent_temperature", "KaggleHub budincsevity/szeged-weather", 96453, "Predict apparent temperature from humidity, wind, pressure, and weather.", load_szeged_weather_df), DatasetSpec("WW2 Max Temperature", "regression", "max_temperature", "KaggleHub smid80/weatherww2", 119040, "Predict daily maximum temperature from minimum temperature and weather fields.", load_weather_ww2_df), DatasetSpec("Montreal Bike Lane Counts", "regression", "rider_count", "KaggleHub pablomonleon/montreal-bike-lanes", 319, "Predict rider counts on one Montreal bike path from other paths.", load_montreal_bike_lanes_df), DatasetSpec("NYC Bike Crossings", "regression", "total_bike_crossings", "KaggleHub new-york-city/nyc-east-river-bicycle-crossings", 210, "Predict total East River bicycle crossings from bridge counts.", load_nyc_bike_crossings_df), DatasetSpec("UK Road Casualties", "regression", "casualty_count", "KaggleHub bluehorseshoe/uk-2016-road-safety-data", 136621, "Predict accident casualty count from road safety fields.", load_uk_road_safety_df), DatasetSpec("KCBS BBQ First Place", "classification", "first_place", "KaggleHub jaysobel/kcbs-bbq", 1559, "Predict whether a BBQ competition team wins first place.", load_kcbs_bbq_df), DatasetSpec("Adult Income", "classification", "income_gt_50k", "OpenML data_id=1590", 48842, "Demographic and work attributes to income bucket.", load_adult_income_df), DatasetSpec("Bike Demand", "regression", "rental_count", "Kaggle-style proxy", 15000, "Weather and time features to rental demand.", load_bike_demand_proxy_df), DatasetSpec("Iris", "classification", "species", "sklearn", 150, "Classic multi-class flower classification.", load_iris_df), DatasetSpec("Wine", "classification", "wine_class", "sklearn", 178, "Chemical analysis to cultivar class.", load_wine_df), DatasetSpec("Breast Cancer", "classification", "diagnosis", "sklearn", 569, "Diagnostic measurements to benign/malignant label.", load_breast_cancer_df), DatasetSpec("Digits", "classification", "digit", "sklearn", 1797, "Pixel features to handwritten digit class.", load_digits_df), DatasetSpec("Diabetes", "regression", "disease_progression", "sklearn", 442, "Clinical variables to disease progression.", load_diabetes_df), ] def dataset_names() -> list[str]: return [d.name for d in DATASETS] def get_spec(name: str) -> DatasetSpec: return next(d for d in DATASETS if d.name == name) def get_dataset(name: str, sample_size: int, seed: int) -> pd.DataFrame: spec = get_spec(name) return spec.loader(sample_size, seed).reset_index(drop=True) def split_xy(df: pd.DataFrame, target: str) -> tuple[pd.DataFrame, pd.Series]: cleaned = df.dropna(axis=1, how="all").copy() if target not in cleaned.columns: raise gr.Error(f"Target column '{target}' was not found.") y = cleaned[target] x = cleaned.drop(columns=[target]) if x.empty: raise gr.Error("Dataset must include at least one feature column.") return x, y def coerce_numeric_target(y: pd.Series) -> pd.Series: if y.dtype.kind in "ifu": return pd.to_numeric(y, errors="coerce") cleaned = y.astype("string").str.strip().str.replace(",", "", regex=False) return pd.to_numeric(cleaned, errors="coerce") def prepare_xy(df: pd.DataFrame, target: str, task: str | None) -> tuple[pd.DataFrame, pd.Series, str]: x, raw_y = split_xy(df, target) inferred_task = task or infer_task(raw_y) y = raw_y.copy() if inferred_task == "regression": y = coerce_numeric_target(y) valid_target = y.notna() & np.isfinite(y.to_numpy(dtype=float)) if valid_target.sum() < 2: raise gr.Error("Regression target must contain at least two numeric values.") dropped = len(y) - int(valid_target.sum()) x = x.loc[valid_target].reset_index(drop=True) y = y.loc[valid_target].reset_index(drop=True) if dropped and x.empty: raise gr.Error("No usable rows remain after dropping non-numeric regression targets.") else: valid_target = raw_y.notna() if valid_target.sum() < 2: raise gr.Error("Classification target must contain at least two non-empty values.") x = x.loc[valid_target].reset_index(drop=True) y = raw_y.loc[valid_target].reset_index(drop=True) if x.empty: raise gr.Error("Dataset must include at least one feature column.") return x, y, inferred_task def infer_task(y: pd.Series) -> str: if y.dtype.kind in "ifu" and y.nunique(dropna=True) > 20: return "regression" numeric_y = coerce_numeric_target(y) non_missing = y.notna().sum() numeric_non_missing = numeric_y.notna().sum() if non_missing and numeric_non_missing / non_missing >= 0.9 and numeric_y.nunique(dropna=True) > 20: return "regression" return "classification" def make_preprocessor(x: pd.DataFrame, scale_numeric: bool = False) -> ColumnTransformer: numeric_cols = x.select_dtypes(include=np.number).columns.tolist() categorical_cols = [c for c in x.columns if c not in numeric_cols] numeric_steps: list[tuple[str, object]] = [("impute", SimpleImputer(strategy="median"))] if scale_numeric: numeric_steps.append(("scale", StandardScaler())) transformers: list[tuple[str, object, list[str]]] = [] if numeric_cols: transformers.append(("num", Pipeline(numeric_steps), numeric_cols)) if categorical_cols: transformers.append( ( "cat", Pipeline( [ ("impute", SimpleImputer(strategy="most_frequent")), ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=32)), ] ), categorical_cols, ) ) return ColumnTransformer(transformers=transformers, remainder="drop", verbose_feature_names_out=False) def available_baselines(task: str) -> dict[str, object]: if task == "classification": models: dict[str, object] = { "Logistic": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", LogisticRegression(max_iter=800))]), "RandomForest": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", RandomForestClassifier(n_estimators=80, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]), "ExtraTrees": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", ExtraTreesClassifier(n_estimators=120, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]), "GradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", GradientBoostingClassifier(n_estimators=100, learning_rate=0.06, random_state=RANDOM_STATE))]), "HistGradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", HistGradientBoostingClassifier(max_iter=120, random_state=RANDOM_STATE))]), "AdaBoost": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", AdaBoostClassifier(n_estimators=80, learning_rate=0.08, random_state=RANDOM_STATE))]), "NaiveBayes": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", GaussianNB())]), "KNN": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", KNeighborsClassifier(n_neighbors=7))]), "SVC": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", SVC(C=1.0, probability=True, random_state=RANDOM_STATE))]), "Dummy": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", DummyClassifier(strategy="most_frequent"))]), } else: models = { "LinearRegression": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", LinearRegression())]), "Ridge": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", Ridge(alpha=1.0))]), "BayesianRidge": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", BayesianRidge())]), "RandomForest": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", RandomForestRegressor(n_estimators=80, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]), "ExtraTrees": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", ExtraTreesRegressor(n_estimators=120, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE))]), "GradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", GradientBoostingRegressor(n_estimators=100, learning_rate=0.06, random_state=RANDOM_STATE))]), "HistGradientBoosting": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", HistGradientBoostingRegressor(max_iter=120, random_state=RANDOM_STATE))]), "AdaBoost": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", AdaBoostRegressor(n_estimators=80, learning_rate=0.08, random_state=RANDOM_STATE))]), "KNN": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", KNeighborsRegressor(n_neighbors=7))]), "SVR": Pipeline([("prep", make_preprocessor(pd.DataFrame(), True)), ("model", SVR(C=1.0))]), "Dummy": Pipeline([("prep", make_preprocessor(pd.DataFrame())), ("model", DummyRegressor(strategy="median"))]), } if importlib.util.find_spec("xgboost"): from xgboost import XGBClassifier, XGBRegressor if task == "classification": models["XGBoost"] = Pipeline( [ ("prep", make_preprocessor(pd.DataFrame())), ("model", XGBClassifier(n_estimators=80, max_depth=4, learning_rate=0.08, eval_metric="logloss", random_state=RANDOM_STATE)), ] ) else: models["XGBoost"] = Pipeline( [ ("prep", make_preprocessor(pd.DataFrame())), ("model", XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.08, random_state=RANDOM_STATE)), ] ) if importlib.util.find_spec("lightgbm"): try: from lightgbm import LGBMClassifier, LGBMRegressor if task == "classification": models["LightGBM"] = Pipeline( [ ("prep", make_preprocessor(pd.DataFrame())), ("model", LGBMClassifier(n_estimators=120, learning_rate=0.06, random_state=RANDOM_STATE, verbose=-1)), ] ) else: models["LightGBM"] = Pipeline( [ ("prep", make_preprocessor(pd.DataFrame())), ("model", LGBMRegressor(n_estimators=120, learning_rate=0.06, random_state=RANDOM_STATE, verbose=-1)), ] ) except Exception: pass return models def rebuild_pipeline(model: Pipeline, x_train: pd.DataFrame) -> Pipeline: pipe = clone(model) wants_scale = pipe.steps[0][1].transformers and "scale" in str(pipe.steps[0][1].transformers[0][1]) pipe.steps[0] = ("prep", make_preprocessor(x_train, scale_numeric=wants_scale)) return pipe @lru_cache(maxsize=2) def load_tabfm_model(task: str): from tabfm import tabfm_v1_0_0_pytorch model_type = "classification" if task == "classification" else "regression" return tabfm_v1_0_0_pytorch.load(model_type=model_type) def resolve_tabfm_params( preset: str, n_estimators: int, max_num_rows: int, max_num_features: int, batch_size: int, enable_nnls: bool, n_feature_crosses: str, n_svd_features: str, max_eval_rows: int, ) -> dict[str, object]: if preset in TABFM_PRESETS: return dict(TABFM_PRESETS[preset]) resolved_rows = None if max_num_rows <= 0 else int(max_num_rows) resolved = { "n_estimators": int(n_estimators), "max_num_rows": resolved_rows, "max_num_features": int(max_num_features), "batch_size": int(batch_size), "enable_nnls": bool(enable_nnls), "n_feature_crosses": 0 if n_feature_crosses == "0" else n_feature_crosses, "n_svd_features": 0 if n_svd_features == "0" else n_svd_features, "max_eval_rows": None if max_eval_rows <= 0 else int(max_eval_rows), } if resolved["enable_nnls"] and resolved["max_num_rows"] is not None: resolved["max_num_rows"] = None return resolved def run_tabfm(task: str, x_train: pd.DataFrame, x_test: pd.DataFrame, y_train: pd.Series, tabfm_params: dict[str, object]): from tabfm import TabFMClassifier, TabFMRegressor foundation_model = load_tabfm_model(task) estimator = ( TabFMClassifier(model=foundation_model, **tabfm_params) if task == "classification" else TabFMRegressor(model=foundation_model, **tabfm_params) ) estimator.fit(x_train, y_train.to_numpy()) pred = estimator.predict(x_test) proba = estimator.predict_proba(x_test) if task == "classification" and hasattr(estimator, "predict_proba") else None return pred, proba def clean_tabfm_features(x_train: pd.DataFrame, x_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: cleaned_train = pd.DataFrame(index=x_train.index) cleaned_test = pd.DataFrame(index=x_test.index) for column in x_train.columns: train_series = x_train[column].replace([np.inf, -np.inf], np.nan) test_series = x_test[column].replace([np.inf, -np.inf], np.nan) if train_series.isna().all() and test_series.isna().all(): continue if pd.api.types.is_bool_dtype(train_series): cleaned_train[column] = train_series.astype("float64").fillna(0.0) cleaned_test[column] = test_series.astype("float64").fillna(0.0) elif pd.api.types.is_numeric_dtype(train_series): train_numeric = pd.to_numeric(train_series, errors="coerce") test_numeric = pd.to_numeric(test_series, errors="coerce") fill_value = train_numeric.median() if pd.isna(fill_value): fill_value = 0.0 cleaned_train[column] = train_numeric.fillna(fill_value) cleaned_test[column] = test_numeric.fillna(fill_value) elif pd.api.types.is_datetime64_any_dtype(train_series): cleaned_train[column] = train_series.dt.strftime("%Y-%m-%d %H:%M:%S").fillna("__missing__") cleaned_test[column] = test_series.dt.strftime("%Y-%m-%d %H:%M:%S").fillna("__missing__") else: train_text = train_series.astype("string").str.strip() test_text = test_series.astype("string").str.strip() train_numeric = coerce_numeric_target(train_series) test_numeric = coerce_numeric_target(test_series) non_missing = train_text.notna() & (train_text != "") numeric_ratio = train_numeric.notna().sum() / non_missing.sum() if non_missing.sum() else 0 if numeric_ratio >= 0.9: fill_value = train_numeric.median() if pd.isna(fill_value): fill_value = 0.0 cleaned_train[column] = train_numeric.fillna(fill_value) cleaned_test[column] = test_numeric.fillna(fill_value) else: cleaned_train[column] = train_text.fillna("__missing__").replace("", "__missing__") cleaned_test[column] = test_text.fillna("__missing__").replace("", "__missing__") if cleaned_train.empty: raise gr.Error("TabFM needs at least one non-empty feature column.") return cleaned_train.reset_index(drop=True), cleaned_test.reset_index(drop=True) def tabfm_failure_note(exc: Exception) -> str: detail = f"{type(exc).__name__}: {exc}" print("TabFM failed:\n" + traceback.format_exc()) env_errors = (ImportError, ModuleNotFoundError, OSError) if isinstance(exc, env_errors): return f"TabFM did not run because the runtime could not load it: `{detail}`. On Spaces, keep Python 3.11 and allow the GitHub dependency plus model download for `{TABFM_MODEL_ID}`." return f"TabFM failed while processing this dataset: `{detail}`." def score_predictions(task: str, y_true: pd.Series, pred, proba=None) -> dict[str, float]: if task == "classification": metrics = { "accuracy": accuracy_score(y_true, pred), "f1_weighted": f1_score(y_true, pred, average="weighted", zero_division=0), } if proba is not None and len(np.unique(y_true)) == 2: try: metrics["roc_auc"] = roc_auc_score(y_true, proba[:, 1]) except Exception: metrics["roc_auc"] = np.nan else: metrics["roc_auc"] = np.nan metrics["rank_score"] = np.nanmean([metrics["accuracy"], metrics["f1_weighted"], metrics["roc_auc"]]) return metrics rmse = math.sqrt(mean_squared_error(y_true, pred)) mae = mean_absolute_error(y_true, pred) r2 = r2_score(y_true, pred) return {"rmse": rmse, "mae": mae, "r2": r2, "rank_score": -rmse} def benchmark_frame( df: pd.DataFrame, target: str, task: str | None, sample_size: int, test_size: float, seed: int, selected_models: list[str], include_tabfm: bool, tabfm_params: dict[str, object] | None = None, ) -> tuple[pd.DataFrame, pd.DataFrame, str]: df = df.sample(min(sample_size, len(df)), random_state=seed).reset_index(drop=True) x, y, task = prepare_xy(df, target, task) if task == "classification" and y.nunique(dropna=True) < 2: raise gr.Error("Classification needs at least two target classes.") stratify = y if task == "classification" and y.value_counts().min() >= 2 else None x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, stratify=stratify) rows: list[dict[str, object]] = [] notes: list[str] = [] selected_models = selected_models or [] models = available_baselines(task) for name, model in models.items(): if name not in selected_models: continue start = time.perf_counter() try: pipe = rebuild_pipeline(model, x_train) pipe.fit(x_train, y_train) pred = pipe.predict(x_test) proba = pipe.predict_proba(x_test) if task == "classification" and hasattr(pipe, "predict_proba") else None metrics = score_predictions(task, y_test, pred, proba) rows.append({"model": name, "status": "ok", "seconds": time.perf_counter() - start, **metrics}) except Exception as exc: rows.append({"model": name, "status": f"failed: {exc}", "seconds": time.perf_counter() - start}) for name in selected_models: if name not in models: rows.append({"model": name, "status": f"not compatible with {task} or unavailable", "seconds": 0.0, "rank_score": np.nan}) if include_tabfm: start = time.perf_counter() try: tabfm_params = tabfm_params or TABFM_PRESETS["Fast"] tabfm_display_params = dict(tabfm_params) tabfm_model_params = dict(tabfm_params) tabfm_eval_rows = tabfm_model_params.pop("max_eval_rows", None) if tabfm_eval_rows is not None and len(x_test) > int(tabfm_eval_rows): eval_idx = x_test.sample(int(tabfm_eval_rows), random_state=seed).index x_eval = x_test.loc[eval_idx] y_eval = y_test.loc[eval_idx] status = f"ok ({len(x_eval):,}/{len(x_test):,} test rows)" else: x_eval = x_test y_eval = y_test status = "ok" x_tab_train, x_tab_eval = clean_tabfm_features(x_train, x_eval) pred, proba = run_tabfm(task, x_tab_train, x_tab_eval, y_train, tabfm_model_params) metrics = score_predictions(task, y_eval, pred, proba) rows.append({"model": "TabFM", "status": status, "seconds": time.perf_counter() - start, **metrics}) except Exception as exc: rows.append({"model": "TabFM", "status": f"unavailable: {exc}", "seconds": time.perf_counter() - start}) notes.append(tabfm_failure_note(exc)) else: rows.append({"model": "TabFM", "status": "skipped - enable Run TabFM live", "seconds": 0.0, "rank_score": np.nan}) notes.append("TabFM is listed as skipped because **Run TabFM live** is off. Enable it to benchmark TabFM; the first run may download large model weights.") results = pd.DataFrame(rows) metric_cols = [c for c in ["accuracy", "f1_weighted", "roc_auc", "rmse", "mae", "r2", "rank_score", "seconds"] if c in results.columns] if not results.empty and "rank_score" in results.columns: results = results.sort_values("rank_score", ascending=False, na_position="last").reset_index(drop=True) results.insert(0, "rank", np.arange(1, len(results) + 1)) preview = pd.concat([x_test.reset_index(drop=True).head(12), y_test.reset_index(drop=True).head(12).rename(target)], axis=1) summary = ( f"**Task:** {task} \n" f"**Rows used:** {len(df):,} | **Train:** {len(x_train):,} | **Test:** {len(x_test):,} | **Features:** {x.shape[1]:,} \n" f"**Primary rank:** {'higher accuracy/F1/AUC' if task == 'classification' else 'lower RMSE'}" ) if notes: summary += "\n\n" + "\n".join(f"- {note}" for note in notes) if include_tabfm: summary += f"\n\n**TabFM params:** `{tabfm_display_params}`" return results[["rank", "model", "status", *metric_cols]], preview, summary def metric_chart(results: pd.DataFrame, selected_metrics: list[str] | None = None, chart_style: str = "Line") -> go.Figure: if results is None or results.empty: return go.Figure() selected_metrics = selected_metrics or METRIC_CHOICES metric_cols = [c for c in selected_metrics if c in results.columns and results[c].notna().any()] if not metric_cols: metric_cols = [c for c in METRIC_CHOICES if c in results.columns and results[c].notna().any()] if not metric_cols: return go.Figure() clean = results.sort_values("rank") if "rank" in results.columns else results.copy() clean = clean.dropna(subset=metric_cols, how="all") if clean.empty: return go.Figure() x_labels = clean["model"].astype(str).tolist() if chart_style == "Radar": normalized = clean[["model", *metric_cols]].copy() for metric in metric_cols: values = pd.to_numeric(normalized[metric], errors="coerce") lo, hi = values.min(), values.max() if pd.isna(lo) or pd.isna(hi) or hi == lo: normalized[metric] = 0.5 elif metric in {"rmse", "mae", "seconds"}: normalized[metric] = 1 - ((values - lo) / (hi - lo)) else: normalized[metric] = (values - lo) / (hi - lo) fig = go.Figure() theta = metric_cols + [metric_cols[0]] for idx, row in normalized.iterrows(): values = [row[m] for m in metric_cols] + [row[metric_cols[0]]] fig.add_trace( go.Scatterpolar( r=values, theta=theta, fill="toself", name=str(row["model"]), line=dict(color=GOOGLE_COLORS[idx % len(GOOGLE_COLORS)], width=2), opacity=0.78, ) ) fig.update_layout( template="plotly_white", height=420, margin=dict(l=35, r=35, t=35, b=25), polar=dict(radialaxis=dict(visible=True, range=[0, 1])), legend_title_text="Model", title="Normalized metric shape (higher is better)", ) return fig fig = make_subplots( rows=len(metric_cols), cols=1, shared_xaxes=True, vertical_spacing=0.08, subplot_titles=[metric.replace("_", " ").upper() for metric in metric_cols], ) for idx, metric in enumerate(metric_cols, start=1): fig.add_trace( go.Scatter( x=x_labels, y=clean[metric], mode="lines+markers", name=metric, line=dict(color=GOOGLE_COLORS[(idx - 1) % len(GOOGLE_COLORS)], width=3, shape="spline"), marker=dict(size=9, line=dict(color="white", width=1.5)), hovertemplate=f"%{{x}}
{metric}: %{{y:.4f}}", ), row=idx, col=1, ) if metric in {"accuracy", "f1_weighted", "roc_auc", "r2"}: fig.update_yaxes(range=[min(0, float(clean[metric].min()) - 0.05), 1.02], row=idx, col=1) fig.update_layout( template="plotly_white", height=max(320, 185 * len(metric_cols)), margin=dict(l=30, r=20, t=40, b=35), showlegend=False, hovermode="x unified", ) return fig def bar_chart(results: pd.DataFrame, selected_metrics: list[str] | None = None) -> go.Figure: if results is None or results.empty: return go.Figure() results = results.copy() selected_metrics = selected_metrics or METRIC_CHOICES metric_cols = [c for c in selected_metrics if c in results.columns and results[c].notna().any()] if not metric_cols: metric_cols = [c for c in METRIC_CHOICES if c in results.columns and results[c].notna().any()] if not metric_cols: return go.Figure() clean = results.sort_values("rank") if "rank" in results.columns else results.copy() clean = clean.dropna(subset=metric_cols, how="all") if clean.empty: return go.Figure() long = clean.melt(id_vars=["model"], value_vars=metric_cols, var_name="metric", value_name="score") fig = px.bar( long, x="model", y="score", color="metric", barmode="group", color_discrete_sequence=GOOGLE_COLORS, title="Grouped metric comparison", ) fig.update_layout( template="plotly_white", height=360, margin=dict(l=25, r=20, t=45, b=35), legend_title_text="Metric", hovermode="x unified", ) return fig def time_chart(results: pd.DataFrame) -> go.Figure: if results is None or results.empty or "seconds" not in results: return go.Figure() results = results.copy() if "status" in results.columns: results = results[~results["status"].astype(str).str.startswith("skipped")] if results.empty: return go.Figure() fig = px.scatter( results, x="seconds", y="model", size=np.maximum(results.get("rank_score", pd.Series([1] * len(results))).fillna(0).abs(), 0.1), color="model", color_discrete_sequence=px.colors.qualitative.Set2, ) fig.update_layout(template="plotly_white", height=300, margin=dict(l=20, r=20, t=25, b=20), showlegend=False) return fig def run_catalog( dataset_name: str, sample_size: int, test_percent: int, seed: int, selected_models: list[str], include_tabfm: bool, selected_metrics: list[str], chart_style: str, tabfm_preset: str, tabfm_n_estimators: int, tabfm_max_rows: int, tabfm_max_features: int, tabfm_batch_size: int, tabfm_enable_nnls: bool, tabfm_crosses: str, tabfm_svd: str, tabfm_max_eval_rows: int, ): spec = get_spec(dataset_name) df = get_dataset(dataset_name, sample_size, seed) tabfm_params = resolve_tabfm_params(tabfm_preset, tabfm_n_estimators, tabfm_max_rows, tabfm_max_features, tabfm_batch_size, tabfm_enable_nnls, tabfm_crosses, tabfm_svd, tabfm_max_eval_rows) results, preview, summary = benchmark_frame(df, spec.target, spec.task, sample_size, test_percent / 100, seed, selected_models, include_tabfm, tabfm_params) return summary, results.round(4), metric_chart(results, selected_metrics, chart_style), time_chart(results), bar_chart(results, selected_metrics), preview def run_upload( file, target: str, task: str, sample_size: int, test_percent: int, seed: int, selected_models: list[str], include_tabfm: bool, selected_metrics: list[str], chart_style: str, tabfm_preset: str, tabfm_n_estimators: int, tabfm_max_rows: int, tabfm_max_features: int, tabfm_batch_size: int, tabfm_enable_nnls: bool, tabfm_crosses: str, tabfm_svd: str, tabfm_max_eval_rows: int, ): if file is None: raise gr.Error("Upload a CSV file first.") df = pd.read_csv(file.name) selected_task = None if task == "Auto" else task.lower() tabfm_params = resolve_tabfm_params(tabfm_preset, tabfm_n_estimators, tabfm_max_rows, tabfm_max_features, tabfm_batch_size, tabfm_enable_nnls, tabfm_crosses, tabfm_svd, tabfm_max_eval_rows) results, preview, summary = benchmark_frame(df, target, selected_task, sample_size, test_percent / 100, seed, selected_models, include_tabfm, tabfm_params) return summary, results.round(4), metric_chart(results, selected_metrics, chart_style), time_chart(results), bar_chart(results, selected_metrics), preview def redraw_metric_chart(results: pd.DataFrame, selected_metrics: list[str], chart_style: str): if results is None or len(results) == 0: return go.Figure() return metric_chart(pd.DataFrame(results), selected_metrics, chart_style) def redraw_bar_chart(results: pd.DataFrame, selected_metrics: list[str]): if results is None or len(results) == 0: return go.Figure() return bar_chart(pd.DataFrame(results), selected_metrics) def catalog_table() -> pd.DataFrame: return pd.DataFrame( [ { "dataset": d.name, "task": d.task, "target": d.target, "rows": d.rows, "source": d.source, "description": d.description, } for d in DATASETS ] ) DEFAULT_MODELS = [ "Logistic", "LinearRegression", "Ridge", "BayesianRidge", "NaiveBayes", "RandomForest", "ExtraTrees", "GradientBoosting", "HistGradientBoosting", "AdaBoost", "KNN", "SVC", "SVR", "XGBoost", "LightGBM", "Dummy", ] DEFAULT_SELECTED_MODELS = ["HistGradientBoosting", "XGBoost", "LightGBM", "Dummy"] def build_app() -> gr.Blocks: css = """ body, .gradio-container { background: #f7f9fd; color: #101828; } .shell { max-width: 1440px; margin: 0 auto; } .hero { background: linear-gradient(135deg, #ffffff 0%, #f6f9ff 56%, #fff7ed 100%); border: 1px solid #e9edf5; border-radius: 18px; padding: 26px 28px; box-shadow: 0 20px 55px rgba(15, 23, 42, 0.07); } .hero h1 { font-size: 36px; line-height: 1.05; margin: 0 0 8px; letter-spacing: 0; } .hero p { margin: 0; color: #667085; font-size: 15px; } .stat-card { background: #fff; border: 1px solid #edf0f5; border-radius: 14px; padding: 18px; box-shadow: 0 12px 35px rgba(15, 23, 42, 0.05); min-height: 118px; } .stat-card .label { color: #667085; font-size: 13px; } .stat-card .value { font-size: 28px; font-weight: 760; margin-top: 12px; } .stat-card .trend { display: inline-block; margin-left: 8px; font-size: 12px; color: #027a48; background: #ecfdf3; border-radius: 999px; padding: 2px 8px; } .panel { background: #fff; border: 1px solid #edf0f5; border-radius: 14px; padding: 14px; box-shadow: 0 12px 35px rgba(15, 23, 42, 0.04); } .control-panel { background: linear-gradient(180deg, #ffffff 0%, #fbfcff 100%); border-top: 4px solid #f97316; } .control-panel label span { color: #344054; font-weight: 720; } .control-panel input, .control-panel textarea, .control-panel select { border-radius: 10px !important; } .control-panel .wrap { gap: 9px !important; } .control-panel .token, .control-panel [data-testid="token"] { background: #fff7ed !important; color: #c2410c !important; border: 1px solid #fed7aa !important; border-radius: 999px !important; } .control-panel .checkbox label { border-radius: 999px !important; } .gr-button-primary { background: linear-gradient(135deg, #f97316, #ea4335) !important; border-color: #f97316 !important; box-shadow: 0 12px 24px rgba(249, 115, 22, 0.25) !important; border-radius: 12px !important; min-height: 46px !important; font-weight: 760 !important; } .plot-container, .table-wrap { border-radius: 14px !important; overflow: hidden; } footer { display: none !important; } """ with gr.Blocks(title=APP_TITLE, css=css, theme=gr.themes.Soft(primary_hue="orange", secondary_hue="violet")) as demo: with gr.Column(elem_classes=["shell"]): gr.HTML( """

tabBench

A clean arena for benchmarking google/tabfm-1.0.0-pytorch against practical tabular baselines across small, classic, imbalanced, and user-uploaded datasets.

""" ) with gr.Row(): gr.HTML(f'
Benchmark catalog
{len(DATASETS)} mixed tasks
Classification + regression
') gr.HTML('
Linked HF model
TabFM 1.0
google/tabfm-1.0.0-pytorch
') gr.HTML('
User datasets
CSV upload
Pick target, task, sample size
') with gr.Tabs(): with gr.Tab("Arena"): with gr.Row(): with gr.Column(scale=1, elem_classes=["panel", "control-panel"]): dataset = gr.Dropdown(dataset_names(), value="Titanic Survival", label="Dataset") sample = gr.Slider(100, 50000, value=1000, step=100, label="Sample size") test_pct = gr.Slider(10, 40, value=25, step=5, label="Test split (%)") seed = gr.Number(value=42, precision=0, label="Random seed") models = gr.Dropdown(DEFAULT_MODELS, value=DEFAULT_SELECTED_MODELS, multiselect=True, label="Models") include_tabfm = gr.Checkbox(value=False, label="Run TabFM live (adds TabFM row)") metric_toggles = gr.Dropdown(METRIC_CHOICES, value=["rmse", "mae", "r2", "accuracy", "f1_weighted", "roc_auc"], multiselect=True, label="Chart metrics") chart_style = gr.Radio(["Line", "Radar"], value="Line", label="Chart style") with gr.Accordion("TabFM tuning", open=False): tabfm_preset = gr.Dropdown(list(TABFM_PRESETS.keys()) + ["Custom"], value="Fast", label="Preset") tabfm_n_estimators = gr.Slider(1, 32, value=1, step=1, label="Estimators") tabfm_max_rows = gr.Slider(0, 5000, value=256, step=64, label="Max context rows (0 = no cap)") tabfm_max_features = gr.Slider(8, 500, value=64, step=8, label="Max features") tabfm_batch_size = gr.Slider(1, 8, value=1, step=1, label="Batch size") tabfm_enable_nnls = gr.Checkbox(value=False, label="NNLS ensemble weights") tabfm_crosses = gr.Radio(["0", "sqrt"], value="0", label="Feature crosses") tabfm_svd = gr.Radio(["0", "sqrt"], value="0", label="SVD features") tabfm_max_eval_rows = gr.Slider(0, 5000, value=256, step=64, label="Max TabFM test rows (0 = no cap)") run_btn = gr.Button("Run benchmark", variant="primary") with gr.Column(scale=3): summary = gr.Markdown() leaderboard = gr.Dataframe(label="Leaderboard", interactive=False) bars = gr.Plot(label="Main grouped comparison") with gr.Row(): chart = gr.Plot(label="Metric comparison") speed = gr.Plot(label="Speed") preview = gr.Dataframe(label="Held-out preview", interactive=False) run_inputs = [ dataset, sample, test_pct, seed, models, include_tabfm, metric_toggles, chart_style, tabfm_preset, tabfm_n_estimators, tabfm_max_rows, tabfm_max_features, tabfm_batch_size, tabfm_enable_nnls, tabfm_crosses, tabfm_svd, tabfm_max_eval_rows, ] run_outputs = [summary, leaderboard, chart, speed, bars, preview] run_btn.click(run_catalog, run_inputs, run_outputs) metric_toggles.change(redraw_metric_chart, [leaderboard, metric_toggles, chart_style], chart) metric_toggles.change(redraw_bar_chart, [leaderboard, metric_toggles], bars) chart_style.change(redraw_metric_chart, [leaderboard, metric_toggles, chart_style], chart) demo.load(run_catalog, run_inputs, run_outputs) with gr.Tab("Upload Dataset"): with gr.Row(): with gr.Column(scale=1, elem_classes=["panel", "control-panel"]): file = gr.File(label="CSV file", file_types=[".csv"]) target = gr.Textbox(label="Target column") task = gr.Radio(["Auto", "Classification", "Regression"], value="Auto", label="Task") upload_sample = gr.Slider(100, 50000, value=1000, step=100, label="Sample size") upload_test_pct = gr.Slider(10, 40, value=25, step=5, label="Test split (%)") upload_seed = gr.Number(value=42, precision=0, label="Random seed") upload_models = gr.Dropdown(DEFAULT_MODELS, value=DEFAULT_SELECTED_MODELS, multiselect=True, label="Models") upload_tabfm = gr.Checkbox(value=False, label="Run TabFM live (adds TabFM row)") upload_metric_toggles = gr.Dropdown(METRIC_CHOICES, value=["rmse", "mae", "r2", "accuracy", "f1_weighted", "roc_auc"], multiselect=True, label="Chart metrics") upload_chart_style = gr.Radio(["Line", "Radar"], value="Line", label="Chart style") with gr.Accordion("TabFM tuning", open=False): upload_tabfm_preset = gr.Dropdown(list(TABFM_PRESETS.keys()) + ["Custom"], value="Fast", label="Preset") upload_tabfm_n_estimators = gr.Slider(1, 32, value=1, step=1, label="Estimators") upload_tabfm_max_rows = gr.Slider(0, 5000, value=256, step=64, label="Max context rows (0 = no cap)") upload_tabfm_max_features = gr.Slider(8, 500, value=64, step=8, label="Max features") upload_tabfm_batch_size = gr.Slider(1, 8, value=1, step=1, label="Batch size") upload_tabfm_enable_nnls = gr.Checkbox(value=False, label="NNLS ensemble weights") upload_tabfm_crosses = gr.Radio(["0", "sqrt"], value="0", label="Feature crosses") upload_tabfm_svd = gr.Radio(["0", "sqrt"], value="0", label="SVD features") upload_tabfm_max_eval_rows = gr.Slider(0, 5000, value=256, step=64, label="Max TabFM test rows (0 = no cap)") upload_btn = gr.Button("Run uploaded dataset", variant="primary") with gr.Column(scale=3): upload_summary = gr.Markdown() upload_leaderboard = gr.Dataframe(label="Upload leaderboard", interactive=False) upload_bars = gr.Plot(label="Main grouped comparison") with gr.Row(): upload_chart = gr.Plot(label="Metric comparison") upload_speed = gr.Plot(label="Speed") upload_preview = gr.Dataframe(label="Held-out preview", interactive=False) upload_btn.click( run_upload, [ file, target, task, upload_sample, upload_test_pct, upload_seed, upload_models, upload_tabfm, upload_metric_toggles, upload_chart_style, upload_tabfm_preset, upload_tabfm_n_estimators, upload_tabfm_max_rows, upload_tabfm_max_features, upload_tabfm_batch_size, upload_tabfm_enable_nnls, upload_tabfm_crosses, upload_tabfm_svd, upload_tabfm_max_eval_rows, ], [upload_summary, upload_leaderboard, upload_chart, upload_speed, upload_bars, upload_preview], ) upload_metric_toggles.change(redraw_metric_chart, [upload_leaderboard, upload_metric_toggles, upload_chart_style], upload_chart) upload_metric_toggles.change(redraw_bar_chart, [upload_leaderboard, upload_metric_toggles], upload_bars) upload_chart_style.change(redraw_metric_chart, [upload_leaderboard, upload_metric_toggles, upload_chart_style], upload_chart) with gr.Tab("Dataset Catalog"): gr.Dataframe(catalog_table(), interactive=False, label="Included benchmark catalog") gr.Markdown( """ Most catalog datasets are loaded from OpenML, KaggleHub, FiveThirtyEight GitHub data, or sklearn. Each remote loader has a fallback so the Space remains usable if an upstream dataset is temporarily unavailable. """ ) with gr.Tab("Implementation Notes"): gr.Markdown( """ This Space declares `models: google/tabfm-1.0.0-pytorch` in its README metadata, which is what Hugging Face uses to associate Spaces with model pages. TabFM is attempted only when **Run TabFM live** is enabled because the first run downloads large model weights and CPU Basic inference can be slow. Use the **Fast** preset for a quick smoke test, then increase estimators/context rows for stronger but slower runs. The TabFM integration follows the Google Research README pattern: load `tabfm_v1_0_0_pytorch`, wrap it with `TabFMClassifier` or `TabFMRegressor`, call `fit` for context preparation, then `predict`. """ ) return demo if __name__ == "__main__": build_app().queue(max_size=16).launch()