# File size: 7,420 Bytes · 9e6a0bb f2c3b11
"""
PredictLM Playground — Gradio demo for predictlm-mini-13m.
Upload a CSV → pick target column → get predictions on a held-out split.
Single-model fast path (no Duo, no TTT). For the full 0.751/0.609 recipe,
see `pip install predictlm`.
"""
import os
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score
from predictlm import PredictLM
# A single shared model instance is created when the module is imported and
# reused by every request. Per the original note, a cold start has to fetch
# the 54MB Mini checkpoint; afterwards it is served from cache.
print("Loading predictlm-mini-13m (single-model mode for speed)...")
MODEL = PredictLM.from_pretrained(
    "zerooneresearch/predictlm-mini-13m", device="cpu", auto_duo=False
)
print("Model loaded.")
# Row cap applied on load: larger CSVs are randomly down-sampled to this size.
MAX_ROWS = 1_100

# Upper bound on numeric feature columns; prediction is refused above it.
MAX_FEATURES = 128

# Dropdown label -> path of the bundled example CSV.
EXAMPLE_DATASETS = {
    "Breast cancer (classification, 569 rows × 30 features)": (
        "examples/breast_cancer.csv"
    ),
    "California housing (regression, 1000 rows × 8 features)": (
        "examples/california_housing.csv"
    ),
}
def load_csv(file) -> tuple:
    """Read a CSV and prepare the target-column dropdown for it.

    Args:
        file: Value handed straight to ``pd.read_csv`` (the upload widget's
            value, or a path string when called from ``load_example``).
            ``None`` means nothing is selected.

    Returns:
        A 3-tuple ``(df, dropdown, status)``:
        ``df`` is the loaded DataFrame (randomly down-sampled to ``MAX_ROWS``
        rows when larger) or ``None`` when nothing usable was loaded;
        ``dropdown`` is a ``gr.Dropdown`` listing the columns with the last
        one preselected, or an empty non-interactive one on failure;
        ``status`` is a Markdown message for the user.
    """
    if file is None:
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            "_Upload a CSV (or pick an example below) to start._",
        )

    try:
        df = pd.read_csv(file)
    except Exception as e:  # UI boundary: report any parse/IO failure as text
        # Lock the dropdown here too, exactly like the "no file" case, so the
        # target selector is never left interactive with no data behind it.
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"❌ Could not read CSV: {e}",
        )

    cols = list(df.columns)
    if not cols:
        # Guard the cols[-1] default below against a column-less frame.
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            "❌ CSV has no columns.",
        )

    sample_note = ""
    if len(df) > MAX_ROWS:
        # Fixed seed keeps the down-sampled subset reproducible across loads.
        df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
        sample_note = f" (sampled to {MAX_ROWS} rows for speed)"

    return (
        df,
        gr.Dropdown(choices=cols, value=cols[-1], interactive=True),
        f"✅ Loaded {len(df)} rows × {len(cols)} columns{sample_note}. "
        f"Default target is the last column — change it if needed.",
    )
def load_example(name):
    """Load one of the bundled example datasets by its dropdown label.

    Args:
        name: A key of ``EXAMPLE_DATASETS``, or a falsy value when the example
            dropdown has no selection.

    Returns:
        The same ``(df, dropdown, status)`` 3-tuple that ``load_csv`` returns.
    """
    if not name:
        # Nothing selected: show the neutral "nothing loaded" state instead of
        # the misleading "Example file not found: None".
        return load_csv(None)

    path = EXAMPLE_DATASETS.get(name)
    if not path or not os.path.exists(path):
        return (
            None,
            # Locked, matching the other "no data" states of the UI.
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"Example file not found: {path}",
        )
    return load_csv(path)
def run_prediction(df, target_col, test_frac):
    """Fit the model on a shuffled train split and score the held-out rows.

    Args:
        df: DataFrame kept in the UI state, or ``None`` when nothing is loaded.
        target_col: Name of the column to predict.
        test_frac: Fraction of rows held out for evaluation; at least five
            rows are always held out.

    Returns:
        ``(summary, table)``. ``summary`` is a Markdown string with either an
        error message or the metrics. ``table`` is the first 50 rows of an
        actual-vs-predicted DataFrame, or ``None`` when validation or
        prediction failed.
    """
    if df is None or target_col is None:
        return "_Load a CSV first._", None
    if target_col not in df.columns:
        return f"❌ Target column **{target_col}** not in CSV.", None

    # Rows without a label can neither be trained on nor scored.
    df = df.dropna(subset=[target_col]).copy()
    n = len(df)
    if n < 20:
        return f"❌ Need at least 20 rows after dropping NA target. Got {n}.", None

    # Only numeric columns are passed to the model; everything else is ignored.
    numeric_feats = [
        c
        for c in df.columns
        if c != target_col and pd.api.types.is_numeric_dtype(df[c])
    ]
    if not numeric_feats:
        return (
            "❌ No numeric feature columns found. PredictLM v1 expects numeric features "
            "(encode categoricals first).",
            None,
        )
    if len(numeric_feats) > MAX_FEATURES:
        return (
            f"❌ PredictLM v1 supports ≤{MAX_FEATURES} features. CSV has "
            f"{len(numeric_feats)} numeric features.",
            None,
        )

    # Task heuristic: a numeric target with more than 10 distinct values is
    # reported as regression, anything else as classification. Distinct values
    # are counted over every labelled row, so the class count shown to the
    # user does not depend on which rows land in the small held-out split.
    n_distinct = df[target_col].nunique()
    is_regression = pd.api.types.is_numeric_dtype(df[target_col]) and n_distinct > 10

    test_n = max(5, int(n * test_frac))
    train_n = n - test_n
    # Fixed seed: the same CSV always yields the same train/test partition.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    train_df = df.iloc[:train_n]
    test_df = df.iloc[train_n:]

    X_train = train_df[numeric_feats].values.astype(np.float32)
    y_train = train_df[target_col].values
    X_test = test_df[numeric_feats].values.astype(np.float32)
    y_test = test_df[target_col].values

    try:
        preds = MODEL.fit(X_train, y_train).predict(X_test)
    except Exception as e:  # UI boundary: surface model failures as text
        return f"❌ Prediction error: {e}", None

    result = pd.DataFrame({"actual": y_test, "predicted": preds})

    if is_regression:
        r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        result["error"] = (result["actual"] - result["predicted"]).round(4)
        summary = (
            f"**Regression** · {len(numeric_feats)} features · n_train = {train_n} · "
            f"n_test = {test_n}\n\n"
            f"R² = **{r2:.3f}** · MAE = **{mae:.3f}**\n\n"
            f"_Single-model fast path. Full Duo + TTT recipe averages 0.609 R² across_ "
            f"_25 OpenML regression datasets._"
        )
    else:
        acc = accuracy_score(y_test, preds)
        result["correct"] = result["actual"] == result["predicted"]
        n_classes = n_distinct
        summary = (
            f"**Classification** · {len(numeric_feats)} features · {n_classes} classes · "
            f"n_train = {train_n} · n_test = {test_n}\n\n"
            f"Accuracy = **{acc:.3f}**\n\n"
            f"_Single-model fast path. Full Duo + TTT recipe averages 0.751 accuracy_ "
            f"_across 25 OpenML classification datasets._"
        )

    return summary, result.head(50)
# Markdown rendered at the top of the page via gr.Markdown(HEADER).
HEADER = """
# PredictLM Playground
Upload a CSV, pick a target column, and run **predictlm-mini-13m** on it — a 13M-parameter open-weight tabular foundation model, Apache-2.0.
> **Note**: This Space runs single-model fast-path (no Duo + TTT) for snappy responses. Local Python with `pip install predictlm` gets the full 0.751 / 0.609 OpenML numbers.
"""
# Markdown rendered at the bottom of the page via gr.Markdown(FOOTER):
# a horizontal rule, project links, and an attribution line.
FOOTER = """
---
[Model card](https://huggingface.co/zerooneresearch/predictlm-mini-13m) · [PyPI `pip install predictlm`](https://pypi.org/project/predictlm/) · [Source on GitHub](https://github.com/matej-01RAI/predictlm-mcp) · [Org](https://huggingface.co/zerooneresearch)
PredictLM is built by [Zero One Research](https://huggingface.co/zerooneresearch), an independent AI lab in Bratislava, EU.
"""
# Page layout: inputs and status in the narrow left column, metrics and the
# prediction table in the wide right column, footer underneath.
with gr.Blocks(title="PredictLM Playground", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)

    # Carries the loaded DataFrame from the load callbacks to the predict one.
    df_state = gr.State(None)

    with gr.Row():
        with gr.Column(scale=1):
            file = gr.File(label="Upload CSV", file_types=[".csv"])
            example = gr.Dropdown(
                label="…or pick a built-in example",
                choices=list(EXAMPLE_DATASETS),
                value=None,
            )
            target = gr.Dropdown(
                label="Target column",
                choices=[],
                interactive=False,
            )
            test_frac = gr.Slider(
                minimum=0.1,
                maximum=0.5,
                value=0.2,
                step=0.05,
                label="Test fraction (held-out for evaluation)",
            )
            run = gr.Button("Predict", variant="primary", size="lg")
            status = gr.Markdown("_Upload a CSV (or pick an example below) to start._")

        with gr.Column(scale=2):
            summary = gr.Markdown("_Predictions will appear here._")
            results = gr.Dataframe(
                label="Predictions (first 50 rows of test split)",
                interactive=False,
                wrap=True,
            )

    # Both loaders feed the same three outputs: state, target picker, status.
    file.change(
        fn=load_csv,
        inputs=[file],
        outputs=[df_state, target, status],
    )
    example.change(
        fn=load_example,
        inputs=[example],
        outputs=[df_state, target, status],
    )
    run.click(
        fn=run_prediction,
        inputs=[df_state, target, test_frac],
        outputs=[summary, results],
    )

    gr.Markdown(FOOTER)
if __name__ == "__main__":
    # Listen on all interfaces; the port is taken from $PORT, else 7860.
    port = int(os.environ.get("PORT", 7860))
    demo.queue().launch(server_name="0.0.0.0", server_port=port, show_error=True)
|