"""
PredictLM Playground — Gradio demo for predictlm-mini-13m.

Upload a CSV → pick target column → get predictions on a held-out split.
Single-model fast path (no Duo, no TTT). For the full 0.751/0.609 recipe,
see `pip install predictlm`.
"""

import os

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

from predictlm import PredictLM

# Load model once at startup. First request after cold-start downloads the
# 54MB Mini checkpoint; cached for subsequent requests.
print("Loading predictlm-mini-13m (single-model mode for speed)...")
MODEL = PredictLM.from_pretrained(
    "zerooneresearch/predictlm-mini-13m",
    device="cpu",
    auto_duo=False,
)
print("Model loaded.")

# CSVs with more rows than this are randomly down-sampled in load_csv().
MAX_ROWS = 1100
# run_prediction() refuses to run with more numeric feature columns than this.
MAX_FEATURES = 128

# Label shown in the example dropdown -> CSV path relative to the working dir.
EXAMPLE_DATASETS = {
    "Breast cancer (classification, 569 rows × 30 features)": "examples/breast_cancer.csv",
    "California housing (regression, 1000 rows × 8 features)": "examples/california_housing.csv",
}

# Status text shown before any data is loaded (also the initial UI value).
_INITIAL_STATUS = "_Upload a CSV (or pick an example below) to start._"


def _empty_target_dropdown():
    """Return a cleared, disabled replacement for the target-column dropdown."""
    # interactive=False is passed explicitly so every "nothing loaded" path
    # leaves the dropdown disabled, including after an earlier successful load.
    return gr.Dropdown(choices=[], value=None, interactive=False)


def load_csv(file) -> tuple:
    """Read a CSV and prepare the target-column dropdown.

    Args:
        file: Value handed to ``pd.read_csv`` unchanged (the ``gr.File``
            component's value, or a path from ``load_example``). ``None``
            means nothing is selected.

    Returns:
        A 3-tuple ``(df, dropdown, status)``: the loaded DataFrame (``None``
        when nothing could be loaded), a ``gr.Dropdown`` describing the new
        target-column choices, and a markdown status message.
    """
    if file is None:
        return None, _empty_target_dropdown(), _INITIAL_STATUS

    try:
        df = pd.read_csv(file)
    except Exception as e:  # UI boundary: report any read/parse failure as text
        return None, _empty_target_dropdown(), f"❌ Could not read CSV: {e}"

    cols = list(df.columns)
    if not cols:
        # Guard so the cols[-1] default below can never raise IndexError.
        return None, _empty_target_dropdown(), "❌ Could not read CSV: no columns found"

    sample_note = ""
    if len(df) > MAX_ROWS:
        # Fixed seed keeps the down-sampled subset reproducible across reloads.
        df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
        sample_note = f" (sampled to {MAX_ROWS} rows for speed)"

    return (
        df,
        gr.Dropdown(choices=cols, value=cols[-1], interactive=True),
        f"✅ Loaded {len(df)} rows × {len(cols)} columns{sample_note}. "
        f"Default target is the last column — change it if needed.",
    )


def load_example(name):
    """Load one of the bundled example datasets by its dropdown label.

    Args:
        name: A key of ``EXAMPLE_DATASETS``; a falsy value means the example
            dropdown has no selection.

    Returns:
        The same ``(df, dropdown, status)`` 3-tuple as ``load_csv``.
    """
    if not name:
        # No selection: fall back to the initial prompt instead of reporting
        # a missing file called "None".
        return load_csv(None)

    path = EXAMPLE_DATASETS.get(name)
    if not path or not os.path.exists(path):
        return None, _empty_target_dropdown(), f"Example file not found: {path}"
    return load_csv(path)


def _numeric_feature_columns(df, target_col):
    """Return the names of all numeric columns of *df* other than *target_col*."""
    return [
        col
        for col in df.columns
        if col != target_col and pd.api.types.is_numeric_dtype(df[col])
    ]


def _is_regression_target(target):
    """Return True when *target* is numeric with more than 10 distinct values."""
    return pd.api.types.is_numeric_dtype(target) and target.nunique() > 10


def run_prediction(df, target_col, test_frac):
    """Fit the model on a train split and evaluate it on a held-out split.

    Rows with a missing target are dropped, the remaining rows are shuffled
    with a fixed seed, and the last ``max(5, int(n * test_frac))`` rows are
    held out. Only numeric feature columns are used.

    Args:
        df: DataFrame produced by ``load_csv`` (or ``None`` if nothing is loaded).
        target_col: Name of the column to predict.
        test_frac: Fraction of rows to hold out for evaluation.

    Returns:
        ``(summary, table)``: a markdown summary and a DataFrame with the
        first 50 held-out rows (actual, predicted, plus ``error`` or
        ``correct``). On any failure the summary carries the message and the
        table is ``None``.
    """
    if df is None or target_col is None:
        return "_Load a CSV first._", None
    if target_col not in df.columns:
        return f"❌ Target column **{target_col}** not in CSV.", None

    df = df.dropna(subset=[target_col]).copy()
    n = len(df)
    if n < 20:
        return f"❌ Need at least 20 rows after dropping NA target. Got {n}.", None

    numeric_feats = _numeric_feature_columns(df, target_col)
    if not numeric_feats:
        return (
            "❌ No numeric feature columns found. PredictLM v1 expects numeric features "
            "(encode categoricals first).",
            None,
        )
    if len(numeric_feats) > MAX_FEATURES:
        return (
            f"❌ PredictLM v1 supports ≤{MAX_FEATURES} features. CSV has "
            f"{len(numeric_feats)} numeric features.",
            None,
        )

    # n >= 20 here, so a 5-row minimum test split always leaves training rows.
    test_n = max(5, int(n * test_frac))
    train_n = n - test_n

    # Shuffle with a fixed seed so repeated runs give the same split.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    train_df = df.iloc[:train_n]
    test_df = df.iloc[train_n:]

    X_train = train_df[numeric_feats].values.astype(np.float32)
    y_train = train_df[target_col].values
    X_test = test_df[numeric_feats].values.astype(np.float32)
    y_test = test_df[target_col].values

    try:
        preds = MODEL.fit(X_train, y_train).predict(X_test)
    except Exception as e:  # UI boundary: show model failures as a message
        return f"❌ Prediction error: {e}", None

    # Building the table and the metrics can fail too (e.g. label/prediction
    # types that the metric rejects); report that the same way as other errors.
    try:
        result = pd.DataFrame(
            {
                "actual": y_test,
                "predicted": preds,
            }
        )

        if _is_regression_target(df[target_col]):
            # Regression
            r2 = r2_score(y_test, preds)
            mae = mean_absolute_error(y_test, preds)
            result["error"] = (result["actual"] - result["predicted"]).round(4)
            summary = (
                f"**Regression** · {len(numeric_feats)} features · n_train = {train_n} · "
                f"n_test = {test_n}\n\n"
                f"R² = **{r2:.3f}** · MAE = **{mae:.3f}**\n\n"
                f"_Single-model fast path. Full Duo + TTT recipe averages 0.609 R² across_ "
                f"_25 OpenML regression datasets._"
            )
        else:
            # Classification
            acc = accuracy_score(y_test, preds)
            result["correct"] = result["actual"] == result["predicted"]
            # Count classes over the whole (NA-dropped) target rather than the
            # held-out rows only, so a small test split does not under-report.
            n_classes = df[target_col].nunique()
            summary = (
                f"**Classification** · {len(numeric_feats)} features · {n_classes} classes · "
                f"n_train = {train_n} · n_test = {test_n}\n\n"
                f"Accuracy = **{acc:.3f}**\n\n"
                f"_Single-model fast path. Full Duo + TTT recipe averages 0.751 accuracy_ "
                f"_across 25 OpenML classification datasets._"
            )
    except (ValueError, TypeError) as e:
        return f"❌ Could not score predictions: {e}", None

    return summary, result.head(50)


# Markdown shown above the UI. The line breaks matter: the heading, the intro
# paragraph and the blockquote must each start on their own line to render.
HEADER = """
# PredictLM Playground

Upload a CSV, pick a target column, and run **predictlm-mini-13m** on it — a 13M-parameter
open-weight tabular foundation model, Apache-2.0.

> **Note**: This Space runs single-model fast-path (no Duo + TTT) for snappy responses.
> Local Python with `pip install predictlm` gets the full 0.751 / 0.609 OpenML numbers.
"""

# Markdown shown below the UI; `---` needs its own line to render as a rule.
FOOTER = """
---

[Model card](https://huggingface.co/zerooneresearch/predictlm-mini-13m) ·
[PyPI `pip install predictlm`](https://pypi.org/project/predictlm/) ·
[Source on GitHub](https://github.com/matej-01RAI/predictlm-mcp) ·
[Org](https://huggingface.co/zerooneresearch)

PredictLM is built by [Zero One Research](https://huggingface.co/zerooneresearch),
an independent AI lab in Bratislava, EU.
"""

with gr.Blocks(title="PredictLM Playground", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)

    # Holds the currently loaded DataFrame between events.
    df_state = gr.State(None)

    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column(scale=1):
            file = gr.File(label="Upload CSV", file_types=[".csv"])
            example = gr.Dropdown(
                choices=list(EXAMPLE_DATASETS.keys()),
                label="…or pick a built-in example",
                value=None,
            )
            target = gr.Dropdown(label="Target column", choices=[], interactive=False)
            test_frac = gr.Slider(
                0.1,
                0.5,
                value=0.2,
                step=0.05,
                label="Test fraction (held-out for evaluation)",
            )
            run = gr.Button("Predict", variant="primary", size="lg")
            status = gr.Markdown(_INITIAL_STATUS)

        # Right column: metric summary and prediction table.
        with gr.Column(scale=2):
            summary = gr.Markdown("_Predictions will appear here._")
            results = gr.Dataframe(
                label="Predictions (first 50 rows of test split)",
                interactive=False,
                wrap=True,
            )

    # Both loaders write the same three outputs, so whichever input changed
    # last decides the loaded data.
    file.change(load_csv, inputs=[file], outputs=[df_state, target, status])
    example.change(load_example, inputs=[example], outputs=[df_state, target, status])
    run.click(
        run_prediction,
        inputs=[df_state, target, test_frac],
        outputs=[summary, results],
    )

    gr.Markdown(FOOTER)


if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        show_error=True,
    )