Spaces:

zerooneresearch
/

predictlm-playground

Sleeping

File size: 7,420 Bytes

"""
PredictLM Playground — Gradio demo for predictlm-mini-13m.

Upload a CSV → pick target column → get predictions on a held-out split.
Single-model fast path (no Duo, no TTT). For the full recipe (0.751 mean
accuracy / 0.609 mean R² across 25 OpenML datasets per task), see
`pip install predictlm`.
"""

import os

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

from predictlm import PredictLM

# Load the model once, at import time, so every request reuses one instance.
# The 54MB Mini checkpoint is downloaded after a cold start and cached for
# later use.
# NOTE(review): the call below runs at startup, so the download presumably
# happens here rather than on the first request — confirm against
# PredictLM.from_pretrained.
print("Loading predictlm-mini-13m (single-model mode for speed)...")
MODEL = PredictLM.from_pretrained(
    "zerooneresearch/predictlm-mini-13m",
    device="cpu",  # inference is pinned to CPU
    auto_duo=False,  # presumably turns off the "Duo" path named in the module docstring — confirm
)
print("Model loaded.")


# CSVs with more rows than this are randomly down-sampled in load_csv.
MAX_ROWS = 1100
# run_prediction rejects inputs with more numeric feature columns than this.
MAX_FEATURES = 128
# Dropdown label -> CSV path for the bundled examples. The paths are relative,
# so they resolve against the process working directory.
EXAMPLE_DATASETS = {
    "Breast cancer (classification, 569 rows × 30 features)": "examples/breast_cancer.csv",
    "California housing (regression, 1000 rows × 8 features)": "examples/california_housing.csv",
}


def load_csv(file) -> tuple:
    """Read a CSV and prepare the target-column dropdown for it.

    Args:
        file: Anything ``pd.read_csv`` accepts (the upload widget's value or a
            path string), or ``None`` when nothing is selected.

    Returns:
        A 3-tuple ``(df, dropdown, status)``:

        * ``df`` — the loaded frame (down-sampled to ``MAX_ROWS`` rows when
          larger), or ``None`` when nothing could be loaded;
        * ``dropdown`` — a ``gr.Dropdown`` update listing the columns with the
          last one preselected, or an empty, non-interactive dropdown when
          there is no data;
        * ``status`` — a markdown status message for the user.
    """
    if file is None:
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            "_Upload a CSV (or pick an example below) to start._",
        )
    try:
        df = pd.read_csv(file)
    except Exception as e:  # UI boundary: surface any parse/IO failure as a message
        # Disable the dropdown explicitly, exactly like the "no file" branch.
        # Otherwise a failed upload that follows a successful one would leave
        # the target selector interactive with nothing to choose from.
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"❌ Could not read CSV: {e}",
        )

    if len(df) > MAX_ROWS:
        # Fixed seed so the same upload always yields the same subsample.
        df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
        sample_note = f" (sampled to {MAX_ROWS} rows for speed)"
    else:
        sample_note = ""

    cols = list(df.columns)
    return (
        df,
        gr.Dropdown(choices=cols, value=cols[-1], interactive=True),
        f"✅ Loaded {len(df)} rows × {len(cols)} columns{sample_note}. "
        f"Default target is the last column — change it if needed.",
    )


def load_example(name):
    """Load the bundled example CSV chosen in the examples dropdown.

    Args:
        name: A key of ``EXAMPLE_DATASETS``, or ``None``/empty when the
            dropdown has no selection.

    Returns:
        The same ``(df, dropdown, status)`` triple that ``load_csv`` returns.
    """
    if not name:
        # Nothing selected: show the idle prompt rather than the misleading
        # "Example file not found: None".
        return load_csv(None)

    path = EXAMPLE_DATASETS.get(name)
    if not path or not os.path.exists(path):
        # Disable the target dropdown explicitly so a failed lookup cannot
        # leave it interactive with an empty choice list.
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"Example file not found: {path}",
        )
    return load_csv(path)


def run_prediction(df, target_col, test_frac):
    """Fit the shared model on a train split and score it on a held-out split.

    Args:
        df: The loaded DataFrame, or ``None`` when no CSV has been loaded.
        target_col: Name of the column to predict, or ``None``.
        test_frac: Fraction of rows held out for evaluation (at least 5 rows
            are always held out).

    Returns:
        ``(summary, table)``: a markdown summary and a DataFrame holding the
        first 50 test rows with actual/predicted values. On any validation or
        prediction failure, ``summary`` is the error message and ``table`` is
        ``None``.
    """
    # --- input validation -------------------------------------------------
    if df is None or target_col is None:
        return "_Load a CSV first._", None
    if target_col not in df.columns:
        return f"❌ Target column **{target_col}** not in CSV.", None

    labelled = df.dropna(subset=[target_col]).copy()
    n_rows = len(labelled)
    if n_rows < 20:
        return f"❌ Need at least 20 rows after dropping NA target. Got {n_rows}.", None

    # Only numeric, non-target columns are fed to the model.
    numeric_feats = [
        col
        for col in labelled.columns
        if col != target_col and pd.api.types.is_numeric_dtype(labelled[col])
    ]
    n_feats = len(numeric_feats)

    if n_feats == 0:
        return (
            "❌ No numeric feature columns found. PredictLM v1 expects numeric features "
            "(encode categoricals first).",
            None,
        )
    if n_feats > MAX_FEATURES:
        return (
            f"❌ PredictLM v1 supports ≤{MAX_FEATURES} features. CSV has "
            f"{n_feats} numeric features.",
            None,
        )

    # --- deterministic shuffle + split ------------------------------------
    test_n = max(5, int(n_rows * test_frac))
    train_n = n_rows - test_n

    shuffled = labelled.sample(frac=1, random_state=42).reset_index(drop=True)
    train_part = shuffled.iloc[:train_n]
    test_part = shuffled.iloc[train_n:]

    def _features_and_target(part):
        # float32 feature matrix plus the raw target values for one split
        return part[numeric_feats].values.astype(np.float32), part[target_col].values

    X_train, y_train = _features_and_target(train_part)
    X_test, y_test = _features_and_target(test_part)

    # --- fit / predict ----------------------------------------------------
    try:
        preds = MODEL.fit(X_train, y_train).predict(X_test)
    except Exception as e:
        return f"❌ Prediction error: {e}", None

    result = pd.DataFrame({"actual": y_test, "predicted": preds})

    # --- scoring ----------------------------------------------------------
    # Heuristic: a numeric target with more than 10 distinct values is scored
    # as regression; everything else is scored as classification.
    target_values = shuffled[target_col]
    is_regression = (
        pd.api.types.is_numeric_dtype(target_values) and target_values.nunique() > 10
    )

    if not is_regression:
        acc = accuracy_score(y_test, preds)
        result["correct"] = result["actual"] == result["predicted"]
        n_classes = pd.Series(y_test).nunique()
        summary = "\n\n".join(
            [
                f"**Classification** · {n_feats} features · {n_classes} classes · "
                f"n_train = {train_n} · n_test = {test_n}",
                f"Accuracy = **{acc:.3f}**",
                "_Single-model fast path. Full Duo + TTT recipe averages 0.751 accuracy_ "
                "_across 25 OpenML classification datasets._",
            ]
        )
    else:
        r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        result["error"] = (result["actual"] - result["predicted"]).round(4)
        summary = "\n\n".join(
            [
                f"**Regression** · {n_feats} features · n_train = {train_n} · "
                f"n_test = {test_n}",
                f"R² = **{r2:.3f}** · MAE = **{mae:.3f}**",
                "_Single-model fast path. Full Duo + TTT recipe averages 0.609 R² across_ "
                "_25 OpenML regression datasets._",
            ]
        )

    return summary, result.head(50)


# Markdown rendered at the top of the page (see gr.Markdown(HEADER) in the
# Blocks layout).
HEADER = """
# PredictLM Playground

Upload a CSV, pick a target column, and run **predictlm-mini-13m** on it — a 13M-parameter open-weight tabular foundation model, Apache-2.0.

> **Note**: This Space runs single-model fast-path (no Duo + TTT) for snappy responses. Local Python with `pip install predictlm` gets the full 0.751 / 0.609 OpenML numbers.
"""

# Markdown rendered at the bottom of the page: project links and attribution.
FOOTER = """
---

[Model card](https://huggingface.co/zerooneresearch/predictlm-mini-13m) · [PyPI `pip install predictlm`](https://pypi.org/project/predictlm/) · [Source on GitHub](https://github.com/matej-01RAI/predictlm-mcp) · [Org](https://huggingface.co/zerooneresearch)

PredictLM is built by [Zero One Research](https://huggingface.co/zerooneresearch), an independent AI lab in Bratislava, EU.
"""


# UI definition: a two-column page with the inputs on the left and the
# prediction summary/table on the right, followed by the event wiring.
with gr.Blocks(title="PredictLM Playground", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)

    # Holds the DataFrame produced by load_csv/load_example between events.
    df_state = gr.State(None)

    with gr.Row():
        # Left column: data source, target selection, split size, run button.
        with gr.Column(scale=1):
            file = gr.File(label="Upload CSV", file_types=[".csv"])
            example = gr.Dropdown(
                choices=list(EXAMPLE_DATASETS.keys()),
                label="…or pick a built-in example",
                value=None,
            )
            # Starts empty and disabled; load_csv fills it with the CSV's columns.
            target = gr.Dropdown(label="Target column", choices=[], interactive=False)
            test_frac = gr.Slider(
                0.1, 0.5, value=0.2, step=0.05,
                label="Test fraction (held-out for evaluation)",
            )
            run = gr.Button("Predict", variant="primary", size="lg")
            status = gr.Markdown("_Upload a CSV (or pick an example below) to start._")

        # Right column: markdown summary plus the table of predictions.
        with gr.Column(scale=2):
            summary = gr.Markdown(
                "_Predictions will appear here._"
            )
            results = gr.Dataframe(
                label="Predictions (first 50 rows of test split)",
                interactive=False,
                wrap=True,
            )

    # Both data sources feed the same three outputs: stored frame, target
    # dropdown, and status line.
    file.change(load_csv, inputs=[file], outputs=[df_state, target, status])
    example.change(load_example, inputs=[example], outputs=[df_state, target, status])
    # The button runs the fit/predict/score pipeline on the stored frame.
    run.click(
        run_prediction,
        inputs=[df_state, target, test_frac],
        outputs=[summary, results],
    )

    gr.Markdown(FOOTER)


if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces. The port comes from
    # the PORT environment variable and falls back to 7860 when it is unset.
    listen_port = int(os.environ.get("PORT", 7860))
    queued_app = demo.queue()
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=listen_port,
        show_error=True,
    )