File size: 7,420 Bytes
9e6a0bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2c3b11
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
PredictLM Playground — Gradio demo for predictlm-mini-13m.

Upload a CSV → pick target column → get predictions on a held-out split.
Single-model fast path (no Duo, no TTT). For the full recipe behind the
reported OpenML averages (0.751 accuracy / 0.609 R²), see
`pip install predictlm`.
"""

import os

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

from predictlm import PredictLM

# Load the model once, at import time, so every request shares one instance.
# The Mini checkpoint is about 54MB and is cached after it has been fetched.
# NOTE(review): the previous comment said the download happens on the first
# request after a cold start; because `from_pretrained` is called here at
# module level, it presumably happens during startup instead — confirm
# against the predictlm package.
print("Loading predictlm-mini-13m (single-model mode for speed)...")
MODEL = PredictLM.from_pretrained(
    "zerooneresearch/predictlm-mini-13m",
    device="cpu",
    # Presumably turns off the "Duo" ensemble mentioned in the module
    # docstring — verify against the predictlm documentation.
    auto_duo=False,
)
print("Model loaded.")


# Row cap applied by `load_csv`: larger CSVs are randomly down-sampled to this
# many rows (fixed seed) before being stored.
MAX_ROWS = 1100
# Upper bound on numeric feature columns; `run_prediction` refuses wider tables.
MAX_FEATURES = 128
# Dropdown label -> CSV path for the built-in examples. The paths are relative,
# so they resolve against the process working directory.
EXAMPLE_DATASETS = {
    "Breast cancer (classification, 569 rows × 30 features)": "examples/breast_cancer.csv",
    "California housing (regression, 1000 rows × 8 features)": "examples/california_housing.csv",
}


def load_csv(file) -> tuple:
    """Read a CSV and prepare the target-column dropdown for it.

    Args:
        file: The value delivered by the ``gr.File`` change event, or a path
            string passed in by ``load_example``. It is forwarded unchanged to
            ``pandas.read_csv``. ``None`` means nothing is selected.

    Returns:
        A 3-tuple ``(df, dropdown, status)``:

        * ``df`` — the loaded DataFrame (down-sampled to ``MAX_ROWS`` rows when
          larger), or ``None`` when nothing could be loaded.
        * ``dropdown`` — a ``gr.Dropdown`` update listing the columns, with the
          last column preselected; empty and non-interactive when there is no
          data.
        * ``status`` — a Markdown status message for the user.
    """
    if file is None:
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            "_Upload a CSV (or pick an example below) to start._",
        )
    try:
        df = pd.read_csv(file)
    except Exception as e:  # UI boundary: report any parse/IO failure to the user
        # Disable the dropdown explicitly. Without `interactive=False` a failed
        # upload that follows a successful one would leave the (now empty)
        # target selector enabled, unlike the "no file" state above.
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"❌ Could not read CSV: {e}",
        )

    if len(df) > MAX_ROWS:
        # Fixed seed so the same upload always yields the same subsample.
        df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
        sample_note = f" (sampled to {MAX_ROWS} rows for speed)"
    else:
        sample_note = ""

    cols = list(df.columns)
    return (
        df,
        gr.Dropdown(choices=cols, value=cols[-1], interactive=True),
        f"✅ Loaded {len(df)} rows × {len(cols)} columns{sample_note}. "
        f"Default target is the last column — change it if needed.",
    )


def load_example(name):
    """Load one of the built-in example datasets by its dropdown label.

    Args:
        name: A key of ``EXAMPLE_DATASETS``, or ``None``/empty when the example
            dropdown has no selection.

    Returns:
        The same ``(df, dropdown, status)`` 3-tuple as ``load_csv``. When the
        example cannot be loaded, ``df`` is ``None`` and the dropdown is empty
        and non-interactive.
    """
    if not name:
        # No selection is not an error: fall back to the neutral "nothing
        # loaded" state instead of reporting "Example file not found: None".
        return load_csv(None)

    path = EXAMPLE_DATASETS.get(name)
    if not path:
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"Example not found: {name}",
        )
    if not os.path.exists(path):
        return (
            None,
            gr.Dropdown(choices=[], value=None, interactive=False),
            f"Example file not found: {path}",
        )
    return load_csv(path)


def run_prediction(df, target_col, test_frac):
    """Fit the shared model on a train split and score a held-out split.

    Args:
        df: The loaded DataFrame, or ``None`` when nothing has been loaded.
        target_col: Name of the column to predict, or ``None``.
        test_frac: Fraction of the labelled rows to hold out for evaluation;
            at least 5 rows are always held out.

    Returns:
        ``(summary_markdown, table)``. When validation or the model call
        fails, ``table`` is ``None`` and the summary carries the message.
        Otherwise ``table`` holds the first 50 held-out rows with actual and
        predicted values plus a per-row ``error`` (regression) or ``correct``
        (classification) column.
    """
    # ---- validation -------------------------------------------------------
    if df is None or target_col is None:
        return "_Load a CSV first._", None
    if target_col not in df.columns:
        return f"❌ Target column **{target_col}** not in CSV.", None

    # Rows without a target value can neither be trained on nor scored.
    labelled = df.dropna(subset=[target_col]).copy()
    total_rows = len(labelled)
    if total_rows < 20:
        return (
            f"❌ Need at least 20 rows after dropping NA target. Got {total_rows}.",
            None,
        )

    # Features are every numeric column other than the target.
    is_numeric = pd.api.types.is_numeric_dtype
    numeric_feats = [
        col
        for col in labelled.columns
        if col != target_col and is_numeric(labelled[col])
    ]
    if not numeric_feats:
        no_features_msg = (
            "❌ No numeric feature columns found. PredictLM v1 expects numeric features "
            "(encode categoricals first)."
        )
        return no_features_msg, None

    n_features = len(numeric_feats)
    if n_features > MAX_FEATURES:
        too_wide_msg = (
            f"❌ PredictLM v1 supports ≤{MAX_FEATURES} features. CSV has "
            f"{n_features} numeric features."
        )
        return too_wide_msg, None

    # ---- split ------------------------------------------------------------
    n_test = max(5, int(total_rows * test_frac))
    n_train = total_rows - n_test

    # Seeded shuffle: the head of the shuffled frame trains, the tail tests.
    shuffled = labelled.sample(frac=1, random_state=42).reset_index(drop=True)
    fit_rows = shuffled.iloc[:n_train]
    eval_rows = shuffled.iloc[n_train:]

    X_train = fit_rows[numeric_feats].values.astype(np.float32)
    y_train = fit_rows[target_col].values
    X_test = eval_rows[numeric_feats].values.astype(np.float32)
    y_test = eval_rows[target_col].values

    # ---- model ------------------------------------------------------------
    try:
        fitted = MODEL.fit(X_train, y_train)
        preds = fitted.predict(X_test)
    except Exception as exc:
        return f"❌ Prediction error: {exc}", None

    result = pd.DataFrame({"actual": y_test, "predicted": preds})

    # ---- scoring ----------------------------------------------------------
    # Heuristic: a numeric target with more than 10 distinct values is scored
    # as regression; everything else is scored as classification.
    target_values = shuffled[target_col]
    if is_numeric(target_values) and target_values.nunique() > 10:
        r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        result["error"] = (result["actual"] - result["predicted"]).round(4)
        summary = "\n\n".join(
            [
                f"**Regression** · {n_features} features · n_train = {n_train} · "
                f"n_test = {n_test}",
                f"R² = **{r2:.3f}** · MAE = **{mae:.3f}**",
                "_Single-model fast path. Full Duo + TTT recipe averages 0.609 R² across_ "
                "_25 OpenML regression datasets._",
            ]
        )
        return summary, result.head(50)

    acc = accuracy_score(y_test, preds)
    result["correct"] = result["actual"] == result["predicted"]
    n_classes = pd.Series(y_test).nunique()
    summary = "\n\n".join(
        [
            f"**Classification** · {n_features} features · {n_classes} classes · "
            f"n_train = {n_train} · n_test = {n_test}",
            f"Accuracy = **{acc:.3f}**",
            "_Single-model fast path. Full Duo + TTT recipe averages 0.751 accuracy_ "
            "_across 25 OpenML classification datasets._",
        ]
    )
    return summary, result.head(50)


# Markdown rendered at the top of the page (passed to `gr.Markdown` when the
# layout is built).
HEADER = """
# PredictLM Playground

Upload a CSV, pick a target column, and run **predictlm-mini-13m** on it — a 13M-parameter open-weight tabular foundation model, Apache-2.0.

> **Note**: This Space runs single-model fast-path (no Duo + TTT) for snappy responses. Local Python with `pip install predictlm` gets the full 0.751 / 0.609 OpenML numbers.
"""

# Markdown rendered at the bottom of the page: project links and attribution.
FOOTER = """
---

[Model card](https://huggingface.co/zerooneresearch/predictlm-mini-13m) · [PyPI `pip install predictlm`](https://pypi.org/project/predictlm/) · [Source on GitHub](https://github.com/matej-01RAI/predictlm-mcp) · [Org](https://huggingface.co/zerooneresearch)

PredictLM is built by [Zero One Research](https://huggingface.co/zerooneresearch), an independent AI lab in Bratislava, EU.
"""


# UI definition: a two-column layout with the controls on the left and the
# results on the right, followed by the event wiring.
with gr.Blocks(title="PredictLM Playground", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)

    # Holds the currently loaded DataFrame between events (None until a CSV or
    # example has been loaded successfully).
    df_state = gr.State(None)

    with gr.Row():
        # Left column: data source, target selection and run controls.
        with gr.Column(scale=1):
            file = gr.File(label="Upload CSV", file_types=[".csv"])
            example = gr.Dropdown(
                choices=list(EXAMPLE_DATASETS.keys()),
                label="…or pick a built-in example",
                value=None,
            )
            # Starts empty and disabled; the load handlers fill it with the
            # CSV's column names.
            target = gr.Dropdown(label="Target column", choices=[], interactive=False)
            test_frac = gr.Slider(
                0.1, 0.5, value=0.2, step=0.05,
                label="Test fraction (held-out for evaluation)",
            )
            run = gr.Button("Predict", variant="primary", size="lg")
            status = gr.Markdown("_Upload a CSV (or pick an example below) to start._")

        # Right column: metric summary and the per-row prediction table.
        with gr.Column(scale=2):
            summary = gr.Markdown(
                "_Predictions will appear here._"
            )
            results = gr.Dataframe(
                label="Predictions (first 50 rows of test split)",
                interactive=False,
                wrap=True,
            )

    # Both data sources write to the same three outputs: the stored DataFrame,
    # the target dropdown and the status line.
    file.change(load_csv, inputs=[file], outputs=[df_state, target, status])
    example.change(load_example, inputs=[example], outputs=[df_state, target, status])
    run.click(
        run_prediction,
        inputs=[df_state, target, test_frac],
        outputs=[summary, results],
    )

    gr.Markdown(FOOTER)


if __name__ == "__main__":
    # Enable request queuing, then serve on all interfaces. The port comes
    # from the PORT environment variable and falls back to 7860.
    app = demo.queue()
    listen_port = int(os.environ.get("PORT", 7860))
    app.launch(server_name="0.0.0.0", server_port=listen_port, show_error=True)