Spaces:

zerooneresearch
/

predictlm-playground

Sleeping

App Files Files Community

predictlm-playground / app.py

01RAI

Upgrade to gradio 5.4.0 + explicit launch config for HF Spaces

f2c3b11 verified 1 day ago

raw

history blame contribute delete

7.42 kB

	"""
	PredictLM Playground — Gradio demo for predictlm-mini-13m.

	Upload a CSV → pick target column → get predictions on a held-out split.
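	Single-model fast path (no Duo, no TTT). For the full Duo + TTT recipe
	(0.751 average accuracy / 0.609 average R² on the OpenML benchmark datasets),
	install the package locally with `pip install predictlm`.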
	"""

	import os

	import gradio as gr
	import numpy as np
	import pandas as pd
	from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

	from predictlm import PredictLM

	# Load the model once, at import time, so every request handled by this
	# process reuses the same MODEL instance (run_prediction calls MODEL.fit).
	# The original note gives the Mini checkpoint as 54MB and says it is cached
	# after the first download — presumably from_pretrained fetches it here on a
	# cold start; verify against the predictlm documentation.
	# auto_duo=False selects the single-model mode announced in the log line below.
	print("Loading predictlm-mini-13m (single-model mode for speed)...")
	MODEL = PredictLM.from_pretrained(
	"zerooneresearch/predictlm-mini-13m",
	device="cpu",
	auto_duo=False,
	)
	print("Model loaded.")


	# Row cap applied by load_csv: larger CSVs are randomly down-sampled to this
	# many rows (fixed seed) before being stored.
	MAX_ROWS = 1100
	# Upper bound on numeric feature columns; run_prediction rejects CSVs above it.
	MAX_FEATURES = 128
	# Built-in examples: dropdown label -> CSV path. load_example checks that the
	# path exists before loading it, so the paths are resolved relative to the
	# process working directory.
	EXAMPLE_DATASETS = {
	"Breast cancer (classification, 569 rows × 30 features)": "examples/breast_cancer.csv",
	"California housing (regression, 1000 rows × 8 features)": "examples/california_housing.csv",
	}


	def load_csv(file) -> tuple:
	    """Read a CSV and prepare the target-column picker.

	    Args:
	        file: Value of the upload component, or a path string (``load_example``
	            passes a path). ``None`` means nothing is selected.

	    Returns:
	        A 3-tuple ``(df, target_dropdown, status_markdown)``:

	        * ``df`` is the loaded ``DataFrame`` (down-sampled to ``MAX_ROWS`` rows
	          when larger), or ``None`` when nothing could be loaded.
	        * ``target_dropdown`` is a ``gr.Dropdown`` update. On success it lists
	          every column and preselects the last one; otherwise it is emptied
	          and disabled.
	        * ``status_markdown`` is a short message for the status area.
	    """
	    if file is None:
	        return (
	            None,
	            gr.Dropdown(choices=[], value=None, interactive=False),
	            "_Upload a CSV (or pick an example below) to start._",
	        )

	    try:
	        df = pd.read_csv(file)
	    except Exception as e:
	        # UI boundary: any parse/IO failure is reported to the user instead of
	        # raising. The picker is disabled explicitly so a failed load cannot
	        # leave an enabled dropdown with no choices behind (this matches the
	        # no-file branch above).
	        return (
	            None,
	            gr.Dropdown(choices=[], value=None, interactive=False),
	            f"❌ Could not read CSV: {e}",
	        )

	    sample_note = ""
	    if len(df) > MAX_ROWS:
	        # Fixed seed keeps the down-sampled subset reproducible across loads.
	        df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
	        sample_note = f" (sampled to {MAX_ROWS} rows for speed)"

	    cols = list(df.columns)
	    return (
	        df,
	        gr.Dropdown(choices=cols, value=cols[-1], interactive=True),
	        f"✅ Loaded {len(df)} rows × {len(cols)} columns{sample_note}. "
	        f"Default target is the last column — change it if needed.",
	    )


	def load_example(name):
	    """Load one of the built-in example datasets by its dropdown label.

	    Args:
	        name: A key of ``EXAMPLE_DATASETS`` (``None`` or an unknown label is
	            treated as "not found").

	    Returns:
	        The same ``(df, target_dropdown, status_markdown)`` tuple as
	        ``load_csv``. When the label is unknown or its file is missing, ``df``
	        is ``None`` and the target dropdown is emptied and disabled.
	    """
	    path = EXAMPLE_DATASETS.get(name)
	    if not path or not os.path.exists(path):
	        # Disable the picker explicitly so a previously enabled dropdown does
	        # not stay active with no choices after a failed example load.
	        return (
	            None,
	            gr.Dropdown(choices=[], value=None, interactive=False),
	            f"Example file not found: {path}",
	        )
	    return load_csv(path)


	def run_prediction(df, target_col, test_frac):
	    """Fit the shared model on a train split and score a held-out split.

	    Args:
	        df: DataFrame stored by ``load_csv`` (``None`` when nothing is loaded).
	        target_col: Name of the column to predict.
	        test_frac: Fraction of rows held out for evaluation.

	    Returns:
	        A 2-tuple ``(summary_markdown, result_frame)``. On a validation or
	        prediction failure the first element is the message to show and the
	        second is ``None``. On success ``result_frame`` holds the first 50
	        test rows with ``actual`` and ``predicted`` columns, plus ``error``
	        (regression) or ``correct`` (classification).

	    The task type is a heuristic: a numeric target with more than 10 distinct
	    values is scored as regression, anything else as classification.
	    """
	    if df is None or target_col is None:
	        return "_Load a CSV first._", None
	    if target_col not in df.columns:
	        return f"❌ Target column {target_col} not in CSV.", None

	    # Rows without a target value can be neither trained on nor scored.
	    df = df.dropna(subset=[target_col]).copy()
	    n = len(df)
	    if n < 20:
	        return f"❌ Need at least 20 rows after dropping NA target. Got {n}.", None

	    # Only numeric columns are passed to the model; others are ignored.
	    feature_cols = [c for c in df.columns if c != target_col]
	    numeric_feats = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]

	    if not numeric_feats:
	        return (
	            "❌ No numeric feature columns found. PredictLM v1 expects numeric features "
	            "(encode categoricals first).",
	            None,
	        )

	    if len(numeric_feats) > MAX_FEATURES:
	        return (
	            f"❌ PredictLM v1 supports ≤{MAX_FEATURES} features. CSV has "
	            f"{len(numeric_feats)} numeric features.",
	            None,
	        )

	    # Hold out at least 5 rows, whatever the requested fraction.
	    test_n = max(5, int(n * test_frac))
	    train_n = n - test_n

	    # Shuffle with a fixed seed so the split is reproducible, then cut it.
	    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
	    train_df = df.iloc[:train_n]
	    test_df = df.iloc[train_n:]

	    X_train = train_df[numeric_feats].values.astype(np.float32)
	    y_train = train_df[target_col].values
	    X_test = test_df[numeric_feats].values.astype(np.float32)
	    y_test = test_df[target_col].values

	    try:
	        preds = MODEL.fit(X_train, y_train).predict(X_test)
	    except Exception as e:
	        # UI boundary: report model failures in the summary area.
	        return f"❌ Prediction error: {e}", None

	    result = pd.DataFrame(
	        {
	            "actual": y_test,
	            "predicted": preds,
	        }
	    )

	    target_values = df[target_col]
	    if pd.api.types.is_numeric_dtype(target_values) and target_values.nunique() > 10:
	        # Regression
	        r2 = r2_score(y_test, preds)
	        mae = mean_absolute_error(y_test, preds)
	        result["error"] = (result["actual"] - result["predicted"]).round(4)
	        summary = (
	            f"Regression · {len(numeric_feats)} features · n_train = {train_n} · "
	            f"n_test = {test_n}\n\n"
	            f"R² = {r2:.3f} · MAE = {mae:.3f}\n\n"
	            "_Single-model fast path. Full Duo + TTT recipe averages 0.609 R² across_ "
	            "_25 OpenML regression datasets._"
	        )
	    else:
	        # Classification
	        acc = accuracy_score(y_test, preds)
	        result["correct"] = result["actual"] == result["predicted"]
	        # Count classes over the whole (NA-dropped) target column. Counting
	        # only the held-out rows under-reports the number of classes whenever
	        # a rare class does not land in the test split.
	        n_classes = target_values.nunique()
	        summary = (
	            f"Classification · {len(numeric_feats)} features · {n_classes} classes · "
	            f"n_train = {train_n} · n_test = {test_n}\n\n"
	            f"Accuracy = {acc:.3f}\n\n"
	            "_Single-model fast path. Full Duo + TTT recipe averages 0.751 accuracy_ "
	            "_across 25 OpenML classification datasets._"
	        )

	    # Only the first 50 test rows are shown in the results table.
	    return summary, result.head(50)


	# Markdown shown by the first gr.Markdown component created in the UI.
	HEADER = """
	# PredictLM Playground

	Upload a CSV, pick a target column, and run predictlm-mini-13m on it — a 13M-parameter open-weight tabular foundation model, Apache-2.0.

	> Note: This Space runs single-model fast-path (no Duo + TTT) for snappy responses. Local Python with `pip install predictlm` gets the full 0.751 / 0.609 OpenML numbers.
	"""

	# Markdown shown by the last gr.Markdown component created in the UI
	# (project links and attribution).
	FOOTER = """
	---

	[Model card](https://huggingface.co/zerooneresearch/predictlm-mini-13m) · [PyPI `pip install predictlm`](https://pypi.org/project/predictlm/) · [Source on GitHub](https://github.com/matej-01RAI/predictlm-mcp) · [Org](https://huggingface.co/zerooneresearch)

	PredictLM is built by [Zero One Research](https://huggingface.co/zerooneresearch), an independent AI lab in Bratislava, EU.
	"""


	# Build the UI and bind it to `demo`, which the __main__ guard launches.
	# NOTE(review): nesting indentation was lost in this copy of the file; the
	# layout comments below assume the conventional structure (one Row holding a
	# controls Column and a results Column) — confirm against the real app.py.
	with gr.Blocks(title="PredictLM Playground", theme=gr.themes.Soft()) as demo:
	gr.Markdown(HEADER)

	# Carries the loaded DataFrame between events: written by load_csv /
	# load_example, read by run_prediction.
	df_state = gr.State(None)

	with gr.Row():
	# Controls: data source, target column, split size, run button, status text.
	with gr.Column(scale=1):
	file = gr.File(label="Upload CSV", file_types=[".csv"])
	example = gr.Dropdown(
	choices=list(EXAMPLE_DATASETS.keys()),
	label="…or pick a built-in example",
	value=None,
	)
	# Starts empty and disabled; the load handlers fill in the column names.
	target = gr.Dropdown(label="Target column", choices=[], interactive=False)
	test_frac = gr.Slider(
	0.1, 0.5, value=0.2, step=0.05,
	label="Test fraction (held-out for evaluation)",
	)
	run = gr.Button("Predict", variant="primary", size="lg")
	status = gr.Markdown("_Upload a CSV (or pick an example below) to start._")

	# Results: metric summary plus a table of per-row predictions.
	with gr.Column(scale=2):
	summary = gr.Markdown(
	"_Predictions will appear here._"
	)
	results = gr.Dataframe(
	label="Predictions (first 50 rows of test split)",
	interactive=False,
	wrap=True,
	)

	# Event wiring. Both load paths update the same three outputs, so uploading
	# a file and picking an example overwrite each other's state.
	file.change(load_csv, inputs=[file], outputs=[df_state, target, status])
	example.change(load_example, inputs=[example], outputs=[df_state, target, status])
	run.click(
	run_prediction,
	inputs=[df_state, target, test_frac],
	outputs=[summary, results],
	)

	gr.Markdown(FOOTER)


	if __name__ == "__main__":
	    # Script entry point: enable the request queue, then serve on all
	    # interfaces. The port comes from the PORT environment variable and
	    # falls back to 7860 when it is unset.
	    queued_demo = demo.queue()
	    listen_port = int(os.environ.get("PORT", 7860))
	    queued_demo.launch(
	        show_error=True,
	        server_port=listen_port,
	        server_name="0.0.0.0",
	    )