"""
SAP RPT-1 Benchmarking Demo
============================
Self-contained demo: runs XGBoost, LightGBM, CatBoost, and SAP RPT-1 (simulated)
on classic sklearn datasets (Iris, Breast Cancer, Diabetes regression) using
5-fold cross-validation. Saves JSON results and a beautiful HTML report.

Run from repo root:
    python scripts/demo_benchmark.py
"""

import html
import json
import math
import os
import sys
import time
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_absolute_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

RESULTS_DIR = Path(__file__).parent.parent / "results" / "raw"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

N_FOLDS = 5
RANDOM_STATE = 42


# ─────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────

def timer():
    """Return the current ``time.perf_counter()`` reading (seconds)."""
    return time.perf_counter()


def load_datasets():
    """Load the three demo datasets.

    Returns:
        dict mapping dataset name to a dict with keys ``"X"`` (feature
        DataFrame), ``"y"`` (target Series), ``"task"`` (``"classification"``
        or ``"regression"``) and ``"desc"`` (human-readable description).
    """
    datasets = {}

    # 1. Iris (multi-class classification)
    d = load_iris(as_frame=True)
    datasets["iris"] = {
        "X": d.data,
        "y": d.target,
        "task": "classification",
        "desc": "Iris flower species (3 classes, 150 rows, 4 features)",
    }

    # 2. Breast Cancer (binary classification)
    d = load_breast_cancer(as_frame=True)
    datasets["breast_cancer"] = {
        "X": d.data,
        "y": d.target,
        "task": "classification",
        "desc": "Wisconsin Breast Cancer (binary, 569 rows, 30 features)",
    }

    # 3. Diabetes (regression)
    d = load_diabetes(as_frame=True)
    datasets["diabetes"] = {
        "X": d.data,
        "y": d.target,
        "task": "regression",
        "desc": "Diabetes progression (regression, 442 rows, 10 features)",
    }

    return datasets


# ─────────────────────────────────────────────
# Model builders
# ─────────────────────────────────────────────

def build_xgboost(task):
    """Return an unfitted XGBoost classifier or regressor for ``task``."""
    import xgboost as xgb

    if task == "classification":
        return xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
            use_label_encoder=False,
            eval_metric="logloss",
            verbosity=0,
        )
    return xgb.XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
        verbosity=0,
    )


def build_lightgbm(task):
    """Return an unfitted LightGBM classifier or regressor for ``task``."""
    import lightgbm as lgb

    if task == "classification":
        return lgb.LGBMClassifier(
            n_estimators=100,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
            verbose=-1,
        )
    return lgb.LGBMRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
        verbose=-1,
    )


def build_catboost(task):
    """Return an unfitted CatBoost classifier or regressor for ``task``."""
    from catboost import CatBoostClassifier, CatBoostRegressor

    if task == "classification":
        return CatBoostClassifier(
            iterations=100,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
            verbose=False,
        )
    return CatBoostRegressor(
        iterations=100,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
        verbose=False,
    )


class SAPSimulator:
    """
    SAP RPT-1 Simulator.

    Mimics SAP RPT-1's in-context learning behaviour using a fast k-NN
    retrieval backbone (conceptually similar to how RPT-1 retrieves nearest
    context rows and predicts via its pretrained head).

    NOTE: This is a *demonstration substitute* for the real SAP RPT-1 OSS
    model which requires a gated HuggingFace token + pip install of the
    SAP-samples package. The real wrapper is in
    code/models/sap_rpt1_hf_wrapper.py.
    """

    def __init__(self, task, k=15):
        """Create the k-NN backbone for ``task`` with ``k`` neighbours."""
        self.task = task
        self.k = k
        if task == "classification":
            self.model = KNeighborsClassifier(n_neighbors=k)
        else:
            self.model = KNeighborsRegressor(n_neighbors=k)
        # Labels are encoded for the classifier and decoded again in predict().
        self.le = LabelEncoder() if task == "classification" else None

    def fit(self, X, y):
        """Fit the underlying k-NN model and return ``self``."""
        if self.task == "classification":
            y_enc = self.le.fit_transform(y)
            self.model.fit(X, y_enc)
        else:
            self.model.fit(X, y)
        return self

    def predict(self, X):
        """Predict targets; class predictions are mapped back to the original labels."""
        preds = self.model.predict(X)
        if self.task == "classification":
            return self.le.inverse_transform(preds)
        return preds

    def predict_proba(self, X):
        """Return the k-NN classifier's class-probability estimates."""
        return self.model.predict_proba(X)


# Display name -> factory taking the task string and returning an unfitted model.
MODELS = {
    "XGBoost": build_xgboost,
    "LightGBM": build_lightgbm,
    "CatBoost": build_catboost,
    "SAP-RPT1 (sim)": SAPSimulator,
}


# ─────────────────────────────────────────────
# Evaluation
# ─────────────────────────────────────────────

def eval_fold_classification(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on one fold and score it on the validation split.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``roc_auc`` (NaN when it cannot
        be computed), ``fit_time`` and ``pred_time`` (seconds).
    """
    t0 = timer()
    model.fit(X_train, y_train)
    fit_time = timer() - t0

    t0 = timer()
    y_pred = model.predict(X_val)
    pred_time = timer() - t0

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

    try:
        proba = np.asarray(model.predict_proba(X_val))
        # Branch on the number of probability columns (classes the model was
        # trained on), not on the classes that happen to appear in this
        # validation fold: a fold showing only 2 of 3 classes must not be
        # scored as a binary problem against column 1.
        if proba.shape[1] == 2:
            auc = roc_auc_score(y_val, proba[:, 1])
        else:
            auc = roc_auc_score(y_val, proba, multi_class="ovr", average="macro")
    except Exception:
        # Best effort: AUC is optional, the other metrics are still reported.
        auc = float("nan")

    return {
        "accuracy": acc,
        "f1_macro": f1,
        "roc_auc": auc,
        "fit_time": fit_time,
        "pred_time": pred_time,
    }


def eval_fold_regression(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on one fold and return ``r2``, ``mae`` and timings."""
    t0 = timer()
    model.fit(X_train, y_train)
    fit_time = timer() - t0

    t0 = timer()
    y_pred = model.predict(X_val)
    pred_time = timer() - t0

    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    return {"r2": r2, "mae": mae, "fit_time": fit_time, "pred_time": pred_time}


def run_cv(model_fn, dataset_name, ds):
    """Cross-validate one model factory on one dataset.

    Args:
        model_fn: callable taking the task string and returning a fresh model.
        dataset_name: dataset key; accepted for interface compatibility and
            not used by the computation.
        ds: dataset dict as produced by ``load_datasets``.

    Returns:
        dict with ``"mean"`` and ``"std"`` (per-metric aggregates over folds)
        and ``"folds"`` (the list of per-fold metric dicts).
    """
    X, y, task = ds["X"], ds["y"], ds["task"]
    is_classification = task == "classification"

    if is_classification:
        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        splits = list(cv.split(X, y))
    else:
        cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        splits = list(cv.split(X))

    eval_fold = eval_fold_classification if is_classification else eval_fold_regression

    fold_results = []
    for train_idx, val_idx in splits:
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # A fresh model per fold so no state leaks between folds.
        model = model_fn(task)
        fold_results.append(eval_fold(model, X_tr, y_tr, X_val, y_val))

    df = pd.DataFrame(fold_results)
    return {"mean": df.mean().to_dict(), "std": df.std().to_dict(), "folds": fold_results}


def _primary_metric(mean_metrics, task):
    """Pick the headline metric for a result: ``(key, label, value)``.

    Regression reports R². Classification reports ROC-AUC and falls back to
    accuracy when the ROC-AUC mean is missing or NaN (the key is always
    present in fold results, so a plain ``dict.get`` default never triggers).
    """
    if task != "classification":
        return "r2", "R²", mean_metrics.get("r2", 0)
    auc = mean_metrics.get("roc_auc")
    if auc is not None and not math.isnan(auc):
        return "roc_auc", "ROC-AUC", auc
    return "accuracy", "Accuracy", mean_metrics.get("accuracy", 0)


# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────

def main():
    """Run every model on every dataset, then write the JSON and HTML outputs.

    Returns:
        tuple ``(all_results, html_path)``.
    """
    print("\n" + "=" * 65)
    print(" SAP RPT-1 Benchmarking Demo")
    print(f" Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 65)

    datasets = load_datasets()
    all_results = {}

    for ds_name, ds in datasets.items():
        print(f"\n[DATASET] {ds_name} ({ds['desc']})")
        all_results[ds_name] = {"task": ds["task"], "models": {}}

        for model_name, model_fn in MODELS.items():
            # One failing model (e.g. a missing optional package) must not
            # abort the whole benchmark; the error is recorded instead.
            try:
                print(f" >> Running {model_name}...", end=" ", flush=True)
                t_total = timer()
                cv_res = run_cv(model_fn, ds_name, ds)
                elapsed = timer() - t_total
                all_results[ds_name]["models"][model_name] = cv_res

                _, label, primary = _primary_metric(cv_res["mean"], ds["task"])
                print(f"{label}={primary:.4f} ({elapsed:.1f}s)")
            except Exception as e:
                print(f" ✗ FAILED: {e}")
                all_results[ds_name]["models"][model_name] = {"error": str(e)}

    # Save JSON
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_path = RESULTS_DIR / f"demo_results_{ts}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\n[OK] JSON saved -> {json_path}")

    # Generate HTML dashboard
    html_path = Path(__file__).parent.parent / "results" / f"demo_dashboard_{ts}.html"
    generate_html(all_results, html_path, ts)
    print(f"[OK] HTML dashboard -> {html_path}")
    print("\nOpen the HTML file in your browser to see the results!\n")

    return all_results, html_path


# ─────────────────────────────────────────────
# HTML Report Generator
# ─────────────────────────────────────────────

def color_for_metric(val, task):
    """Return a CSS color class based on metric value."""
    if task == "classification":
        # ROC-AUC or Accuracy
        if val >= 0.97:
            return "excellent"
        if val >= 0.92:
            return "good"
        if val >= 0.85:
            return "fair"
        return "poor"
    # R²
    if val >= 0.55:
        return "excellent"
    if val >= 0.40:
        return "good"
    if val >= 0.20:
        return "fair"
    return "poor"


def _fmt_metric(v):
    """Format a float metric with four decimals; anything else renders as '-'."""
    return f"{v:.4f}" if isinstance(v, float) else "-"


# NOTE(review): the original stylesheet and chart script were not recoverable
# from the source; the CSS and JS below are a reconstruction that renders the
# same sections. Confirm against the intended design.
_PAGE_CSS = """
body { font-family: system-ui, sans-serif; margin: 0; padding: 2rem; background: #0f172a; color: #e2e8f0; }
h1, h2, h3 { margin: 0 0 .5rem; }
section { margin-top: 2rem; }
.meta span { display: inline-block; margin-right: 1rem; color: #94a3b8; }
.note { margin-top: 1rem; padding: 1rem; border-left: 4px solid #ec4899; background: #1e293b; }
.cards { display: flex; flex-wrap: wrap; gap: 1rem; }
.card { padding: 1rem 1.5rem; border-radius: .5rem; background: #1e293b; }
.card-value { font-size: 1.6rem; font-weight: 700; }
.card-sub { color: #94a3b8; }
.chart { margin-bottom: 1.5rem; }
.bar-row { display: flex; align-items: center; gap: .75rem; margin: .25rem 0; }
.bar-label { width: 10rem; }
.bar-track { flex: 1; height: 1rem; border-radius: .5rem; background: #334155; overflow: hidden; }
.bar { display: block; height: 100%; }
.bar-value { width: 11rem; text-align: right; font-variant-numeric: tabular-nums; }
table { width: 100%; border-collapse: collapse; }
th, td { padding: .5rem .75rem; text-align: left; border-bottom: 1px solid #334155; }
.dot { display: inline-block; width: .7rem; height: .7rem; margin-right: .4rem; border-radius: 50%; }
tr.excellent td:first-child { border-left: 4px solid #10b981; }
tr.good td:first-child { border-left: 4px solid #3b82f6; }
tr.fair td:first-child { border-left: 4px solid #f59e0b; }
tr.poor td:first-child { border-left: 4px solid #ef4444; }
tr.error td { color: #f87171; }
.hint { color: #94a3b8; }
"""

_PAGE_JS = """
(function () {
  function el(tag, cls, text) {
    var node = document.createElement(tag);
    if (cls) { node.className = cls; }
    if (text !== undefined) { node.textContent = text; }
    return node;
  }
  var summary = document.getElementById("summary");
  var charts = document.getElementById("charts");
  Object.keys(CHART_DATA).forEach(function (dsName) {
    var ds = CHART_DATA[dsName];
    var names = Object.keys(ds.models);
    if (!names.length) { return; }

    var best = names.reduce(function (a, b) {
      return ds.models[b].val > ds.models[a].val ? b : a;
    });
    var card = el("div", "card");
    card.appendChild(el("div", "card-title", dsName));
    card.appendChild(el("div", "card-value", ds.models[best].val.toFixed(4)));
    card.appendChild(el("div", "card-sub", "Best: " + best + " (" + ds.models[best].metric + ")"));
    summary.appendChild(card);

    var chart = el("div", "chart");
    chart.appendChild(el("h3", "", dsName + " (" + ds.task + ")"));
    names.forEach(function (name) {
      var m = ds.models[name];
      var row = el("div", "bar-row");
      row.appendChild(el("span", "bar-label", name));
      var track = el("span", "bar-track");
      var bar = el("span", "bar");
      bar.style.width = (Math.max(0, Math.min(1, m.val)) * 100) + "%";
      bar.style.background = MODEL_COLORS[name] || "#888";
      track.appendChild(bar);
      row.appendChild(track);
      row.appendChild(el("span", "bar-value", m.val.toFixed(4) + " ± " + m.std.toFixed(4)));
      chart.appendChild(row);
    });
    charts.appendChild(chart);
  });
})();
"""


def generate_html(results, out_path, ts):
    """Render the benchmark results as a standalone HTML dashboard.

    Args:
        results: mapping of dataset name to ``{"task": ..., "models": {...}}``
            where each model entry is either a ``run_cv`` result or
            ``{"error": message}``.
        out_path: destination file path; written as UTF-8.
        ts: run timestamp string; accepted for interface compatibility, the
            page itself shows the time at which it was generated.
    """
    MODEL_COLORS = {
        "XGBoost": "#f59e0b",
        "LightGBM": "#10b981",
        "CatBoost": "#6366f1",
        "SAP-RPT1 (sim)": "#ec4899",
    }

    # Build chart data JSON (failed models are left out of the charts).
    chart_datasets = {}
    for ds_name, ds_data in results.items():
        task = ds_data["task"]
        models = {}
        for m_name, m_data in ds_data["models"].items():
            if "error" in m_data:
                continue
            key, label, val = _primary_metric(m_data["mean"], task)
            std = m_data["std"].get(key, 0)
            models[m_name] = {
                "val": round(float(val), 4),
                "std": round(float(std), 4),
                "metric": label,
            }
        chart_datasets[ds_name] = {"task": task, "models": models}

    # "</" is escaped so embedded data can never terminate the <script> element.
    chart_json = json.dumps(chart_datasets).replace("</", "<\\/")
    colors_json = json.dumps(MODEL_COLORS).replace("</", "<\\/")

    # Table rows; every interpolated value is HTML-escaped.
    rows = []
    for ds_name, ds_data in results.items():
        task = ds_data["task"]
        ds_cell = html.escape(str(ds_name))
        task_cell = html.escape(str(task))
        is_cls = task == "classification"

        for m_name, m_data in ds_data["models"].items():
            name_cell = html.escape(str(m_name))
            if "error" in m_data:
                message = html.escape(str(m_data["error"])[:60])
                rows.append(
                    f'<tr class="error"><td>{ds_cell}</td><td>{name_cell}</td> '
                    f'<td>{task_cell}</td><td colspan="6">ERROR: {message}</td></tr>'
                )
                continue

            mean = m_data["mean"]
            _, _, prim = _primary_metric(mean, task)
            cls = color_for_metric(prim, task)
            ft = mean.get("fit_time", 0)
            color = MODEL_COLORS.get(m_name, "#888")
            dot = f'<span class="dot" style="background:{color}"></span>'

            cells = [
                _fmt_metric(mean.get("accuracy", "-")) if is_cls else "-",
                _fmt_metric(mean.get("f1_macro", "-")) if is_cls else "-",
                _fmt_metric(mean.get("roc_auc", "-")) if is_cls else "-",
                "-" if is_cls else _fmt_metric(mean.get("r2", "-")),
                _fmt_metric(mean.get("mae", "-")) if task == "regression" else "-",
                f"{ft:.3f}s",
            ]
            metric_cells = "".join(f"<td>{c}</td>" for c in cells)
            rows.append(
                f'<tr class="{cls}"><td>{ds_cell}</td><td>{dot}{name_cell}</td> '
                f"<td>{task_cell}</td>{metric_cells}</tr>"
            )
    table_rows = "\n".join(rows)

    generated = datetime.now().strftime('%Y-%m-%d %H:%M')

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>SAP RPT-1 Benchmarking Results</title>
<style>{_PAGE_CSS}</style>
</head>
<body>
<header>
<h1>🔬 SAP RPT-1 Benchmarking</h1>
<p>Comparative evaluation of tabular machine learning models across classification and regression datasets</p>
<p class="meta"><span>Generated: {generated}</span> <span>{N_FOLDS}-Fold Cross-Validation</span> <span>Seed: {RANDOM_STATE}</span></p>
</header>
<div class="note">ℹ️ <strong>About SAP RPT-1 (sim):</strong> The real SAP RPT-1 OSS model is a Retrieval-Pretrained Transformer for tabular data available at <code>huggingface.co/SAP/sap-rpt-1-oss</code> — it requires a gated HuggingFace token and <code>pip install git+https://github.com/SAP-samples/sap-rpt-1-oss.git</code>. In this demo, <strong>SAP-RPT1 (sim)</strong> is a conceptually faithful substitute (k-NN in-context retrieval, k=15) to demonstrate the pipeline without authentication. See <code>code/models/sap_rpt1_hf_wrapper.py</code> for the real wrapper.</div>
<section>
<h2>📈 Summary Statistics</h2>
<div id="summary" class="cards"></div>
</section>
<section>
<h2>📊 Model Comparison Charts</h2>
<div id="charts"></div>
</section>
<section>
<h2>📋 Full Results Table</h2>
<h3>All Metrics (mean across {N_FOLDS} folds)</h3>
<p class="hint">↑ higher is better (except MAE)</p>
<table>
<thead>
<tr><th>Dataset</th><th>Model</th><th>Task</th> <th>Accuracy</th><th>F1-Macro</th><th>ROC-AUC</th><th>R²</th><th>MAE</th><th>Fit Time</th></tr>
</thead>
<tbody>
{table_rows}
</tbody>
</table>
</section>
<script>
const CHART_DATA = {chart_json};
const MODEL_COLORS = {colors_json};
{_PAGE_JS}
</script>
</body>
</html>
"""

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(page)


if __name__ == "__main__":
    main()