Update src/streamlit_app.py
Browse files- src/streamlit_app.py +345 -101
src/streamlit_app.py
CHANGED
|
@@ -361,115 +361,359 @@ with tabs[3]:
|
|
| 361 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 362 |
|
| 363 |
|
| 364 |
-
# ----- Ensemble + SHAP tab
|
| 365 |
with tabs[4]:
|
| 366 |
-
st.subheader("
|
| 367 |
-
|
| 368 |
-
# --- Step 1: Basic UI selections ---
|
| 369 |
-
target = st.selectbox("Target variable", numeric_cols, index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0)
|
| 370 |
-
default_features = [c for c in numeric_cols if c != target][:60]
|
| 371 |
-
features = st.multiselect("Model input features", numeric_cols, default=default_features)
|
| 372 |
-
sample_size = st.slider("Sample rows for training", 500, min(4000, df.shape[0]), 1000, step=100)
|
| 373 |
-
sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
|
| 374 |
-
X = sub_df[features].fillna(0)
|
| 375 |
-
y = sub_df[target].fillna(0)
|
| 376 |
|
| 377 |
-
# --- Step
|
| 378 |
-
st.markdown("###
|
| 379 |
-
|
| 380 |
-
"
|
| 381 |
[
|
| 382 |
-
"
|
| 383 |
-
"
|
| 384 |
-
"
|
| 385 |
-
"
|
| 386 |
-
"
|
|
|
|
|
|
|
|
|
|
| 387 |
],
|
| 388 |
-
index=
|
| 389 |
)
|
| 390 |
|
| 391 |
-
#
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
else:
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
#
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
# ----- Target & Business Impact tab
|
| 475 |
with tabs[5]:
|
|
|
|
| 361 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 362 |
|
| 363 |
|
| 364 |
+
# ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) -----
with tabs[4]:
    st.subheader(" AutoML Ensemble — Expanded Families + Stacking + SHAP")

    # --- Step 0: High-level use case ---
    # Each use case maps to a default target column and a model-family hint.
    use_case_config = {
        "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
        "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
        "Casting Quality Optimization": {
            "target": "surface_temp" if "surface_temp" in numeric_cols else "furnace_temp",
            "model_hint": "GradientBoosting",
        },
        "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
        "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
        "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
        "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
        "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
    }

    st.markdown("### Choose Industrial Use Case ")
    # The options are the config keys (insertion order), so the lookup below
    # can never miss; index=1 keeps "EAF Data Intelligence" as the default.
    use_case = st.selectbox("Select Use Case", list(use_case_config), index=1)
    cfg = use_case_config[use_case]
    target = cfg["target"]
    model_hint = cfg["model_hint"]

    # A configured target may be absent from the loaded data; selecting it
    # from `df` would then raise KeyError, so fall back to a column that exists.
    if target not in numeric_cols:
        fallback_target = "furnace_temp" if "furnace_temp" in numeric_cols else numeric_cols[0]
        st.warning(
            f"Target `{target}` is not a numeric column in the dataset; "
            f"using `{fallback_target}` instead."
        )
        target = fallback_target

    # --- Feature auto-suggestion ---
    # The target must never be offered as an input feature: it always matches
    # its own name tokens, which would leak the label into X and duplicate the
    # column in `df[features + [target]]`.
    candidate_features = [c for c in numeric_cols if c != target]
    target_tokens = [tok for tok in target.split("_") if tok]
    suggested = [c for c in candidate_features if any(tok in c for tok in target_tokens)]
    if len(suggested) < 6:
        generic_tokens = ("temp", "power", "energy", "pressure", "yield")
        suggested = [c for c in candidate_features if any(tok in c for tok in generic_tokens)]
    if len(suggested) < 6:
        suggested = candidate_features[:50]

    features = st.multiselect(
        "Model input features (auto-suggested)", candidate_features, default=suggested
    )
    st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")

    # --- Data sampling controls ---
    max_rows = min(df.shape[0], 20000)
    min_rows = min(500, max_rows)
    if max_rows > min_rows:
        sample_size = st.slider(
            "Sample rows (train speed vs fidelity)",
            min_rows,
            max_rows,
            min(1500, max_rows),
            step=100,
        )
    else:
        # Fewer rows than the slider minimum: a slider with min > max is
        # invalid, so simply use every available row.
        sample_size = max_rows
        st.caption(f"Dataset has only {max_rows} rows; using all of them.")

    # Rows without a target value carry no supervision signal, so drop them
    # instead of training on a fabricated label of 0.
    labelled = df[features + [target]].dropna(subset=[target])
    sub_df = labelled.sample(
        n=min(sample_size, len(labelled)), random_state=42
    ).reset_index(drop=True)
    X = sub_df[features].fillna(0)
    y = sub_df[target]

    # --- Ensemble control UI ---
    st.markdown("### Ensemble & AutoML Settings")
    max_trials = st.slider(
        "Optuna trials per family (total trials grow with families)", 5, 80, 20, step=5
    )
    top_k = st.slider("Max base models to keep in final ensemble", 2, 8, 5)
    allow_advanced = st.checkbox(
        "Include advanced families (XGBoost, LightGBM, CatBoost, TabPFN if installed)",
        value=True,
    )

    # --- Conditional imports (graceful fallbacks) ---
    # `except Exception` is deliberate: an optional package that is present
    # but broken should be treated as unavailable rather than crash the app.
    available_models = ["RandomForest", "ExtraTrees", "MLP"]  # always available (sklearn)
    optional_families = {}
    if allow_advanced:
        try:
            import xgboost as xgb
            optional_families["XGBoost"] = True
            available_models.append("XGBoost")
        except Exception:
            optional_families["XGBoost"] = False
        try:
            import lightgbm as lgb
            optional_families["LightGBM"] = True
            available_models.append("LightGBM")
        except Exception:
            optional_families["LightGBM"] = False
        try:
            import catboost as cb
            optional_families["CatBoost"] = True
            available_models.append("CatBoost")
        except Exception:
            optional_families["CatBoost"] = False
        try:
            # TabPFN is often packaged differently; attempt import but it's optional
            import tabpfn
            optional_families["TabPFN"] = True
            available_models.append("TabPFN")
        except Exception:
            optional_families["TabPFN"] = False
        try:
            # FT-Transformer optional
            from pytorch_tabular.models import transformers  # may not be installed
            optional_families["FTTransformer"] = True
            available_models.append("FTTransformer")
        except Exception:
            optional_families["FTTransformer"] = False

    st.markdown(f"Available model families: {', '.join(available_models)}")
|
| 462 |
+
|
| 463 |
+
# --- Optuna tuning routine per family ---
|
| 464 |
+
import optuna
|
| 465 |
+
from sklearn.model_selection import cross_val_score, KFold
|
| 466 |
+
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
|
| 467 |
+
from sklearn.linear_model import Ridge
|
| 468 |
+
from sklearn.neural_network import MLPRegressor
|
| 469 |
+
from sklearn.metrics import r2_score, mean_squared_error
|
| 470 |
+
|
| 471 |
+
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
    """Tune one model family with Optuna and return the best configuration.

    Args:
        family_name: One of "RandomForest", "ExtraTrees", "MLP", "XGBoost",
            "LightGBM", "CatBoost" or "TabPFN". Optional families are used
            only when flagged in ``optional_families``; anything else falls
            back to a fixed RandomForest.
        X_local: Feature matrix used for 3-fold cross-validation.
        y_local: Regression target aligned with ``X_local``.
        n_trials: Number of Optuna trials for tunable families.
        random_state: Seed for the Optuna sampler and every estimator.

    Returns:
        dict with keys ``model_obj`` (unfitted estimator, or the string
        "TabPFN_placeholder"), ``cv_score`` (mean 3-fold R², -999.0 when
        scoring failed), ``best_params``, ``family`` and ``study``.
    """
    # Imported locally so this function stays self-contained.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    failed_score = -999.0
    optional = ("XGBoost", "LightGBM", "CatBoost", "TabPFN")
    if family_name in ("RandomForest", "ExtraTrees", "MLP"):
        kind = family_name
    elif family_name in optional and optional_families.get(family_name):
        kind = family_name
    else:
        kind = "fallback"  # unknown or unavailable family -> fixed RandomForest
    tunable = kind not in ("TabPFN", "fallback")

    def sample_params(trial):
        """Draw one hyperparameter set for the family ({} if nothing to tune)."""
        if kind in ("RandomForest", "ExtraTrees"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 4, 30),
            }
        if kind == "XGBoost":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if kind == "LightGBM":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 16),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if kind == "CatBoost":
            return {
                "iterations": trial.suggest_int("iterations", 200, 1000),
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if kind == "MLP":
            return {
                "hidden_layer_sizes": trial.suggest_int("hidden_layer_sizes", 32, 512, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True),
            }
        return {}

    def build(params):
        """Create an unfitted estimator; missing params use family defaults."""
        if kind == "RandomForest":
            return RandomForestRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", 8),
                n_jobs=-1,
                random_state=random_state,
            )
        if kind == "ExtraTrees":
            return ExtraTreesRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", 8),
                n_jobs=-1,
                random_state=random_state,
            )
        if kind == "XGBoost":
            return xgb.XGBRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", 6),
                learning_rate=params.get("learning_rate", 0.1),
                tree_method="hist",
                verbosity=0,
                random_state=random_state,
                n_jobs=1,
            )
        if kind == "LightGBM":
            return lgb.LGBMRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", 8),
                learning_rate=params.get("learning_rate", 0.1),
                n_jobs=1,
                random_state=random_state,
            )
        if kind == "CatBoost":
            return cb.CatBoostRegressor(
                iterations=params.get("iterations", 200),
                depth=params.get("depth", 6),
                learning_rate=params.get("learning_rate", 0.1),
                verbose=0,
                random_state=random_state,
            )
        if kind == "MLP":
            # MLPs are sensitive to feature scale, so standardise inside the
            # pipeline; this also keeps scaling inside each CV fold.
            return make_pipeline(
                StandardScaler(),
                MLPRegressor(
                    hidden_layer_sizes=(params.get("hidden_layer_sizes", 128),),
                    learning_rate_init=params.get("learning_rate_init", 0.001),
                    max_iter=500,
                    random_state=random_state,
                ),
            )
        if kind == "TabPFN":
            # No hyperparameters are tuned here; callers substitute a real
            # model for this marker at training time.
            return "TabPFN_placeholder"
        return RandomForestRegressor(
            n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1
        )

    def cv_r2(model, trial=None):
        """Mean 3-fold R² of ``model``; ``failed_score`` if it cannot be scored."""
        if isinstance(model, str):  # placeholder marker, nothing to fit
            return failed_score
        try:
            score = float(
                np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3, n_jobs=1))
            )
        except Exception as exc:
            # Best-effort: a failing configuration must not abort the study,
            # but keep the reason inspectable on the trial.
            if trial is not None:
                trial.set_user_attr("error", repr(exc))
            return failed_score
        # A NaN objective would mark the trial as failed and could leave the
        # study without a best trial, so map it to the failure score.
        return score if np.isfinite(score) else failed_score

    def objective(trial):
        return cv_r2(build(sample_params(trial)), trial)

    # Seed the sampler so repeated runs explore the same configurations.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    # Families without a search space would just repeat one identical
    # evaluation, so a single trial is enough for them.
    study.optimize(
        objective,
        n_trials=n_trials if tunable else min(n_trials, 1),
        show_progress_bar=False,
    )

    if study.trials:
        best = dict(study.best_trial.params)
        best_value = float(study.best_value)
    else:
        best, best_value = {}, None

    try:
        model = build(best)
        # The best trial already holds the CV score of this exact
        # configuration (same seed, same folds); only score again when no
        # trial ran at all.
        score = best_value if best_value is not None else cv_r2(model)
    except Exception:
        model = RandomForestRegressor(
            n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1
        )
        score = cv_r2(model)

    return {
        "model_obj": model,
        "cv_score": score,
        "best_params": best,
        "family": family_name,
        "study": study,
    }
|
| 550 |
+
|
| 551 |
+
# --- Run tuning across available families (user triggered) ---
def _run_automl_stacking(X, y, *, features, target, use_case, max_trials, top_k,
                         allow_advanced, optional_families):
    """Tune base families, stack them with Ridge, and report holdout metrics.

    A 20% holdout is split off before any tuning or fitting, so the reported
    R²/RMSE come from rows no model has seen. Base models and the
    meta-learner are trained on the remaining 80% and saved with joblib.
    Results are rendered through Streamlit; nothing is returned.
    """
    # Local imports keep this section independent of the file-level imports.
    from sklearn.base import clone
    from sklearn.model_selection import train_test_split

    if not features:
        st.error("Select at least one input feature before running AutoML.")
        return
    if target in features:
        st.error(f"Remove the target `{target}` from the input features before running AutoML.")
        return
    if len(X) < 20:
        st.error("Not enough rows to train and validate an ensemble (need at least 20).")
        return

    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_fit = X_fit.reset_index(drop=True)
    y_fit = y_fit.reset_index(drop=True)

    families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
    if allow_advanced:
        families_to_try += [
            fam for fam in ("XGBoost", "LightGBM", "CatBoost") if optional_families.get(fam)
        ]
        # These families have no training wrapper yet. Substituting a
        # RandomForest under their name would mislabel the leaderboard and
        # add a duplicate column to the stack, so they are skipped openly.
        unsupported = [
            fam for fam in ("TabPFN", "FTTransformer") if optional_families.get(fam)
        ]
        if unsupported:
            st.info(
                f"Skipping {', '.join(unsupported)}: installed, but no training "
                "wrapper is implemented yet."
            )

    with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
        tuned_results = []
        for fam in families_to_try:
            st.caption(f"Tuning family: {fam}")
            res = tune_family(fam, X_fit, y_fit, n_trials=max_trials)
            if (
                isinstance(res, dict)
                and "model_obj" in res
                and not isinstance(res["model_obj"], str)  # skip placeholder markers
            ):
                tuned_results.append(res)
            else:
                st.warning(f"Family {fam} returned unexpected tune result: {res}")

    if not tuned_results:
        st.error("No base models created — aborting stacking.")
        return

    # Leaderboard, best CV R² first.
    tuned_results.sort(key=lambda r: r["cv_score"], reverse=True)
    lb = pd.DataFrame(
        [
            {"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]}
            for r in tuned_results
        ]
    )
    st.markdown("### Tuning Leaderboard (by CV R²)")
    st.dataframe(lb[["family", "cv_r2"]].round(4))

    # --- Build base models and collect out-of-fold preds for stacking ---
    st.markdown("### Building base models & out-of-fold predictions for stacking")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    base_models = []
    oof_preds = pd.DataFrame(index=X_fit.index)

    for entry in tuned_results:
        fam = entry["family"]
        template = entry["model_obj"]
        oof = np.zeros(len(X_fit))
        failed_folds = 0
        for tr_idx, val_idx in kf.split(X_fit):
            y_tr = y_fit.iloc[tr_idx]
            try:
                # A fresh clone per fold guarantees no state carries over.
                fold_model = clone(template)
                fold_model.fit(X_fit.iloc[tr_idx], y_tr)
                oof[val_idx] = fold_model.predict(X_fit.iloc[val_idx])
            except Exception:
                # Best-effort fallback: predict the fold's training mean.
                oof[val_idx] = y_tr.mean()
                failed_folds += 1
        if failed_folds:
            st.warning(f"{fam}: {failed_folds} fold(s) failed and fell back to the training mean.")

        try:
            fitted = clone(template)
            fitted.fit(X_fit, y_fit)
        except Exception as exc:
            st.warning(f"{fam} could not be fitted ({exc}); dropping it from the ensemble.")
            continue

        oof_preds[f"{fam}_oof"] = oof
        base_models.append({"family": fam, "model": fitted, "cv_r2": entry["cv_score"]})

    if not base_models:
        st.error("No base models created — aborting stacking.")
        return

    # --- Rank base models by CV R², using OOF diversity as the tie-breaker ---
    corr_matrix = oof_preds.corr().abs()
    diversity = {}
    for col in corr_matrix.columns:
        # diversity = 1 - mean |correlation| with the other models' OOF preds;
        # undefined for a single model or a constant column, so use 0.0 there.
        score = 1 - corr_matrix[col].drop(col).mean()
        diversity[col] = 0.0 if pd.isna(score) else float(score)

    summary_df = (
        pd.DataFrame(
            [
                {
                    "family": bm["family"],
                    "cv_r2": bm["cv_r2"],
                    "diversity": diversity.get(f"{bm['family']}_oof", 0.0),
                }
                for bm in base_models
            ]
        )
        .sort_values(["cv_r2", "diversity"], ascending=[False, False])
        .reset_index(drop=True)
    )
    st.markdown("### Base Model Summary (cv_r2, diversity)")
    st.dataframe(summary_df.round(4))

    selected = summary_df.head(top_k)["family"].tolist()
    st.markdown(f"Selected for stacking (top {top_k}): {selected}")

    # --- Meta-learner on OOF predictions ---
    # Fit on a plain array so predicting from np.column_stack below does not
    # trigger scikit-learn's feature-name mismatch warning.
    selected_cols = [f"{fam}_oof" for fam in selected]
    meta = Ridge(alpha=1.0)
    meta.fit(oof_preds[selected_cols].to_numpy(), y_fit)

    # --- Evaluate on the untouched holdout ---
    models_by_family = {bm["family"]: bm["model"] for bm in base_models}
    X_meta_val = np.column_stack([models_by_family[fam].predict(X_val) for fam in selected])
    y_meta_pred = meta.predict(X_meta_val)

    final_r2 = r2_score(y_val, y_meta_pred)
    # sqrt(MSE) works on every scikit-learn version; the `squared=False`
    # keyword was removed in 1.6.
    final_rmse = float(np.sqrt(mean_squared_error(y_val, y_meta_pred)))

    c1, c2 = st.columns(2)
    c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
    c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")

    # scatter plot
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(y_val, y_meta_pred, alpha=0.6)
    ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
    ax.set_xlabel("Actual")
    ax.set_ylabel("Stacked Predicted")
    st.pyplot(fig)
    plt.close(fig)  # figures otherwise accumulate across reruns

    # save artifacts: selected base models (fitted on the training split) + meta learner
    stack_artifact = os.path.join(DATA_DIR, f"stacked_{use_case.replace(' ','_')}.joblib")
    to_save = {
        "base_models": {fam: models_by_family[fam] for fam in selected},
        "meta": meta,
        "features": features,
        "selected": selected,
        "target": target,
    }
    joblib.dump(to_save, stack_artifact)
    st.caption(f"Stacked ensemble saved: {stack_artifact}")

    # --- SHAP: approximated by explaining the top-ranked tree-based base model ---
    st.markdown("### Explainability (approximate)")
    tree_families = {"XGBoost", "LightGBM", "RandomForest", "ExtraTrees", "CatBoost"}
    top_family = selected[0]
    if top_family not in tree_families:
        st.info("Top base model not tree-based; SHAP summary skipped. You can inspect per-base feature importances above.")
    else:
        fig_sh = None
        try:
            # sample for speed
            sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
            expl = shap.TreeExplainer(models_by_family[top_family])
            shap_vals = expl.shap_values(sample_X)
            fig_sh = plt.figure(figsize=(8, 6))
            shap.summary_plot(shap_vals, sample_X, show=False)
            st.pyplot(fig_sh)
        except Exception as e:
            st.warning(f"SHAP tree explainer unavailable: {e}")
        finally:
            if fig_sh is not None:
                plt.close(fig_sh)

    st.success("AutoML + Stacking complete. Review metrics and saved artifacts.")


# Re-enter the Ensemble tab so the button and results render inside it.
# NOTE(review): assumes entering `with tabs[4]:` again appends to the same
# tab container — confirm against the Streamlit version in use.
with tabs[4]:
    run_btn = st.button(" Run expanded AutoML + Stacking")
    if run_btn:
        _run_automl_stacking(
            X,
            y,
            features=features,
            target=target,
            use_case=use_case,
            max_trials=max_trials,
            top_k=top_k,
            allow_advanced=allow_advanced,
            optional_families=optional_families,
        )
|
| 717 |
|
| 718 |
# ----- Target & Business Impact tab
|
| 719 |
with tabs[5]:
|