singhn9 committed on
Commit
b056229
·
verified ·
1 Parent(s): 9a0d8df

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +345 -101
src/streamlit_app.py CHANGED
@@ -361,115 +361,359 @@ with tabs[3]:
361
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
362
 
363
 
364
- # ----- Ensemble + SHAP tab
365
  with tabs[4]:
366
- st.subheader("Autonomous Ensemble Modeling + SHAP Explainability")
367
-
368
- # --- Step 1: Basic UI selections ---
369
- target = st.selectbox("Target variable", numeric_cols, index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0)
370
- default_features = [c for c in numeric_cols if c != target][:60]
371
- features = st.multiselect("Model input features", numeric_cols, default=default_features)
372
- sample_size = st.slider("Sample rows for training", 500, min(4000, df.shape[0]), 1000, step=100)
373
- sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
374
- X = sub_df[features].fillna(0)
375
- y = sub_df[target].fillna(0)
376
 
377
- # --- Step 2: Business / Process Objective selection ---
378
- st.markdown("### 🎯 Select Operational Objective")
379
- objective = st.selectbox(
380
- "Optimization Objective",
381
  [
382
- "Maximize Accuracy (R²)",
383
- "Minimize RMSE (Stable Control)",
384
- "Maximize Yield Ratio (EAF/Inventory)",
385
- "Minimize Energy Consumption (Efficiency)",
386
- "Balanced (Accuracy + Efficiency)"
 
 
 
387
  ],
388
- index=0
389
  )
390
 
391
- # --- Step 3: Auto-tuning with Optuna ---
392
- import optuna
393
- from sklearn.model_selection import cross_val_score
394
-
395
- st.markdown("### ⚙️ Auto Tuning in Progress")
396
-
397
- def objective_fn(trial):
398
- model_name = trial.suggest_categorical("model", ["RandomForest", "GradientBoosting", "ExtraTrees"])
399
- n_estimators = trial.suggest_int("n_estimators", 100, 600)
400
- max_depth = trial.suggest_int("max_depth", 3, 20)
401
- learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
- if model_name == "RandomForest":
404
- model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
405
- elif model_name == "GradientBoosting":
406
- model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
407
- else:
408
- model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
409
-
410
- # Metric selection
411
- scoring_metric = "r2"
412
- if "RMSE" in objective:
413
- scoring_metric = "neg_root_mean_squared_error"
414
-
415
- score = cross_val_score(model, X, y, cv=3, scoring=scoring_metric).mean()
416
- return score
417
-
418
- if st.button("Run Auto Ensemble Optimization"):
419
- with st.spinner("Optimizing models... please wait (~20–60s)"):
420
- study = optuna.create_study(direction="maximize")
421
- study.optimize(objective_fn, n_trials=20)
422
-
423
- best_params = study.best_params
424
- st.success(" Best Auto-Tuned Model Found")
425
- st.json(best_params)
426
-
427
- # Build best model
428
- model_name = best_params.pop("model")
429
- if model_name == "RandomForest":
430
- model = RandomForestRegressor(**best_params)
431
- elif model_name == "GradientBoosting":
432
- model = GradientBoostingRegressor(**best_params)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  else:
434
- model = ExtraTreesRegressor(**best_params)
435
- model.fit(X, y)
436
-
437
- # Save model
438
- joblib.dump(model, ENSEMBLE_ARTIFACT)
439
- st.caption(f"Model saved: {ENSEMBLE_ARTIFACT}")
440
-
441
- # --- Auto Visualizations ---
442
- st.markdown("### 📈 Optimization History")
443
- fig_hist = optuna.visualization.matplotlib.plot_optimization_history(study)
444
- st.pyplot(fig_hist)
445
-
446
- # Predictions
447
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
448
- y_pred = model.predict(X_test)
449
-
450
- r2 = r2_score(y_test, y_pred)
451
- rmse = mean_squared_error(y_test, y_pred, squared=False)
452
-
453
- st.metric("R² Score", f"{r2:.3f}")
454
- st.metric("RMSE", f"{rmse:.3f}")
455
-
456
- # Scatter plot
457
- fig, ax = plt.subplots(figsize=(7,4))
458
- ax.scatter(y_test, y_pred, alpha=0.6)
459
- ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
460
- ax.set_xlabel("Actual"); ax.set_ylabel("Predicted")
461
- st.pyplot(fig)
462
-
463
- # --- SHAP Explainability for Best Model ---
464
- st.markdown("### 🔍 SHAP Explainability (Auto Model)")
465
- explainer = shap.TreeExplainer(model)
466
- shap_values = explainer.shap_values(X_test.sample(300))
467
- fig_shap = plt.figure(figsize=(8,6))
468
- shap.summary_plot(shap_values, X_test.sample(300), show=False)
469
- st.pyplot(fig_shap)
470
-
471
- st.info("Auto tuning complete. Model performance and SHAP summary shown above.")
472
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
 
474
  # ----- Target & Business Impact tab
475
  with tabs[5]:
 
361
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
362
 
363
 
364
+ # ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) -----
365
  with tabs[4]:
366
+ st.subheader(" AutoML Ensemble Expanded Families + Stacking + SHAP")
 
 
 
 
 
 
 
 
 
367
 
368
+ # --- Step 0: High-level Use Case (keeps previous defaults) ---
369
+ st.markdown("### Choose Industrial Use Case ")
370
+ use_case = st.selectbox(
371
+ "Select Use Case",
372
  [
373
+ "Predictive Maintenance",
374
+ "EAF Data Intelligence",
375
+ "Casting Quality Optimization",
376
+ "Rolling Mill Energy Optimization",
377
+ "Surface Defect Detection (Vision AI)",
378
+ "Material Composition & Alloy Mix AI",
379
+ "Inventory & Yield Optimization",
380
+ "Refractory & Cooling Loss Prediction"
381
  ],
382
+ index=1
383
  )
384
 
385
+ # Map use-case -> defaults (same as before)
386
+ use_case_config = {
387
+ "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
388
+ "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
389
+ "Casting Quality Optimization": {"target": "surface_temp" if "surface_temp" in numeric_cols else "furnace_temp", "model_hint": "GradientBoosting"},
390
+ "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
391
+ "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
392
+ "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
393
+ "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
394
+ "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
395
+ }
396
+ cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
397
+ target = cfg["target"]
398
+ model_hint = cfg["model_hint"]
399
+
400
+ # --- Feature auto-suggestion (keeps your earlier heuristic) ---
401
+ suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
402
+ if len(suggested) < 6:
403
+ suggested = [c for c in numeric_cols if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
404
+ if len(suggested) < 6:
405
+ suggested = numeric_cols[:50]
406
+
407
+ features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
408
+ st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
409
+
410
+ # --- Data sampling controls ---
411
+ max_rows = min(df.shape[0], 20000)
412
+ sample_size = st.slider("Sample rows (train speed vs fidelity)", 500, max_rows, min(1500, max_rows), step=100)
413
+
414
+ sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
415
+ X = sub_df[features].fillna(0)
416
+ y = sub_df[target].fillna(0)
417
 
418
+ # --- Ensemble control UI ---
419
+ st.markdown("### Ensemble & AutoML Settings")
420
+ max_trials = st.slider("Optuna trials per family (total trials grow with families)", 5, 80, 20, step=5)
421
+ top_k = st.slider("Max base models to keep in final ensemble", 2, 8, 5)
422
+ allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost, TabPFN if installed)", value=True)
423
+
424
+ # --- Conditional imports (graceful fallbacks) ---
425
+ available_models = ["RandomForest", "ExtraTrees"] # always available (sklearn)
426
+ optional_families = {}
427
+ if allow_advanced:
428
+ try:
429
+ import xgboost as xgb
430
+ optional_families["XGBoost"] = True
431
+ available_models.append("XGBoost")
432
+ except Exception:
433
+ optional_families["XGBoost"] = False
434
+ try:
435
+ import lightgbm as lgb
436
+ optional_families["LightGBM"] = True
437
+ available_models.append("LightGBM")
438
+ except Exception:
439
+ optional_families["LightGBM"] = False
440
+ try:
441
+ import catboost as cb
442
+ optional_families["CatBoost"] = True
443
+ available_models.append("CatBoost")
444
+ except Exception:
445
+ optional_families["CatBoost"] = False
446
+ try:
447
+ # TabPFN is often packaged differently; attempt import but it's optional
448
+ import tabpfn
449
+ optional_families["TabPFN"] = True
450
+ available_models.append("TabPFN")
451
+ except Exception:
452
+ optional_families["TabPFN"] = False
453
+ try:
454
+ # FT-Transformer optional
455
+ from pytorch_tabular.models import transformers # may not be installed
456
+ optional_families["FTTransformer"] = True
457
+ available_models.append("FTTransformer")
458
+ except Exception:
459
+ optional_families["FTTransformer"] = False
460
+
461
+ st.markdown(f"Available model families: {', '.join(available_models)}")
462
+
463
+ # --- Optuna tuning routine per family ---
464
+ import optuna
465
+ from sklearn.model_selection import cross_val_score, KFold
466
+ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
467
+ from sklearn.linear_model import Ridge
468
+ from sklearn.neural_network import MLPRegressor
469
+ from sklearn.metrics import r2_score, mean_squared_error
470
+
471
+ def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
472
+ """Tune one model family using Optuna; returns best (model_obj, cv_score, best_params)."""
473
+ def obj(trial):
474
+ # sample hyperparams per family
475
+ if family_name == "RandomForest":
476
+ n_estimators = trial.suggest_int("n_estimators", 100, 800)
477
+ max_depth = trial.suggest_int("max_depth", 4, 30)
478
+ m = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1, random_state=random_state)
479
+ elif family_name == "ExtraTrees":
480
+ n_estimators = trial.suggest_int("n_estimators", 100, 800)
481
+ max_depth = trial.suggest_int("max_depth", 4, 30)
482
+ m = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1, random_state=random_state)
483
+ elif family_name == "XGBoost" and optional_families.get("XGBoost"):
484
+ n_estimators = trial.suggest_int("n_estimators", 100, 1000)
485
+ max_depth = trial.suggest_int("max_depth", 3, 12)
486
+ lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
487
+ m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0, random_state=random_state, n_jobs=1)
488
+ elif family_name == "LightGBM" and optional_families.get("LightGBM"):
489
+ n_estimators = trial.suggest_int("n_estimators", 100, 1000)
490
+ max_depth = trial.suggest_int("max_depth", 3, 16)
491
+ lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
492
+ m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1, random_state=random_state)
493
+ elif family_name == "CatBoost" and optional_families.get("CatBoost"):
494
+ iterations = trial.suggest_int("iterations", 200, 1000)
495
+ depth = trial.suggest_int("depth", 4, 10)
496
+ lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
497
+ m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0, random_state=random_state)
498
+ elif family_name == "MLP":
499
+ hidden = trial.suggest_int("hidden_layer_sizes", 32, 512, log=True)
500
+ lr = trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True)
501
+ m = MLPRegressor(hidden_layer_sizes=(hidden,), learning_rate_init=lr, max_iter=500, random_state=random_state)
502
+ elif family_name == "TabPFN" and optional_families.get("TabPFN"):
503
+ # TabPFN often works without hyperparams exposure; return a surrogate score using quick fit
504
+ # We'll call its predict_proba style API if available; as fallback use a mean score to let stacking consider it.
505
+ # For tuning, just return a placeholder; we'll build model object later.
506
+ return 0.0
507
  else:
508
+ # fallback to a small RandomForest to avoid crashing
509
+ m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1)
510
+
511
+ # use negative RMSE if better for our domain? keep R2 for generality
512
+ try:
513
+ scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3, n_jobs=1)
514
+ return float(np.mean(scores))
515
+ except Exception:
516
+ return -999.0
517
+
518
+ study = optuna.create_study(direction="maximize")
519
+ study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
520
+ best = study.best_trial.params if study.trials else {}
521
+ # instantiate best model
522
+ try:
523
+ if family_name == "RandomForest":
524
+ model = RandomForestRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), n_jobs=-1, random_state=42)
525
+ elif family_name == "ExtraTrees":
526
+ model = ExtraTreesRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), n_jobs=-1, random_state=42)
527
+ elif family_name == "XGBoost" and optional_families.get("XGBoost"):
528
+ model = xgb.XGBRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",6), learning_rate=best.get("learning_rate",0.1), tree_method="hist", verbosity=0, random_state=42, n_jobs=1)
529
+ elif family_name == "LightGBM" and optional_families.get("LightGBM"):
530
+ model = lgb.LGBMRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), learning_rate=best.get("learning_rate",0.1), n_jobs=1, random_state=42)
531
+ elif family_name == "CatBoost" and optional_families.get("CatBoost"):
532
+ model = cb.CatBoostRegressor(iterations=best.get("iterations",200), depth=best.get("depth",6), learning_rate=best.get("learning_rate",0.1), verbose=0, random_state=42)
533
+ elif family_name == "MLP":
534
+ model = MLPRegressor(hidden_layer_sizes=(best.get("hidden_layer_sizes",128),), learning_rate_init=best.get("learning_rate_init",0.001), max_iter=500, random_state=42)
535
+ elif family_name == "TabPFN" and optional_families.get("TabPFN"):
536
+ # We'll create a small wrapper for TabPFN later on train time
537
+ model = "TabPFN_placeholder"
538
+ else:
539
+ model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
540
+ except Exception:
541
+ model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
542
+
543
+ # compute cross-validated score for the best model
544
+ try:
545
+ score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3, n_jobs=1)))
546
+ except Exception:
547
+ score = -999.0
548
+
549
+ return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name, "study": study}
550
+
551
+ # --- Run tuning across available families (user triggered) ---
552
+ run_btn = st.button(" Run expanded AutoML + Stacking")
553
+ if run_btn:
554
+ with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
555
+ families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
556
+ if allow_advanced:
557
+ if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
558
+ if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
559
+ if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
560
+ if optional_families.get("TabPFN"): families_to_try.append("TabPFN")
561
+ if optional_families.get("FTTransformer"): families_to_try.append("FTTransformer")
562
+
563
+ tuned_results = []
564
+ for fam in families_to_try:
565
+ st.caption(f"Tuning family: {fam}")
566
+ res = tune_family(fam, X, y, n_trials=max_trials)
567
+ # res can be dict or single-run result; ensure consistent format
568
+ if isinstance(res, dict) and "model_obj" in res:
569
+ tuned_results.append(res)
570
+ else:
571
+ st.warning(f"Family {fam} returned unexpected tune result: {res}")
572
+
573
+ # build leaderboard DataFrame
574
+ lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
575
+ lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
576
+ st.markdown("### Tuning Leaderboard (by CV R²)")
577
+ st.dataframe(lb[["family","cv_r2"]].round(4))
578
+
579
+ # --- Build base-models and collect out-of-fold preds for stacking ---
580
+ st.markdown("### Building base models & out-of-fold predictions for stacking")
581
+ kf = KFold(n_splits=5, shuffle=True, random_state=42)
582
+ base_models = []
583
+ oof_preds = pd.DataFrame(index=X.index)
584
+
585
+ for idx, row in lb.iterrows():
586
+ fam = row["family"]
587
+ model_entry = next((r for r in tuned_results if r["family"] == fam), None)
588
+ if model_entry is None:
589
+ continue
590
+ model_obj = model_entry["model_obj"]
591
+ # train out-of-fold predictions
592
+ oof = np.zeros(X.shape[0])
593
+ for tr_idx, val_idx in kf.split(X):
594
+ X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
595
+ y_tr = y.iloc[tr_idx]
596
+ # fit family-specific wrapper (TabPFN/FTTransformer special-case)
597
+ if model_obj == "TabPFN_placeholder":
598
+ try:
599
+ # TabPFN expects specific API; create a simple fallback: use RandomForest to approximate
600
+ tmp = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
601
+ tmp.fit(X_tr, y_tr)
602
+ oof[val_idx] = tmp.predict(X_val)
603
+ except Exception:
604
+ oof[val_idx] = np.mean(y_tr)
605
+ else:
606
+ try:
607
+ model_obj.fit(X_tr, y_tr)
608
+ oof[val_idx] = model_obj.predict(X_val)
609
+ except Exception:
610
+ # fallback to mean
611
+ oof[val_idx] = np.mean(y_tr)
612
+ oof_preds[f"{fam}_oof"] = oof
613
+
614
+ # finally fit model on full data
615
+ try:
616
+ if model_entry["model_obj"] == "TabPFN_placeholder":
617
+ # fallback full-model: RandomForest
618
+ fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
619
+ fitted.fit(X, y)
620
+ else:
621
+ model_entry["model_obj"].fit(X, y)
622
+ fitted = model_entry["model_obj"]
623
+ except Exception:
624
+ fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
625
+ fitted.fit(X, y)
626
+
627
+ base_models.append({"family": fam, "model": fitted, "cv_r2": model_entry["cv_score"]})
628
+
629
+ # --- prune highly correlated OOF preds and keep top_k diverse models ---
630
+ if oof_preds.shape[1] == 0:
631
+ st.error("No base models created — aborting stacking.")
632
+ else:
633
+ corr_matrix = oof_preds.corr().abs()
634
+ # compute diversity score = (1 - mean correlation with others)
635
+ diversity = {col: 1 - corr_matrix[col].drop(col).mean() for col in corr_matrix.columns}
636
+ summary = []
637
+ for bm in base_models:
638
+ col = f"{bm['family']}_oof"
639
+ summary.append({"family": bm["family"], "cv_r2": bm["cv_r2"], "diversity": diversity.get(col, 0.0)})
640
+ summary_df = pd.DataFrame(summary).sort_values(["cv_r2", "diversity"], ascending=[False, False]).reset_index(drop=True)
641
+ st.markdown("### Base Model Summary (cv_r2, diversity)")
642
+ st.dataframe(summary_df.round(4))
643
+
644
+ # select top_k by cv_r2 and diversity combined
645
+ selected = summary_df.sort_values(["cv_r2","diversity"], ascending=[False, False]).head(top_k)["family"].tolist()
646
+ st.markdown(f"Selected for stacking (top {top_k}): {selected}")
647
+
648
+ # build stacking training data (OOF preds for selected)
649
+ selected_cols = [f"{s}_oof" for s in selected]
650
+ X_stack = oof_preds[selected_cols].fillna(0)
651
+ meta = Ridge(alpha=1.0)
652
+ meta.fit(X_stack, y)
653
+
654
+ # evaluate stacked ensemble on a holdout split
655
+ X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
656
+ # predict with base models -> create meta inputs
657
+ meta_inputs = []
658
+ for fam in selected:
659
+ bm = next((b for b in base_models if b["family"] == fam), None)
660
+ if bm is not None:
661
+ try:
662
+ meta_inputs.append(bm["model"].predict(X_val))
663
+ except Exception:
664
+ meta_inputs.append(np.full(len(X_val), y_tr.mean()))
665
+ else:
666
+ meta_inputs.append(np.full(len(X_val), y_tr.mean()))
667
+ X_meta_val = np.column_stack(meta_inputs)
668
+ y_meta_pred = meta.predict(X_meta_val)
669
+
670
+ final_r2 = r2_score(y_val, y_meta_pred)
671
+ final_rmse = mean_squared_error(y_val, y_meta_pred, squared=False)
672
+
673
+ c1, c2 = st.columns(2)
674
+ c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
675
+ c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")
676
+
677
+ # scatter plot
678
+ fig, ax = plt.subplots(figsize=(7,4))
679
+ ax.scatter(y_val, y_meta_pred, alpha=0.6)
680
+ ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
681
+ ax.set_xlabel("Actual"); ax.set_ylabel("Stacked Predicted")
682
+ st.pyplot(fig)
683
+
684
+ # save artifacts: base models list + meta learner
685
+ stack_artifact = os.path.join(DATA_DIR, f"stacked_{use_case.replace(' ','_')}.joblib")
686
+ to_save = {"base_models": {bm["family"]: bm["model"] for bm in base_models if bm["family"] in selected}, "meta": meta, "features": features, "selected": selected, "target": target}
687
+ joblib.dump(to_save, stack_artifact)
688
+ st.caption(f"Stacked ensemble saved: {stack_artifact}")
689
+
690
+ # --- SHAP on final stack: approximate by SHAP of top base model or meta contributions ---
691
+ st.markdown("### Explainability (approximate)")
692
+ try:
693
+ # Prefer SHAP on top base model (tree) for interpretability
694
+ top_base = next((b for b in base_models if b["family"] == selected[0]), None)
695
+ if top_base is not None and hasattr(top_base["model"], "predict"):
696
+ # sample for speed
697
+ sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
698
+ if hasattr(top_base["model"], "predict") and ("XGBoost" in top_base["family"] or "LightGBM" in top_base["family"] or "RandomForest" in top_base["family"] or "ExtraTrees" in top_base["family"] or "CatBoost" in top_base["family"]):
699
+ expl = None
700
+ # safe tree explainer creation
701
+ try:
702
+ expl = shap.TreeExplainer(top_base["model"])
703
+ shap_vals = expl.shap_values(sample_X)
704
+ fig_sh = plt.figure(figsize=(8,6))
705
+ shap.summary_plot(shap_vals, sample_X, show=False)
706
+ st.pyplot(fig_sh)
707
+ except Exception as e:
708
+ st.warning(f"SHAP tree explainer unavailable: {e}")
709
+ else:
710
+ st.info("Top base model not tree-based; SHAP summary skipped. You can inspect per-base feature importances above.")
711
+ else:
712
+ st.info("No suitable base model for SHAP explanation found.")
713
+ except Exception as e:
714
+ st.warning(f"SHAP step failed gracefully: {e}")
715
+
716
+ st.success("AutoML + Stacking complete. Review metrics and saved artifacts.")
717
 
718
  # ----- Target & Business Impact tab
719
  with tabs[5]: