Spaces:

singhn9
/

SteelAI_Module2_EAF_Intelligence_Explorer

Sleeping

App Files Files Community

singhn9 committed on Nov 7, 2025

Commit

9a0d8df

verified ·

1 Parent(s): 5a05838

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +100 -193

src/streamlit_app.py CHANGED Viewed

@@ -360,208 +360,115 @@ with tabs[3]:
     st.subheader("Summary statistics (numeric features)")
     st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
 # ----- Ensemble + SHAP tab
 with tabs[4]:
-    st.subheader("Ensemble modeling sandbox (fast) + SHAP explainability")
-    # Feature & target selector
-    target = st.selectbox("Target variable", numeric_cols, index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0)
-    default_features = [c for c in numeric_cols if c != target][:50]  # preselect up to 50 features default
-    features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
-    sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
-    train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
-    # Model Remediation & Tuning Options
-    st.markdown("###  Model Remediation & Tuning Options")
-    st.info("Use these to improve flat or low-variance predictions without editing code.")
-    colA, colB, colC = st.columns(3)
-    with colA:
-        apply_scaling = st.checkbox("Apply StandardScaler()", value=False)
-        feature_filter = st.checkbox("Use key furnace-relevant features", value=True)
-    with colB:
-        random_seed = st.number_input("Random Seed", min_value=0, max_value=9999, value=42)
-        n_estimators = st.slider("n_estimators (trees)", 50, 600, 150, step=25)
-    with colC:
-        furnace_temp_sd = st.slider("Synthetic Furnace Temp σ (spread)", 20, 500, 50, step=10)
-        arc_power_sd = st.slider("Synthetic Arc Power σ (spread)", 50, 700, 120, step=10)
-    st.markdown("---")
-    # --- Variance Controls UI ---
-    st.markdown("#### Variance controls (global & per-feature)")
-    global_var_mult = st.slider(
-        "Global variance multiplier", 0.1, 5.0, 1.0, step=0.1,
-        help="Multiply base standard deviations by this factor for all features."
-    )
-    # Optional: choose features to override
-    feat_for_override = st.multiselect(
-        "Select features to override variance (optional)", numeric_cols, max_selections=8
     )
-    variance_overrides = {}
-    if feat_for_override:
-        st.markdown("Set multipliers for selected features")
-        for f in feat_for_override:
-            mult = st.number_input(
-                f"Variance multiplier for {f}", min_value=0.1, max_value=10.0,
-                value=1.0, step=0.1, key=f"mult_{f}"
-            )
-            variance_overrides[f] = float(mult)
-    st.markdown("---")
-    # --- Regeneration button ---
-    if st.button("Regenerate Synthetic Dataset with Updated Variance"):
-        with st.spinner("Regenerating synthetic data..."):
-            variance_overrides.update({
-                "furnace_temp": furnace_temp_sd / 50,
-                "arc_power": arc_power_sd / 120
-            })
-            CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
-                n_rows=3000,
-                random_seed=int(random_seed),
-                max_polynomial_new=60,
-                global_variance_multiplier=float(global_var_mult),
-                variance_overrides=variance_overrides,
-            )
-            # Clear cache and reload fresh
-            st.cache_data.clear()
-            df, meta_df = load_data(csv_path=CSV_PATH, meta_path=META_PATH)
-            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-            st.success(
-                f"Synthetic dataset regenerated — {len(df)} rows × {len(df.columns)} features "
-                f"(Global×{global_var_mult:.2f}; Overrides={len(variance_overrides)})"
-            )
-            st.caption(
-                f"Mean furnace_temp: {df['furnace_temp'].mean():.2f}, "
-                f"Std furnace_temp: {df['furnace_temp'].std():.2f}"
-            )
-    if train_button:
-        with st.spinner("Preparing data and training ensemble..."):
-            sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
-            X = sub_df[features].fillna(0)
-            y = sub_df[target].fillna(0)
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-            # models
-            models = {
-                "Linear": LinearRegression(),
-                "RandomForest": RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1),
-                "GradientBoosting": GradientBoostingRegressor(n_estimators=150, random_state=42),
-                "ExtraTrees": ExtraTreesRegressor(n_estimators=150, random_state=42, n_jobs=-1)
-            }
-            preds = {}
-            results = []
-            for name, m in models.items():
-                m.fit(X_train, y_train)
-                p = m.predict(X_test)
-                preds[name] = p
-                results.append({"Model": name, "R2": r2_score(y_test, p), "RMSE": float(np.sqrt(mean_squared_error(y_test, p)))})
-            # ensemble average
-            ensemble_pred = np.column_stack(list(preds.values())).mean(axis=1)
-            results.append({"Model": "EnsembleAvg", "R2": r2_score(y_test, ensemble_pred), "RMSE": float(np.sqrt(mean_squared_error(y_test, ensemble_pred)))})
-            st.dataframe(pd.DataFrame(results).set_index("Model").round(4))
-            # scatter
-            fig, ax = plt.subplots(figsize=(8,4))
-            ax.scatter(y_test, ensemble_pred, alpha=0.5)
-            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
-            ax.set_xlabel("Actual"); ax.set_ylabel("Predicted (Ensemble)")
-            st.pyplot(fig)
-            # save the models (lightweight)
-            joblib.dump(models, ENSEMBLE_ARTIFACT)
-            st.success(f"Saved ensemble models to {ENSEMBLE_ARTIFACT}")
-            # ---------- SHAP explainability ----------
-            st.markdown("### SHAP Explainability — pick a model to explain (Tree models recommended)")
-            explain_model_name = st.selectbox("Model to explain", list(models.keys()), index= list(models.keys()).index("RandomForest") if "RandomForest" in models else 0)
-            explainer_sample = st.slider("Number of rows to use for SHAP explanation (memory heavy)", 50, min(1500, sample_size), value=300, step=50)
-            # Use a Tree explainer if possible; otherwise KernelExplainer (slow)
-            model_to_explain = models[explain_model_name]
-            X_shap = X_test.copy()
-            if explainer_sample < X_shap.shape[0]:
-                X_shap_for = X_shap.sample(n=explainer_sample, random_state=42)
             else:
-                X_shap_for = X_shap
-            with st.spinner("Computing SHAP values (this may take a while for large SHAP sample)..."):
-                try:
-                    if hasattr(model_to_explain, "predict") and (explain_model_name in ["RandomForest","ExtraTrees","GradientBoosting"]):
-                        explainer = shap.TreeExplainer(model_to_explain)
-                        shap_values = explainer.shap_values(X_shap_for)
-                        # summary plot
-                        import warnings
-                        warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
-                        fig_shap = plt.figure(figsize=(8,6))
-                        shap.summary_plot(shap_values, X_shap_for, show=False)
-                        st.pyplot(fig_shap)
-                    else:
-                        # fallback: use KernelExplainer on small sample (very slow)
-                        explainer = shap.KernelExplainer(model_to_explain.predict, shap.sample(X_train, 100))
-                        shap_values = explainer.shap_values(X_shap_for, nsamples=100)
-                        fig_shap = plt.figure(figsize=(8,6))
-                        shap.summary_plot(shap_values, X_shap_for, show=False)
-                        st.pyplot(fig_shap)
-                    st.success("SHAP summary plotted.")
-                except Exception as e:
-                    st.error(f"SHAP failed: {e}")
-            # per-instance explanation waterfall
-            st.markdown("#### Explain a single prediction (waterfall):")
-            idx_choice = st.number_input("Row index (0..n_test-1)", min_value=0, max_value=X_shap.shape[0]-1, value=0)
-            try:
-                row = X_shap_for.iloc[[idx_choice]]
-                if explain_model_name in ["RandomForest","ExtraTrees","GradientBoosting"]:
-                  expl = shap.TreeExplainer(model_to_explain)
-                  shap_vals_row = expl.shap_values(row)
-                  exp_val = expl.expected_value
-                  shap_vals = shap_vals_row
-                  # Handle tree models returning arrays for single target
-                  if isinstance(exp_val, (list, np.ndarray)) and not np.isscalar(exp_val):
-                      exp_val = exp_val[0]
-                  if isinstance(shap_vals, list):
-                      shap_vals = shap_vals[0]
-                  exp_val = expl.expected_value
-                  shap_vals = shap_vals_row
-                  # Handle multi-output case
-                  if isinstance(exp_val, (list, np.ndarray)) and not np.isscalar(exp_val):
-                      exp_val = exp_val[0]
-                  if isinstance(shap_vals, list):
-                      shap_vals = shap_vals[0]
-                  # Plot safely across SHAP versions
-                  try:
-                      explanation = shap.Explanation(
-                          values=shap_vals[0],
-                          base_values=exp_val,
-                          data=row.iloc[0],
-                          feature_names=row.columns.tolist()
-                      )
-                      plot_obj = shap.plots.waterfall(explanation, show=False)
-                      # If SHAP returns Axes instead of Figure, wrap it
-                      import matplotlib.pyplot as plt
-                      if hasattr(plot_obj, "figure"):
-                          fig2 = plot_obj.figure
-                      else:
-                          fig2 = plt.gcf()
-                      st.pyplot(fig2)
-                  except Exception as e:
-                      st.warning(f"Waterfall plotting failed gracefully: {e}")
-                else:
-                    st.info("Per-instance waterfall not available for this model type in fallback.")
-            except Exception as e:
-                st.warning(f"Could not plot waterfall: {e}")
 # -----  Target & Business Impact tab

     st.subheader("Summary statistics (numeric features)")
     st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
 # ----- Ensemble + SHAP tab
 with tabs[4]:
+    st.subheader("Autonomous Ensemble Modeling + SHAP Explainability")
+    # This tab samples the dataset, tunes a tree ensemble with Optuna on a
+    # training split, then reports hold-out metrics and a SHAP summary.
+    # --- Step 1: Basic UI selections ---
+    target = st.selectbox("Target variable", numeric_cols, index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0)
+    default_features = [c for c in numeric_cols if c != target][:60]
+    features = st.multiselect("Model input features", numeric_cols, default=default_features)
+    # The multiselect still offers the target column; drop it so the model is
+    # never handed its own answer and `features + [target]` has no duplicate.
+    features = [c for c in features if c != target]
+    # Clamp the slider to the rows actually available: Streamlit rejects a
+    # default value (or a minimum) that lies above the slider maximum.
+    max_rows = min(4000, df.shape[0])
+    if max_rows > 500:
+        sample_size = st.slider("Sample rows for training", 500, max_rows, min(1000, max_rows), step=100)
+    else:
+        sample_size = max_rows
+    sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
+    X = sub_df[features].fillna(0)
+    y = sub_df[target].fillna(0)
+    # Split once, before tuning, so the reported metrics come from rows that
+    # neither the Optuna trials nor the final fit have seen.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    # --- Step 2: Business / Process Objective selection ---
+    st.markdown("### 🎯 Select Operational Objective")
+    objective = st.selectbox(
+        "Optimization Objective",
+        [
+            "Maximize Accuracy (R²)",
+            "Minimize RMSE (Stable Control)",
+            "Maximize Yield Ratio (EAF/Inventory)",
+            "Minimize Energy Consumption (Efficiency)",
+            "Balanced (Accuracy + Efficiency)"
+        ],
+        index=0
     )
+    # Only the RMSE objective changes the tuning metric; every other choice is
+    # scored by R². Show the metric so the user can see what is optimized.
+    scoring_metric = "neg_root_mean_squared_error" if "RMSE" in objective else "r2"
+    st.caption(f"Tuning metric: {scoring_metric}")
+    # --- Step 3: Auto-tuning with Optuna ---
+    import optuna
+    from optuna.visualization.matplotlib import plot_optimization_history
+    from sklearn.model_selection import cross_val_score
+    st.markdown("### ⚙️ Auto Tuning in Progress")
+    def build_model(model_name, params):
+        """Return an unfitted regressor of the named family configured with ``params``."""
+        if model_name == "GradientBoosting":
+            return GradientBoostingRegressor(random_state=42, **params)
+        # Only the two forest estimators accept n_jobs.
+        model_cls = RandomForestRegressor if model_name == "RandomForest" else ExtraTreesRegressor
+        return model_cls(n_jobs=-1, random_state=42, **params)
+    def objective_fn(trial):
+        """Optuna objective: mean 3-fold CV score of one sampled configuration."""
+        model_name = trial.suggest_categorical("model", ["RandomForest", "GradientBoosting", "ExtraTrees"])
+        params = {
+            "n_estimators": trial.suggest_int("n_estimators", 100, 600),
+            "max_depth": trial.suggest_int("max_depth", 3, 20),
+        }
+        # learning_rate exists only on GradientBoosting; suggesting it on every
+        # trial would put it in best_params and break RandomForest/ExtraTrees.
+        if model_name == "GradientBoosting":
+            params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
+        model = build_model(model_name, params)
+        return cross_val_score(model, X_train, y_train, cv=3, scoring=scoring_metric).mean()
+    if st.button("Run Auto Ensemble Optimization"):
+        if not features:
+            st.warning("Select at least one input feature (other than the target) before optimizing.")
+        else:
+            with st.spinner("Optimizing models... please wait (~20–60s)"):
+                # Both scorers are "higher is better" (RMSE is negated).
+                study = optuna.create_study(direction="maximize")
+                study.optimize(objective_fn, n_trials=20)
+                best_params = dict(study.best_params)
+                st.success("✅ Best Auto-Tuned Model Found")
+                st.json(best_params)
+                # Build best model
+                model_name = best_params.pop("model")
+                model = build_model(model_name, best_params)
+                # Fit on the training split only; X_test stays unseen for scoring.
+                model.fit(X_train, y_train)
+                # Save model
+                joblib.dump(model, ENSEMBLE_ARTIFACT)
+                st.caption(f"Model saved: {ENSEMBLE_ARTIFACT}")
+                # --- Auto Visualizations ---
+                st.markdown("### 📈 Optimization History")
+                # The matplotlib backend returns an Axes; st.pyplot needs its Figure.
+                ax_hist = plot_optimization_history(study)
+                st.pyplot(ax_hist.figure)
+                # Predictions
+                y_pred = model.predict(X_test)
+                r2 = r2_score(y_test, y_pred)
+                # sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
+                rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
+                st.metric("R² Score", f"{r2:.3f}")
+                st.metric("RMSE", f"{rmse:.3f}")
+                # Scatter plot
+                fig, ax = plt.subplots(figsize=(7, 4))
+                ax.scatter(y_test, y_pred, alpha=0.6)
+                ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
+                ax.set_xlabel("Actual")
+                ax.set_ylabel("Predicted")
+                st.pyplot(fig)
+                # --- SHAP Explainability for Best Model ---
+                st.markdown("### 🔍 SHAP Explainability (Auto Model)")
+                explainer = shap.TreeExplainer(model)
+                # Draw one sample, capped at the test-set size, and reuse it so the
+                # SHAP values and the plotted feature rows refer to the same data.
+                X_shap = X_test.sample(n=min(300, len(X_test)), random_state=42)
+                shap_values = explainer.shap_values(X_shap)
+                fig_shap = plt.figure(figsize=(8, 6))
+                shap.summary_plot(shap_values, X_shap, show=False)
+                st.pyplot(fig_shap)
+                st.info("Auto tuning complete. Model performance and SHAP summary shown above.")
 # -----  Target & Business Impact tab