Update src/streamlit_app.py
Browse files- src/streamlit_app.py +216 -737
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
|
@@ -12,10 +11,11 @@ import seaborn as sns
|
|
| 12 |
import joblib
|
| 13 |
import zipfile
|
| 14 |
import io
|
|
|
|
| 15 |
|
| 16 |
# ML imports
|
| 17 |
from sklearn.model_selection import train_test_split
|
| 18 |
-
from sklearn.linear_model import LinearRegression
|
| 19 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
|
| 20 |
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
|
| 21 |
from sklearn.decomposition import PCA
|
|
@@ -25,6 +25,10 @@ from sklearn.metrics import mean_squared_error, r2_score
|
|
| 25 |
# SHAP
|
| 26 |
import shap
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# --- Safe defaults for Streamlit session state ---
|
| 30 |
defaults = {
|
|
@@ -38,7 +42,6 @@ defaults = {
|
|
| 38 |
for k, v in defaults.items():
|
| 39 |
st.session_state.setdefault(k, v)
|
| 40 |
|
| 41 |
-
|
| 42 |
if "llm_result" not in st.session_state:
|
| 43 |
st.session_state["llm_result"] = None
|
| 44 |
if "automl_summary" not in st.session_state:
|
|
@@ -51,7 +54,6 @@ if "hf_clicked" not in st.session_state:
|
|
| 51 |
# -------------------------
|
| 52 |
# Config & paths
|
| 53 |
# -------------------------
|
| 54 |
-
|
| 55 |
st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
|
| 56 |
plt.style.use("seaborn-v0_8-muted")
|
| 57 |
sns.set_palette("muted")
|
|
@@ -79,17 +81,13 @@ def log(msg: str):
|
|
| 79 |
f.write(f"[{stamp}] {msg}\n")
|
| 80 |
print(msg)
|
| 81 |
|
| 82 |
-
|
| 83 |
log("=== Streamlit session started ===")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
if os.path.exists("/data"):
|
| 88 |
st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
|
| 89 |
else:
|
| 90 |
st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
|
| 91 |
|
| 92 |
-
|
| 93 |
# -------------------------
|
| 94 |
# Utility: generate advanced dataset if missing
|
| 95 |
# -------------------------
|
|
@@ -104,13 +102,6 @@ def generate_advanced_flatfile(
|
|
| 104 |
Generates a large synthetic, physics-aligned dataset with many engineered features.
|
| 105 |
Allows control of variability per feature (through variance_overrides) or globally
|
| 106 |
(via global_variance_multiplier).
|
| 107 |
-
|
| 108 |
-
Args:
|
| 109 |
-
n_rows: number of samples
|
| 110 |
-
random_seed: RNG seed
|
| 111 |
-
max_polynomial_new: limit on number of polynomial expansion features
|
| 112 |
-
global_variance_multiplier: multiplier applied to all default stddevs
|
| 113 |
-
variance_overrides: dict mapping feature name or substring → stddev multiplier
|
| 114 |
"""
|
| 115 |
np.random.seed(random_seed)
|
| 116 |
os.makedirs(LOG_DIR, exist_ok=True)
|
|
@@ -307,37 +298,7 @@ def generate_advanced_flatfile(
|
|
| 307 |
existing = [meta_entry]
|
| 308 |
json.dump(existing, open(META_PATH, "w"), indent=2)
|
| 309 |
|
| 310 |
-
|
| 311 |
PDF_PATH = None
|
| 312 |
-
# annotated bibliography
|
| 313 |
-
# try:
|
| 314 |
-
# from fpdf import FPDF
|
| 315 |
-
# pdf = FPDF('P','mm','A4')
|
| 316 |
-
# pdf.add_page()
|
| 317 |
-
# pdf.set_font("Helvetica","B",14)
|
| 318 |
-
# pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
|
| 319 |
-
# pdf.ln(2)
|
| 320 |
-
# pdf.set_font("Helvetica","",10)
|
| 321 |
-
# pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
|
| 322 |
-
# pdf.ln(4)
|
| 323 |
-
# bib_items = [
|
| 324 |
-
# ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
|
| 325 |
-
# ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
|
| 326 |
-
# ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
|
| 327 |
-
# ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
|
| 328 |
-
# ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
|
| 329 |
-
# ]
|
| 330 |
-
# for title, auth, note in bib_items:
|
| 331 |
-
# pdf.set_font("Helvetica","B",11)
|
| 332 |
-
# pdf.multi_cell(0,6, f"{title} — {auth}")
|
| 333 |
-
# pdf.set_font("Helvetica","",10)
|
| 334 |
-
# pdf.multi_cell(0,5, f"Notes: {note}")
|
| 335 |
-
# pdf.ln(2)
|
| 336 |
-
# pdf.output(PDF_PATH)
|
| 337 |
-
# except Exception as e:
|
| 338 |
-
# with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
|
| 339 |
-
# tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
|
| 340 |
-
|
| 341 |
return CSV_PATH, META_PATH, PDF_PATH
|
| 342 |
|
| 343 |
# -------------------------
|
|
@@ -359,10 +320,8 @@ def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
|
|
| 359 |
return df_local, pd.DataFrame(meta_local)
|
| 360 |
|
| 361 |
df, meta_df = load_data()
|
| 362 |
-
|
| 363 |
-
|
| 364 |
# -------------------------
|
| 365 |
-
# Sidebar filters & UI
|
| 366 |
# -------------------------
|
| 367 |
st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
| 368 |
|
|
@@ -370,7 +329,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 370 |
"""Ensure metadata dataframe matches feature count & has required columns."""
|
| 371 |
required_cols = ["feature_name", "source_type", "formula", "remarks"]
|
| 372 |
|
| 373 |
-
# If metadata missing or too short, rebuild it entirely
|
| 374 |
if meta_df is None or len(meta_df) < len(df.columns):
|
| 375 |
meta_df = pd.DataFrame({
|
| 376 |
"feature_name": df.columns,
|
|
@@ -383,14 +341,11 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 383 |
})
|
| 384 |
st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
|
| 385 |
else:
|
| 386 |
-
# Ensure required columns exist
|
| 387 |
for col in required_cols:
|
| 388 |
if col not in meta_df.columns:
|
| 389 |
meta_df[col] = None
|
| 390 |
-
# Fill feature_name if blank or NaN
|
| 391 |
if meta_df["feature_name"].isna().all():
|
| 392 |
meta_df["feature_name"] = df.columns
|
| 393 |
-
# Clip to same number of features (safety)
|
| 394 |
if len(meta_df) > len(df.columns):
|
| 395 |
meta_df = meta_df.iloc[: len(df.columns)]
|
| 396 |
|
|
@@ -398,7 +353,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 398 |
|
| 399 |
meta_df = ensure_feature_metadata(df, meta_df)
|
| 400 |
|
| 401 |
-
# Build sidebar safely
|
| 402 |
feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
|
| 403 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 404 |
|
|
@@ -409,11 +363,9 @@ else:
|
|
| 409 |
|
| 410 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 411 |
|
| 412 |
-
|
| 413 |
# -------------------------
|
| 414 |
-
#
|
| 415 |
# -------------------------
|
| 416 |
-
|
| 417 |
tabs = st.tabs([
|
| 418 |
"Features",
|
| 419 |
"Visualization",
|
|
@@ -426,7 +378,7 @@ tabs = st.tabs([
|
|
| 426 |
"View Logs"
|
| 427 |
])
|
| 428 |
|
| 429 |
-
|
| 430 |
with tabs[0]:
|
| 431 |
st.subheader("Feature metadata")
|
| 432 |
st.dataframe(
|
|
@@ -436,24 +388,18 @@ with tabs[0]:
|
|
| 436 |
)
|
| 437 |
st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
|
| 438 |
|
| 439 |
-
|
| 440 |
-
# ----- Visualize tab
|
| 441 |
with tabs[1]:
|
| 442 |
st.subheader("Feature Visualization")
|
| 443 |
col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
|
| 444 |
bins = st.slider("Histogram bins", 10, 200, 50)
|
| 445 |
|
| 446 |
-
# --- Improved Histogram with style ---
|
| 447 |
fig, ax = plt.subplots(figsize=(8, 4))
|
| 448 |
sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
|
| 449 |
-
ax.set_title(f"Distribution of {col
|
| 450 |
-
ax.set_xlabel(col.replace("_", " ").title(), fontsize=10)
|
| 451 |
-
ax.set_ylabel("Frequency", fontsize=10)
|
| 452 |
-
sns.despine()
|
| 453 |
st.pyplot(fig, clear_figure=True)
|
| 454 |
st.write(df[col].describe().to_frame().T)
|
| 455 |
|
| 456 |
-
# --- Add PCA scatter visualization ---
|
| 457 |
if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
|
| 458 |
st.markdown("### PCA Feature Space — Colored by Operating Mode")
|
| 459 |
fig2, ax2 = plt.subplots(figsize=(6, 5))
|
|
@@ -462,14 +408,9 @@ with tabs[1]:
|
|
| 462 |
x="pca_1", y="pca_2", hue="operating_mode",
|
| 463 |
palette="tab10", alpha=0.7, s=40, ax=ax2
|
| 464 |
)
|
| 465 |
-
ax2.set_title("Operating Mode Clusters (PCA Projection)"
|
| 466 |
-
ax2.set_xlabel("PCA 1")
|
| 467 |
-
ax2.set_ylabel("PCA 2")
|
| 468 |
-
ax2.legend(title="Operating Mode", bbox_to_anchor=(1.05, 1), loc="upper left")
|
| 469 |
-
sns.despine()
|
| 470 |
st.pyplot(fig2, clear_figure=True)
|
| 471 |
|
| 472 |
-
|
| 473 |
# ----- Correlations tab
|
| 474 |
with tabs[2]:
|
| 475 |
st.subheader("Correlation explorer")
|
|
@@ -478,14 +419,9 @@ with tabs[2]:
|
|
| 478 |
if len(corr_sel) >= 2:
|
| 479 |
corr = df[corr_sel].corr()
|
| 480 |
fig, ax = plt.subplots(figsize=(10,8))
|
| 481 |
-
sns.heatmap(
|
| 482 |
-
|
| 483 |
-
linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax
|
| 484 |
-
)
|
| 485 |
-
ax.set_title("Feature Correlation Matrix", fontsize=12)
|
| 486 |
-
sns.despine()
|
| 487 |
st.pyplot(fig, clear_figure=True)
|
| 488 |
-
|
| 489 |
else:
|
| 490 |
st.info("Choose at least 2 numeric features to compute correlation.")
|
| 491 |
|
|
@@ -494,13 +430,10 @@ with tabs[3]:
|
|
| 494 |
st.subheader("Summary statistics (numeric features)")
|
| 495 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 496 |
|
| 497 |
-
|
| 498 |
-
# ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) -----
|
| 499 |
with tabs[4]:
|
| 500 |
-
st.subheader("
|
| 501 |
|
| 502 |
-
# --- Step 0: High-level Use Case (keeps previous defaults) ---
|
| 503 |
-
st.markdown("### Choose Industrial Use Case ")
|
| 504 |
use_case = st.selectbox(
|
| 505 |
"Select Use Case",
|
| 506 |
[
|
|
@@ -516,11 +449,10 @@ with tabs[4]:
|
|
| 516 |
index=1
|
| 517 |
)
|
| 518 |
|
| 519 |
-
# Map use-case -> defaults (same as before)
|
| 520 |
use_case_config = {
|
| 521 |
"Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
|
| 522 |
"EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
|
| 523 |
-
"Casting Quality Optimization": {"target": "surface_temp"
|
| 524 |
"Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
|
| 525 |
"Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
|
| 526 |
"Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
|
|
@@ -531,81 +463,44 @@ with tabs[4]:
|
|
| 531 |
target = cfg["target"]
|
| 532 |
model_hint = cfg["model_hint"]
|
| 533 |
|
| 534 |
-
# --- Feature auto-suggestion (keeps your earlier heuristic) ---
|
| 535 |
suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
|
| 536 |
if len(suggested) < 6:
|
| 537 |
-
suggested = [c for c in numeric_cols if any(k in c for k in ["temp",
|
| 538 |
if len(suggested) < 6:
|
| 539 |
suggested = numeric_cols[:50]
|
| 540 |
|
| 541 |
features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
|
| 542 |
st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
|
| 543 |
|
| 544 |
-
# --- Data sampling controls ---
|
| 545 |
max_rows = min(df.shape[0], 20000)
|
| 546 |
-
sample_size = st.slider("Sample rows
|
| 547 |
-
|
| 548 |
sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 549 |
X = sub_df[features].fillna(0)
|
| 550 |
y = sub_df[target].fillna(0)
|
| 551 |
|
| 552 |
-
# --- Ensemble control UI ---
|
| 553 |
st.markdown("### Ensemble & AutoML Settings")
|
| 554 |
-
max_trials = st.slider("Optuna trials per family
|
| 555 |
-
top_k = st.slider("Max base models
|
| 556 |
-
allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost
|
| 557 |
|
| 558 |
-
|
| 559 |
-
available_models = ["RandomForest", "ExtraTrees"] # always available (sklearn)
|
| 560 |
optional_families = {}
|
| 561 |
if allow_advanced:
|
| 562 |
try:
|
| 563 |
-
import xgboost as xgb
|
| 564 |
-
|
| 565 |
-
available_models.append("XGBoost")
|
| 566 |
-
except Exception:
|
| 567 |
-
optional_families["XGBoost"] = False
|
| 568 |
try:
|
| 569 |
-
import lightgbm as lgb
|
| 570 |
-
|
| 571 |
-
available_models.append("LightGBM")
|
| 572 |
-
except Exception:
|
| 573 |
-
optional_families["LightGBM"] = False
|
| 574 |
-
try:
|
| 575 |
-
import catboost as cb
|
| 576 |
-
optional_families["CatBoost"] = True
|
| 577 |
-
available_models.append("CatBoost")
|
| 578 |
-
except Exception:
|
| 579 |
-
optional_families["CatBoost"] = False
|
| 580 |
-
try:
|
| 581 |
-
# TabPFN is often packaged differently; attempt import but it's optional
|
| 582 |
-
import tabpfn
|
| 583 |
-
optional_families["TabPFN"] = True
|
| 584 |
-
available_models.append("TabPFN")
|
| 585 |
-
except Exception:
|
| 586 |
-
optional_families["TabPFN"] = False
|
| 587 |
try:
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
optional_families["FTTransformer"] = True
|
| 591 |
-
available_models.append("FTTransformer")
|
| 592 |
-
except Exception:
|
| 593 |
-
optional_families["FTTransformer"] = False
|
| 594 |
|
| 595 |
st.markdown(f"Available model families: {', '.join(available_models)}")
|
| 596 |
|
| 597 |
-
# --- Optuna tuning routine per family ---
|
| 598 |
-
import optuna
|
| 599 |
-
from sklearn.model_selection import cross_val_score, KFold
|
| 600 |
-
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
|
| 601 |
-
from sklearn.linear_model import Ridge
|
| 602 |
-
from sklearn.neural_network import MLPRegressor
|
| 603 |
-
from sklearn.metrics import r2_score, mean_squared_error
|
| 604 |
-
|
| 605 |
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
|
| 606 |
-
"""Tune one model family using Optuna
|
| 607 |
def obj(trial):
|
| 608 |
-
# sample hyperparams per family
|
| 609 |
if family_name == "RandomForest":
|
| 610 |
n_estimators = trial.suggest_int("n_estimators", 100, 800)
|
| 611 |
max_depth = trial.suggest_int("max_depth", 4, 30)
|
|
@@ -618,33 +513,21 @@ with tabs[4]:
|
|
| 618 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 619 |
max_depth = trial.suggest_int("max_depth", 3, 12)
|
| 620 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 621 |
-
m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0
|
| 622 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 623 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 624 |
max_depth = trial.suggest_int("max_depth", 3, 16)
|
| 625 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 626 |
-
m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1
|
| 627 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 628 |
iterations = trial.suggest_int("iterations", 200, 1000)
|
| 629 |
depth = trial.suggest_int("depth", 4, 10)
|
| 630 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 631 |
-
m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0
|
| 632 |
-
elif family_name == "MLP":
|
| 633 |
-
hidden = trial.suggest_int("hidden_layer_sizes", 32, 512, log=True)
|
| 634 |
-
lr = trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True)
|
| 635 |
-
m = MLPRegressor(hidden_layer_sizes=(hidden,), learning_rate_init=lr, max_iter=500, random_state=random_state)
|
| 636 |
-
elif family_name == "TabPFN" and optional_families.get("TabPFN"):
|
| 637 |
-
# TabPFN often works without hyperparams exposure; return a surrogate score using quick fit
|
| 638 |
-
# We'll call its predict_proba style API if available; as fallback use a mean score to let stacking consider it.
|
| 639 |
-
# For tuning, just return a placeholder; we'll build model object later.
|
| 640 |
-
return 0.0
|
| 641 |
else:
|
| 642 |
-
|
| 643 |
-
m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1)
|
| 644 |
-
|
| 645 |
-
# use negative RMSE if better for our domain? keep R2 for generality
|
| 646 |
try:
|
| 647 |
-
scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3
|
| 648 |
return float(np.mean(scores))
|
| 649 |
except Exception:
|
| 650 |
return -999.0
|
|
@@ -652,636 +535,232 @@ with tabs[4]:
|
|
| 652 |
study = optuna.create_study(direction="maximize")
|
| 653 |
study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
|
| 654 |
best = study.best_trial.params if study.trials else {}
|
| 655 |
-
# instantiate best model
|
| 656 |
try:
|
| 657 |
if family_name == "RandomForest":
|
| 658 |
-
model = RandomForestRegressor(
|
| 659 |
elif family_name == "ExtraTrees":
|
| 660 |
-
model = ExtraTreesRegressor(
|
| 661 |
elif family_name == "XGBoost" and optional_families.get("XGBoost"):
|
| 662 |
-
model = xgb.XGBRegressor(
|
| 663 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 664 |
-
model = lgb.LGBMRegressor(
|
| 665 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 666 |
-
model = cb.CatBoostRegressor(
|
| 667 |
-
elif family_name == "MLP":
|
| 668 |
-
model = MLPRegressor(hidden_layer_sizes=(best.get("hidden_layer_sizes",128),), learning_rate_init=best.get("learning_rate_init",0.001), max_iter=500, random_state=42)
|
| 669 |
-
elif family_name == "TabPFN" and optional_families.get("TabPFN"):
|
| 670 |
-
# We'll create a small wrapper for TabPFN later on train time
|
| 671 |
-
model = "TabPFN_placeholder"
|
| 672 |
else:
|
| 673 |
-
model = RandomForestRegressor(
|
| 674 |
except Exception:
|
| 675 |
-
model = RandomForestRegressor(
|
| 676 |
|
| 677 |
-
# compute cross-validated score for the best model
|
| 678 |
try:
|
| 679 |
-
score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3
|
| 680 |
except Exception:
|
| 681 |
score = -999.0
|
|
|
|
| 682 |
|
| 683 |
-
return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name, "study": study}
|
| 684 |
-
|
| 685 |
-
# --- Run tuning across available families (user triggered) ---
|
| 686 |
-
if "run_automl_clicked" not in st.session_state:
|
| 687 |
-
st.session_state["run_automl_clicked"] = False
|
| 688 |
-
|
| 689 |
if st.button("Run expanded AutoML + Stacking"):
|
| 690 |
st.session_state["run_automl_clicked"] = True
|
| 691 |
-
|
| 692 |
if st.session_state["run_automl_clicked"]:
|
| 693 |
log("AutoML + Stacking initiated.")
|
| 694 |
-
with st.spinner("Tuning multiple families
|
| 695 |
families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
|
| 696 |
if allow_advanced:
|
| 697 |
if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
|
| 698 |
if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
|
| 699 |
if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
|
| 700 |
-
if optional_families.get("TabPFN"): families_to_try.append("TabPFN")
|
| 701 |
-
if optional_families.get("FTTransformer"): families_to_try.append("FTTransformer")
|
| 702 |
|
| 703 |
tuned_results = []
|
| 704 |
for fam in families_to_try:
|
| 705 |
log(f"Tuning family: {fam}")
|
| 706 |
st.caption(f"Tuning family: {fam}")
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
if isinstance(res, dict) and "model_obj" in res:
|
| 710 |
-
tuned_results.append(res)
|
| 711 |
-
else:
|
| 712 |
-
st.warning(f"Family {fam} returned unexpected tune result: {res}")
|
| 713 |
-
log("All families tuned successfully.")
|
| 714 |
-
|
| 715 |
-
# build leaderboard DataFrame
|
| 716 |
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
|
| 717 |
lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
|
| 718 |
st.markdown("### Tuning Leaderboard (by CV R²)")
|
| 719 |
st.dataframe(lb[["family","cv_r2"]].round(4))
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
ax_perf.set_xlabel("Cross-Validated R² Score", fontsize=10)
|
| 727 |
-
ax_perf.set_ylabel("Model Family", fontsize=10)
|
| 728 |
-
ax_perf.set_title("Performance Comparison Across Model Families", fontsize=12)
|
| 729 |
-
ax_perf.invert_yaxis()
|
| 730 |
-
for i, v in enumerate(lb["cv_r2"]):
|
| 731 |
-
ax_perf.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=9)
|
| 732 |
-
sns.despine()
|
| 733 |
-
st.pyplot(fig_perf, clear_figure=True)
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
# --- Build base-models and collect out-of-fold preds for stacking ---
|
| 737 |
st.markdown("### Building base models & out-of-fold predictions for stacking")
|
| 738 |
-
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 739 |
-
base_models = []
|
| 740 |
-
oof_preds = pd.DataFrame(index=X.index)
|
| 741 |
-
|
| 742 |
-
for idx, row in lb.iterrows():
|
| 743 |
-
fam = row["family"]
|
| 744 |
-
model_entry = next((r for r in tuned_results if r["family"] == fam), None)
|
| 745 |
-
if model_entry is None:
|
| 746 |
-
continue
|
| 747 |
-
model_obj = model_entry["model_obj"]
|
| 748 |
-
# train out-of-fold predictions
|
| 749 |
-
oof = np.zeros(X.shape[0])
|
| 750 |
-
for tr_idx, val_idx in kf.split(X):
|
| 751 |
-
X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
|
| 752 |
-
y_tr = y.iloc[tr_idx]
|
| 753 |
-
# fit family-specific wrapper (TabPFN/FTTransformer special-case)
|
| 754 |
-
if model_obj == "TabPFN_placeholder":
|
| 755 |
-
try:
|
| 756 |
-
# TabPFN expects specific API; create a simple fallback: use RandomForest to approximate
|
| 757 |
-
tmp = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
|
| 758 |
-
tmp.fit(X_tr, y_tr)
|
| 759 |
-
oof[val_idx] = tmp.predict(X_val)
|
| 760 |
-
except Exception:
|
| 761 |
-
oof[val_idx] = np.mean(y_tr)
|
| 762 |
-
else:
|
| 763 |
-
try:
|
| 764 |
-
model_obj.fit(X_tr, y_tr)
|
| 765 |
-
oof[val_idx] = model_obj.predict(X_val)
|
| 766 |
-
except Exception:
|
| 767 |
-
# fallback to mean
|
| 768 |
-
oof[val_idx] = np.mean(y_tr)
|
| 769 |
-
oof_preds[f"{fam}_oof"] = oof
|
| 770 |
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
else:
|
| 778 |
-
model_entry["model_obj"].fit(X, y)
|
| 779 |
-
fitted = model_entry["model_obj"]
|
| 780 |
-
except Exception:
|
| 781 |
-
fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
|
| 782 |
-
fitted.fit(X, y)
|
| 783 |
|
| 784 |
-
|
|
|
|
| 785 |
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
diversity = {col: 1 - corr_matrix[col].drop(col).mean() for col in corr_matrix.columns}
|
| 793 |
-
summary = []
|
| 794 |
-
for bm in base_models:
|
| 795 |
-
col = f"{bm['family']}_oof"
|
| 796 |
-
summary.append({"family": bm["family"], "cv_r2": bm["cv_r2"], "diversity": diversity.get(col, 0.0)})
|
| 797 |
-
summary_df = pd.DataFrame(summary).sort_values(["cv_r2", "diversity"], ascending=[False, False]).reset_index(drop=True)
|
| 798 |
-
st.markdown("### Base Model Summary (cv_r2, diversity)")
|
| 799 |
-
st.dataframe(summary_df.round(4))
|
| 800 |
-
|
| 801 |
-
# select top_k by cv_r2 and diversity combined
|
| 802 |
-
selected = summary_df.sort_values(["cv_r2","diversity"], ascending=[False, False]).head(top_k)["family"].tolist()
|
| 803 |
-
st.markdown(f"Selected for stacking (top {top_k}): {selected}")
|
| 804 |
-
|
| 805 |
-
# build stacking training data (OOF preds for selected)
|
| 806 |
-
selected_cols = [f"{s}_oof" for s in selected]
|
| 807 |
-
X_stack = oof_preds[selected_cols].fillna(0)
|
| 808 |
-
meta = Ridge(alpha=1.0)
|
| 809 |
-
meta.fit(X_stack, y)
|
| 810 |
-
|
| 811 |
-
# --- Robust holdout evaluation & SHAP (safe for deployment) ---
|
| 812 |
-
# Split for holdout
|
| 813 |
-
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 814 |
-
|
| 815 |
-
# Helper to always produce scalar-safe mean
|
| 816 |
-
def scalar_mean(arr):
|
| 817 |
try:
|
| 818 |
-
|
|
|
|
|
|
|
| 819 |
except Exception:
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
shap_vals = shap_vals.reshape(shap_vals.shape[0], -1)
|
| 910 |
-
|
| 911 |
-
# Align SHAP features to DataFrame
|
| 912 |
-
if shap_vals.shape[1] != sample_X.shape[1]:
|
| 913 |
-
min_feats = min(shap_vals.shape[1], sample_X.shape[1])
|
| 914 |
-
shap_vals = shap_vals[:, :min_feats]
|
| 915 |
-
sample_X = sample_X.iloc[:, :min_feats]
|
| 916 |
-
|
| 917 |
-
# Compute robust means
|
| 918 |
-
mean_abs = np.abs(shap_vals).mean(axis=0)
|
| 919 |
-
mean_sign = np.sign(shap_vals).mean(axis=0)
|
| 920 |
-
|
| 921 |
-
importance = pd.DataFrame({
|
| 922 |
-
"Feature": sample_X.columns,
|
| 923 |
-
"Mean |SHAP|": mean_abs,
|
| 924 |
-
"Mean SHAP Sign": mean_sign
|
| 925 |
-
}).sort_values("Mean |SHAP|", ascending=False)
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
# Display Top 5 Drivers
|
| 929 |
-
st.markdown("### Top 5 Operational Drivers Influencing Target")
|
| 930 |
-
st.dataframe(importance.head(5).style.format({"Mean |SHAP|": "{:.3f}", "Mean SHAP Sign": "{:.3f}"}))
|
| 931 |
-
|
| 932 |
-
# Direction-based recommendations
|
| 933 |
-
recommendations = []
|
| 934 |
-
for _, row in importance.head(5).iterrows():
|
| 935 |
-
f = row["Feature"]
|
| 936 |
-
s = row["Mean SHAP Sign"]
|
| 937 |
-
if s > 0.05:
|
| 938 |
-
recommendations.append(f"Increase `{f}` likely increases `{target}`")
|
| 939 |
-
elif s < -0.05:
|
| 940 |
-
recommendations.append(f"Decrease `{f}` likely increases `{target}`")
|
| 941 |
-
else:
|
| 942 |
-
recommendations.append(f" `{f}` is neutral or nonlinear for `{target}`")
|
| 943 |
-
|
| 944 |
-
st.markdown("### Suggested Operator Adjustments (Model-Inferred)")
|
| 945 |
-
st.write("\n".join(recommendations))
|
| 946 |
-
|
| 947 |
-
# Delta recommendations vs previous shift
|
| 948 |
-
prev_shift = df.tail(200).mean(numeric_only=True)
|
| 949 |
-
recommended_shift = prev_shift.copy()
|
| 950 |
-
for rec in recommendations:
|
| 951 |
-
if "Increase" in rec:
|
| 952 |
-
name = rec.split('`')[1]
|
| 953 |
-
if name in recommended_shift:
|
| 954 |
-
recommended_shift[name] *= 1.03 # +3%
|
| 955 |
-
elif "Decrease" in rec:
|
| 956 |
-
name = rec.split('`')[1]
|
| 957 |
-
if name in recommended_shift:
|
| 958 |
-
recommended_shift[name] *= 0.97 # -3%
|
| 959 |
-
|
| 960 |
-
# Delta table
|
| 961 |
-
st.markdown("### Shift Adjustment Summary (vs Previous 200 Samples)")
|
| 962 |
-
deltas = pd.DataFrame({
|
| 963 |
-
"Current Avg": prev_shift,
|
| 964 |
-
"Suggested": recommended_shift,
|
| 965 |
-
"Δ (%)": ((recommended_shift - prev_shift) / prev_shift * 100)
|
| 966 |
-
}).loc[[r.split('`')[1] for r in recommendations if '`' in r]].round(2)
|
| 967 |
-
|
| 968 |
-
st.dataframe(deltas.fillna(0).style.format("{:.2f}"))
|
| 969 |
-
log("Operator advisory system executed successfully.")
|
| 970 |
-
|
| 971 |
-
# Optional: LLM-generated human-friendly summary
|
| 972 |
-
st.markdown("### Natural Language Operator Note")
|
| 973 |
-
try:
|
| 974 |
-
import importlib.util
|
| 975 |
-
if importlib.util.find_spec("transformers"):
|
| 976 |
-
from transformers import pipeline
|
| 977 |
-
tiny_llm_path = os.path.join(LOG_DIR, "cached_tiny_llm")
|
| 978 |
-
if os.path.exists(os.path.join(tiny_llm_path, "config.json")):
|
| 979 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 980 |
-
model = AutoModelForCausalLM.from_pretrained(tiny_llm_path)
|
| 981 |
-
tokenizer = AutoTokenizer.from_pretrained(tiny_llm_path)
|
| 982 |
-
assistant = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
| 983 |
-
else:
|
| 984 |
-
assistant = pipeline("text2text-generation", model="google/flan-t5-small")
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
llm_prompt = f"""
|
| 989 |
-
You are a metallurgical process advisor working in a steel manufacturing unit.
|
| 990 |
-
Based on these recommendations:
|
| 991 |
-
{recommendations}
|
| 992 |
-
and these shift averages:
|
| 993 |
-
{deltas.to_dict(orient='index')}
|
| 994 |
-
Write a concise 3-line message to the operator suggesting what to adjust this shift.
|
| 995 |
-
"""
|
| 996 |
-
resp = assistant(llm_prompt, max_new_tokens=120)[0]["generated_text"]
|
| 997 |
-
st.info(resp)
|
| 998 |
-
log("Operator LLM advisory note generated successfully.")
|
| 999 |
-
else:
|
| 1000 |
-
st.warning("Transformers not available — install it for text generation.")
|
| 1001 |
-
except Exception as e:
|
| 1002 |
-
st.warning(f"LLM advisory generation skipped: {e}")
|
| 1003 |
-
|
| 1004 |
-
else:
|
| 1005 |
-
st.info("No suitable model found for operator advisory system.")
|
| 1006 |
-
except Exception as e:
|
| 1007 |
-
st.error(f"Operator advisory system failed: {e}")
|
| 1008 |
-
log(f"Operator advisory error: {e}")
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
c1, c2 = st.columns(2)
|
| 1013 |
-
c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
|
| 1014 |
-
c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")
|
| 1015 |
-
|
| 1016 |
-
# Scatter comparison
|
| 1017 |
-
fig, ax = plt.subplots(figsize=(7, 4))
|
| 1018 |
-
ax.scatter(y_val, y_meta_pred, alpha=0.6)
|
| 1019 |
-
ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
|
| 1020 |
-
ax.set_xlabel("Actual")
|
| 1021 |
-
ax.set_ylabel("Stacked Predicted")
|
| 1022 |
-
st.pyplot(fig)
|
| 1023 |
-
|
| 1024 |
-
# Save trained stack artifacts
|
| 1025 |
-
joblib.dump(meta, ENSEMBLE_PATH)
|
| 1026 |
-
st.caption(f"Stacked ensemble snapshot updated → {ENSEMBLE_PATH}")
|
| 1027 |
-
log(f"Ensemble model updated for use case: {use_case}")
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
# Explainability
|
| 1031 |
-
st.markdown("### Explainability (approximate)")
|
| 1032 |
-
try:
|
| 1033 |
-
top_base = next((b for b in base_models if b["family"] == selected[0]), None)
|
| 1034 |
-
if top_base and hasattr(top_base["model"], "predict"):
|
| 1035 |
-
sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
|
| 1036 |
-
if any(k in top_base["family"] for k in ["XGBoost", "LightGBM", "RandomForest", "ExtraTrees", "CatBoost"]):
|
| 1037 |
-
expl = shap.TreeExplainer(top_base["model"])
|
| 1038 |
-
shap_vals = expl.shap_values(sample_X)
|
| 1039 |
-
fig_sh = plt.figure(figsize=(8, 6))
|
| 1040 |
-
shap.summary_plot(shap_vals, sample_X, show=False)
|
| 1041 |
-
st.pyplot(fig_sh)
|
| 1042 |
-
else:
|
| 1043 |
-
st.info("Top model not tree-based; skipping SHAP summary.")
|
| 1044 |
else:
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
"
|
| 1055 |
-
"
|
| 1056 |
-
"
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
else:
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
"final_r2": float(final_r2),
|
| 1065 |
-
"final_rmse": float(final_rmse),
|
| 1066 |
-
"target": target,
|
| 1067 |
-
"use_case": use_case
|
| 1068 |
-
})
|
| 1069 |
-
|
| 1070 |
-
# Persist SHAP-based recommendations for reuse across reruns
|
| 1071 |
-
if "shap_recommendations" not in st.session_state:
|
| 1072 |
-
st.session_state["shap_recommendations"] = recommendations
|
| 1073 |
-
else:
|
| 1074 |
-
st.session_state["shap_recommendations"] = recommendations
|
| 1075 |
-
|
| 1076 |
-
# --- AI Recommendation Assistant ---
|
| 1077 |
-
st.markdown("---")
|
| 1078 |
-
st.subheader("AI Recommendation Assistant ")
|
| 1079 |
-
st.caption("Generates quick local AI suggestions — no file writes required.")
|
| 1080 |
-
|
| 1081 |
-
# Create or reset button states safely
|
| 1082 |
-
if "hf_clicked" not in st.session_state:
|
| 1083 |
-
st.session_state["hf_clicked"] = False
|
| 1084 |
-
if "llm_result" not in st.session_state:
|
| 1085 |
-
st.session_state["llm_result"] = None
|
| 1086 |
-
|
| 1087 |
-
# --- Buttons ---
|
| 1088 |
-
col1, col2 = st.columns(2)
|
| 1089 |
-
# Click handlers with isolated session flags
|
| 1090 |
-
if col1.button("Get AI Recommendation (via HF API)", key="ai_reco"):
|
| 1091 |
-
st.session_state["hf_clicked"] = True
|
| 1092 |
-
st.session_state["hf_ran_once"] = False # reset internal control
|
| 1093 |
-
|
| 1094 |
-
if col2.button("Reset Recommendation Output"):
|
| 1095 |
-
st.session_state["hf_clicked"] = False
|
| 1096 |
-
st.session_state["llm_result"] = None
|
| 1097 |
-
st.session_state["hf_ran_once"] = False
|
| 1098 |
-
st.info("Recommendation output cleared.")
|
| 1099 |
-
|
| 1100 |
-
# Execute API call only once
|
| 1101 |
-
if st.session_state["hf_clicked"] and not st.session_state.get("hf_ran_once", False):
|
| 1102 |
-
summary = st.session_state.get("automl_summary", {})
|
| 1103 |
-
if not summary:
|
| 1104 |
-
st.warning("Please run AutoML first to generate context.")
|
| 1105 |
-
else:
|
| 1106 |
-
try:
|
| 1107 |
-
import requests, json
|
| 1108 |
-
st.info("Contacting Hugging Face Inference API (Mixtral-8x7B-Instruct)…")
|
| 1109 |
-
|
| 1110 |
-
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
|
| 1111 |
-
headers = {"Authorization": f"Bearer {st.secrets['HF_TOKEN']}"}
|
| 1112 |
-
prompt = f"""
|
| 1113 |
-
You are an ML model tuning advisor.
|
| 1114 |
-
Based on this AutoML summary, suggest 3 concise, actionable steps
|
| 1115 |
-
to improve model performance if overfitting, underfitting, or data-quality issues are observed.
|
| 1116 |
-
|
| 1117 |
-
Use case: {summary.get('use_case')}
|
| 1118 |
-
Target: {summary.get('target')}
|
| 1119 |
-
Final R²: {summary.get('final_r2')}
|
| 1120 |
-
Final RMSE: {summary.get('final_rmse')}
|
| 1121 |
-
Leaderboard: {summary.get('leaderboard')}
|
| 1122 |
-
"""
|
| 1123 |
-
|
| 1124 |
-
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.7}}
|
| 1125 |
-
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
|
| 1126 |
-
response.raise_for_status()
|
| 1127 |
-
result = response.json()
|
| 1128 |
-
|
| 1129 |
-
if isinstance(result, list) and "generated_text" in result[0]:
|
| 1130 |
-
text = result[0]["generated_text"]
|
| 1131 |
-
elif isinstance(result, dict) and "generated_text" in result:
|
| 1132 |
-
text = result["generated_text"]
|
| 1133 |
-
else:
|
| 1134 |
-
text = json.dumps(result, indent=2)
|
| 1135 |
-
|
| 1136 |
-
st.session_state["llm_result"] = text.strip()
|
| 1137 |
-
st.session_state["hf_ran_once"] = True
|
| 1138 |
-
st.success("✅ AI Recommendation (Mixtral-8x7B-Instruct):")
|
| 1139 |
-
st.markdown(st.session_state["llm_result"])
|
| 1140 |
-
|
| 1141 |
-
except Exception as e:
|
| 1142 |
-
st.error(f"HF Inference API call failed: {e}")
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
# --- Always display cached result, even on rerun ---
|
| 1147 |
-
if st.session_state["llm_result"]:
|
| 1148 |
-
st.markdown("### Cached AI Recommendation:")
|
| 1149 |
-
st.markdown(st.session_state["llm_result"])
|
| 1150 |
|
| 1151 |
-
|
| 1152 |
-
# ----- Target & Business Impact tab
|
| 1153 |
with tabs[5]:
|
| 1154 |
-
st.subheader("
|
| 1155 |
-
st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
|
| 1156 |
-
|
| 1157 |
target_table = pd.DataFrame([
|
| 1158 |
-
["
|
| 1159 |
-
["
|
| 1160 |
-
["
|
| 1161 |
-
["
|
| 1162 |
-
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
|
| 1167 |
-
st.dataframe(target_table, width="stretch")
|
| 1168 |
-
|
| 1169 |
-
st.markdown("---")
|
| 1170 |
-
st.subheader("Business Framing for Clients")
|
| 1171 |
-
st.markdown("These metrics show approximate annual benefits from small process improvements.")
|
| 1172 |
-
|
| 1173 |
-
business_table = pd.DataFrame([
|
| 1174 |
-
["Energy consumption", "400 kWh/ton", "₹35–60 L"],
|
| 1175 |
-
["Electrode wear", "1.8 kg/ton", "₹10 L"],
|
| 1176 |
-
["Refractory wear", "3 mm/heat", "₹15 L"],
|
| 1177 |
-
["Oxygen usage", "40 Nm³/ton", "₹20 L"],
|
| 1178 |
-
["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
|
| 1179 |
-
], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
|
| 1180 |
-
|
| 1181 |
-
st.dataframe(business_table, width="stretch")
|
| 1182 |
-
st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
|
| 1183 |
-
|
| 1184 |
-
# ----- Bibliography tab
|
| 1185 |
with tabs[6]:
|
| 1186 |
-
st.subheader("Annotated Bibliography
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
bib_data = [
|
| 1193 |
-
{
|
| 1194 |
-
"title": "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
|
| 1195 |
-
"authors": "Yan et al. (2024)",
|
| 1196 |
-
"notes": "Soft sensors for furnace and tap temperature; validates `furnace_temp` and `tap_temp` targets.",
|
| 1197 |
-
"url": "https://doi.org/10.1021/acsomega.4c01254"
|
| 1198 |
-
},
|
| 1199 |
-
{
|
| 1200 |
-
"title": "Optimisation of Operator Support Systems through Artificial Intelligence for the Cast Steel Industry",
|
| 1201 |
-
"authors": "Ojeda Roldán et al. (2022)",
|
| 1202 |
-
"notes": "Reinforcement learning for oxygen blowing and endpoint control; supports temperature and carbon targets.",
|
| 1203 |
-
"url": "https://doi.org/10.3390/jmmp6020034"
|
| 1204 |
-
},
|
| 1205 |
-
{
|
| 1206 |
-
"title": "Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking",
|
| 1207 |
-
"authors": "Zhuo et al. (2024)",
|
| 1208 |
-
"notes": "Links arc power, temperature, and energy KPIs — validates `energy_efficiency` and `power_density`.",
|
| 1209 |
-
"url": "https://doi.org/10.3390/met15010113"
|
| 1210 |
-
},
|
| 1211 |
-
{
|
| 1212 |
-
"title": "Dynamic EAF Modeling and Slag Foaming Index Prediction",
|
| 1213 |
-
"authors": "MacRosty et al.",
|
| 1214 |
-
"notes": "Supports refractory and heat-flux-based wear prediction — validates `lining_thickness` target.",
|
| 1215 |
-
"url": "https://www.sciencedirect.com/science/article/pii/S0921883123004019"
|
| 1216 |
-
},
|
| 1217 |
-
{
|
| 1218 |
-
"title": "Machine Learning for Yield Optimization in Continuous Casting",
|
| 1219 |
-
"authors": "Springer (2023)",
|
| 1220 |
-
"notes": "ML for yield ratio and defect minimization; supports `yield_ratio` target.",
|
| 1221 |
-
"url": "https://link.springer.com/article/10.1007/s40964-023-00592-7"
|
| 1222 |
-
}
|
| 1223 |
]
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
for
|
| 1231 |
-
|
| 1232 |
-
f"**[{row['title']}]({row['url']})** \n"
|
| 1233 |
-
f"*{row['authors']}* \n"
|
| 1234 |
-
f" _{row['notes']}_ \n",
|
| 1235 |
-
unsafe_allow_html=True
|
| 1236 |
-
)
|
| 1237 |
-
st.info("Click any paper title above to open it in a new tab.")
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
st.markdown("""
|
| 1241 |
-
**Feature ↔ Target Justification**
|
| 1242 |
-
- `furnace_temp`, `tap_temp` → Process temperature (Yan 2024, Ojeda 2022)
|
| 1243 |
-
- `yield_ratio` → Production yield (Springer 2023)
|
| 1244 |
-
- `energy_efficiency`, `power_density` → Energy KPIs (Zhuo 2024)
|
| 1245 |
-
- `lining_thickness`, `slag_foaming_index` → Refractory & process health (MacRosty et al.)
|
| 1246 |
-
""")
|
| 1247 |
-
|
| 1248 |
-
st.info("Click any paper title above to open it in a new tab.")
|
| 1249 |
-
log("Bibliography tab rendered successfully.")
|
| 1250 |
-
|
| 1251 |
-
# -------------------------
|
| 1252 |
-
# Footer / Notes
|
| 1253 |
-
# -------------------------
|
| 1254 |
-
st.markdown("---")
|
| 1255 |
-
st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
# ----- Download tab
|
| 1259 |
-
with tabs[-2]:
|
| 1260 |
-
st.subheader(" Download Saved Files (Flat Log Mode)")
|
| 1261 |
-
|
| 1262 |
-
available_files = [f for f in os.listdir(LOG_DIR) if os.path.isfile(os.path.join(LOG_DIR, f))]
|
| 1263 |
-
if not available_files:
|
| 1264 |
-
st.info("No files found yet — run AutoML once to generate outputs.")
|
| 1265 |
else:
|
| 1266 |
-
for f in sorted(
|
| 1267 |
path = os.path.join(LOG_DIR, f)
|
| 1268 |
-
with open(path,
|
| 1269 |
-
st.download_button(
|
| 1270 |
-
label=f" Download {f}",
|
| 1271 |
-
data=fp,
|
| 1272 |
-
file_name=f,
|
| 1273 |
-
mime="application/octet-stream"
|
| 1274 |
-
)
|
| 1275 |
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
with tabs[-1]:
|
| 1280 |
-
st.subheader(" Master Log (append-in-place)")
|
| 1281 |
if os.path.exists(LOG_PATH):
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
st.
|
| 1285 |
-
st.download_button("Download Log", content, file_name="run_master.log")
|
| 1286 |
else:
|
| 1287 |
-
st.info("No
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sail_modex_stable.py
|
|
|
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import time
|
|
|
|
| 11 |
import joblib
|
| 12 |
import zipfile
|
| 13 |
import io
|
| 14 |
+
import gc
|
| 15 |
|
| 16 |
# ML imports
|
| 17 |
from sklearn.model_selection import train_test_split
|
| 18 |
+
from sklearn.linear_model import LinearRegression, Ridge
|
| 19 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
|
| 20 |
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
|
| 21 |
from sklearn.decomposition import PCA
|
|
|
|
| 25 |
# SHAP
|
| 26 |
import shap
|
| 27 |
|
| 28 |
+
# Optuna (used later)
|
| 29 |
+
import optuna
|
| 30 |
+
from sklearn.model_selection import cross_val_score, KFold
|
| 31 |
+
from sklearn.neural_network import MLPRegressor
|
| 32 |
|
| 33 |
# --- Safe defaults for Streamlit session state ---
|
| 34 |
defaults = {
|
|
|
|
| 42 |
for k, v in defaults.items():
|
| 43 |
st.session_state.setdefault(k, v)
|
| 44 |
|
|
|
|
| 45 |
if "llm_result" not in st.session_state:
|
| 46 |
st.session_state["llm_result"] = None
|
| 47 |
if "automl_summary" not in st.session_state:
|
|
|
|
| 54 |
# -------------------------
|
| 55 |
# Config & paths
|
| 56 |
# -------------------------
|
|
|
|
| 57 |
st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
|
| 58 |
plt.style.use("seaborn-v0_8-muted")
|
| 59 |
sns.set_palette("muted")
|
|
|
|
| 81 |
f.write(f"[{stamp}] {msg}\n")
|
| 82 |
print(msg)
|
| 83 |
|
|
|
|
| 84 |
log("=== Streamlit session started ===")
|
| 85 |
|
|
|
|
|
|
|
| 86 |
if os.path.exists("/data"):
|
| 87 |
st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
|
| 88 |
else:
|
| 89 |
st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
|
| 90 |
|
|
|
|
| 91 |
# -------------------------
|
| 92 |
# Utility: generate advanced dataset if missing
|
| 93 |
# -------------------------
|
|
|
|
| 102 |
Generates a large synthetic, physics-aligned dataset with many engineered features.
|
| 103 |
Allows control of variability per feature (through variance_overrides) or globally
|
| 104 |
(via global_variance_multiplier).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
"""
|
| 106 |
np.random.seed(random_seed)
|
| 107 |
os.makedirs(LOG_DIR, exist_ok=True)
|
|
|
|
| 298 |
existing = [meta_entry]
|
| 299 |
json.dump(existing, open(META_PATH, "w"), indent=2)
|
| 300 |
|
|
|
|
| 301 |
PDF_PATH = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
return CSV_PATH, META_PATH, PDF_PATH
|
| 303 |
|
| 304 |
# -------------------------
|
|
|
|
| 320 |
return df_local, pd.DataFrame(meta_local)
|
| 321 |
|
| 322 |
df, meta_df = load_data()
|
|
|
|
|
|
|
| 323 |
# -------------------------
|
| 324 |
+
# Sidebar filters & UI
|
| 325 |
# -------------------------
|
| 326 |
st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
| 327 |
|
|
|
|
| 329 |
"""Ensure metadata dataframe matches feature count & has required columns."""
|
| 330 |
required_cols = ["feature_name", "source_type", "formula", "remarks"]
|
| 331 |
|
|
|
|
| 332 |
if meta_df is None or len(meta_df) < len(df.columns):
|
| 333 |
meta_df = pd.DataFrame({
|
| 334 |
"feature_name": df.columns,
|
|
|
|
| 341 |
})
|
| 342 |
st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
|
| 343 |
else:
|
|
|
|
| 344 |
for col in required_cols:
|
| 345 |
if col not in meta_df.columns:
|
| 346 |
meta_df[col] = None
|
|
|
|
| 347 |
if meta_df["feature_name"].isna().all():
|
| 348 |
meta_df["feature_name"] = df.columns
|
|
|
|
| 349 |
if len(meta_df) > len(df.columns):
|
| 350 |
meta_df = meta_df.iloc[: len(df.columns)]
|
| 351 |
|
|
|
|
| 353 |
|
| 354 |
meta_df = ensure_feature_metadata(df, meta_df)
|
| 355 |
|
|
|
|
| 356 |
feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
|
| 357 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 358 |
|
|
|
|
| 363 |
|
| 364 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 365 |
|
|
|
|
| 366 |
# -------------------------
|
| 367 |
+
# Tabs layout
|
| 368 |
# -------------------------
|
|
|
|
| 369 |
tabs = st.tabs([
|
| 370 |
"Features",
|
| 371 |
"Visualization",
|
|
|
|
| 378 |
"View Logs"
|
| 379 |
])
|
| 380 |
|
| 381 |
+
# ----- Feature metadata
|
| 382 |
with tabs[0]:
|
| 383 |
st.subheader("Feature metadata")
|
| 384 |
st.dataframe(
|
|
|
|
| 388 |
)
|
| 389 |
st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
|
| 390 |
|
| 391 |
+
# ----- Visualization tab
|
|
|
|
| 392 |
with tabs[1]:
|
| 393 |
st.subheader("Feature Visualization")
|
| 394 |
col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
|
| 395 |
bins = st.slider("Histogram bins", 10, 200, 50)
|
| 396 |
|
|
|
|
| 397 |
fig, ax = plt.subplots(figsize=(8, 4))
|
| 398 |
sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
|
| 399 |
+
ax.set_title(f"Distribution of {col}", fontsize=12)
|
|
|
|
|
|
|
|
|
|
| 400 |
st.pyplot(fig, clear_figure=True)
|
| 401 |
st.write(df[col].describe().to_frame().T)
|
| 402 |
|
|
|
|
| 403 |
if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
|
| 404 |
st.markdown("### PCA Feature Space — Colored by Operating Mode")
|
| 405 |
fig2, ax2 = plt.subplots(figsize=(6, 5))
|
|
|
|
| 408 |
x="pca_1", y="pca_2", hue="operating_mode",
|
| 409 |
palette="tab10", alpha=0.7, s=40, ax=ax2
|
| 410 |
)
|
| 411 |
+
ax2.set_title("Operating Mode Clusters (PCA Projection)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
st.pyplot(fig2, clear_figure=True)
|
| 413 |
|
|
|
|
| 414 |
# ----- Correlations tab
|
| 415 |
with tabs[2]:
|
| 416 |
st.subheader("Correlation explorer")
|
|
|
|
| 419 |
if len(corr_sel) >= 2:
|
| 420 |
corr = df[corr_sel].corr()
|
| 421 |
fig, ax = plt.subplots(figsize=(10,8))
|
| 422 |
+
sns.heatmap(corr, cmap="RdBu_r", center=0, annot=True, fmt=".2f",
|
| 423 |
+
linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
st.pyplot(fig, clear_figure=True)
|
|
|
|
| 425 |
else:
|
| 426 |
st.info("Choose at least 2 numeric features to compute correlation.")
|
| 427 |
|
|
|
|
| 430 |
st.subheader("Summary statistics (numeric features)")
|
| 431 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 432 |
|
| 433 |
+
# ----- AutoML + SHAP tab (Expanded)
|
|
|
|
| 434 |
with tabs[4]:
|
| 435 |
+
st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
|
| 436 |
|
|
|
|
|
|
|
| 437 |
use_case = st.selectbox(
|
| 438 |
"Select Use Case",
|
| 439 |
[
|
|
|
|
| 449 |
index=1
|
| 450 |
)
|
| 451 |
|
|
|
|
| 452 |
use_case_config = {
|
| 453 |
"Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
|
| 454 |
"EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
|
| 455 |
+
"Casting Quality Optimization": {"target": "surface_temp", "model_hint": "GradientBoosting"},
|
| 456 |
"Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
|
| 457 |
"Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
|
| 458 |
"Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
|
|
|
|
| 463 |
target = cfg["target"]
|
| 464 |
model_hint = cfg["model_hint"]
|
| 465 |
|
|
|
|
| 466 |
suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
|
| 467 |
if len(suggested) < 6:
|
| 468 |
+
suggested = [c for c in numeric_cols if any(k in c for k in ["temp","power","energy","pressure","yield"])]
|
| 469 |
if len(suggested) < 6:
|
| 470 |
suggested = numeric_cols[:50]
|
| 471 |
|
| 472 |
features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
|
| 473 |
st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
|
| 474 |
|
|
|
|
| 475 |
max_rows = min(df.shape[0], 20000)
|
| 476 |
+
sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
|
|
|
|
| 477 |
sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 478 |
X = sub_df[features].fillna(0)
|
| 479 |
y = sub_df[target].fillna(0)
|
| 480 |
|
|
|
|
| 481 |
st.markdown("### Ensemble & AutoML Settings")
|
| 482 |
+
max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
|
| 483 |
+
top_k = st.slider("Max base models in ensemble", 2, 8, 5)
|
| 484 |
+
allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost)", value=True)
|
| 485 |
|
| 486 |
+
available_models = ["RandomForest", "ExtraTrees"]
|
|
|
|
| 487 |
optional_families = {}
|
| 488 |
if allow_advanced:
|
| 489 |
try:
|
| 490 |
+
import xgboost as xgb; optional_families["XGBoost"] = True; available_models.append("XGBoost")
|
| 491 |
+
except Exception: optional_families["XGBoost"] = False
|
|
|
|
|
|
|
|
|
|
| 492 |
try:
|
| 493 |
+
import lightgbm as lgb; optional_families["LightGBM"] = True; available_models.append("LightGBM")
|
| 494 |
+
except Exception: optional_families["LightGBM"] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
try:
|
| 496 |
+
import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
|
| 497 |
+
except Exception: optional_families["CatBoost"] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
st.markdown(f"Available model families: {', '.join(available_models)}")
|
| 500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
|
| 502 |
+
"""Tune one model family using Optuna."""
|
| 503 |
def obj(trial):
|
|
|
|
| 504 |
if family_name == "RandomForest":
|
| 505 |
n_estimators = trial.suggest_int("n_estimators", 100, 800)
|
| 506 |
max_depth = trial.suggest_int("max_depth", 4, 30)
|
|
|
|
| 513 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 514 |
max_depth = trial.suggest_int("max_depth", 3, 12)
|
| 515 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 516 |
+
m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0)
|
| 517 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 518 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 519 |
max_depth = trial.suggest_int("max_depth", 3, 16)
|
| 520 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 521 |
+
m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1)
|
| 522 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 523 |
iterations = trial.suggest_int("iterations", 200, 1000)
|
| 524 |
depth = trial.suggest_int("depth", 4, 10)
|
| 525 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 526 |
+
m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
else:
|
| 528 |
+
m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state)
|
|
|
|
|
|
|
|
|
|
| 529 |
try:
|
| 530 |
+
scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3)
|
| 531 |
return float(np.mean(scores))
|
| 532 |
except Exception:
|
| 533 |
return -999.0
|
|
|
|
| 535 |
study = optuna.create_study(direction="maximize")
|
| 536 |
study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
|
| 537 |
best = study.best_trial.params if study.trials else {}
|
|
|
|
| 538 |
try:
|
| 539 |
if family_name == "RandomForest":
|
| 540 |
+
model = RandomForestRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 541 |
elif family_name == "ExtraTrees":
|
| 542 |
+
model = ExtraTreesRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 543 |
elif family_name == "XGBoost" and optional_families.get("XGBoost"):
|
| 544 |
+
model = xgb.XGBRegressor(**{**{"verbosity":0,"tree_method":"hist"}, **best})
|
| 545 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 546 |
+
model = lgb.LGBMRegressor(**{**{"n_jobs":1}, **best})
|
| 547 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 548 |
+
model = cb.CatBoostRegressor(**{**{"verbose":0}, **best})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
else:
|
| 550 |
+
model = RandomForestRegressor(random_state=42)
|
| 551 |
except Exception:
|
| 552 |
+
model = RandomForestRegressor(random_state=42)
|
| 553 |
|
|
|
|
| 554 |
try:
|
| 555 |
+
score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3)))
|
| 556 |
except Exception:
|
| 557 |
score = -999.0
|
| 558 |
+
return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name}
|
| 559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
if st.button("Run expanded AutoML + Stacking"):
|
| 561 |
st.session_state["run_automl_clicked"] = True
|
| 562 |
+
|
| 563 |
if st.session_state["run_automl_clicked"]:
|
| 564 |
log("AutoML + Stacking initiated.")
|
| 565 |
+
with st.spinner("Tuning multiple families..."):
|
| 566 |
families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
|
| 567 |
if allow_advanced:
|
| 568 |
if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
|
| 569 |
if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
|
| 570 |
if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
|
|
|
|
|
|
|
| 571 |
|
| 572 |
tuned_results = []
|
| 573 |
for fam in families_to_try:
|
| 574 |
log(f"Tuning family: {fam}")
|
| 575 |
st.caption(f"Tuning family: {fam}")
|
| 576 |
+
tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
|
| 577 |
+
# --- Leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
|
| 579 |
lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
|
| 580 |
st.markdown("### Tuning Leaderboard (by CV R²)")
|
| 581 |
st.dataframe(lb[["family","cv_r2"]].round(4))
|
| 582 |
+
|
| 583 |
+
# --- Enhanced Ensemble Stacking ---
|
| 584 |
+
from sklearn.feature_selection import SelectKBest, f_regression
|
| 585 |
+
from sklearn.linear_model import LinearRegression
|
| 586 |
+
from sklearn.model_selection import KFold
|
| 587 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
st.markdown("### Building base models & out-of-fold predictions for stacking")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
|
| 590 |
+
scaler = StandardScaler()
|
| 591 |
+
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
|
| 592 |
+
selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
|
| 593 |
+
X_sel = selector.fit_transform(X_scaled, y)
|
| 594 |
+
selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
|
| 595 |
+
X_sel = pd.DataFrame(X_sel, columns=selected_feature_names)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
+
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 598 |
+
base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
|
| 599 |
|
| 600 |
+
for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj")]:
|
| 601 |
+
model_obj = entry["model_obj"]
|
| 602 |
+
oof = np.zeros(X_sel.shape[0])
|
| 603 |
+
for tr_idx, val_idx in kf.split(X_sel):
|
| 604 |
+
X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
|
| 605 |
+
y_tr = y.iloc[tr_idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
try:
|
| 607 |
+
model_obj.fit(X_tr, y_tr)
|
| 608 |
+
preds = model_obj.predict(X_val)
|
| 609 |
+
oof[val_idx] = preds
|
| 610 |
except Exception:
|
| 611 |
+
oof[val_idx] = np.mean(y_tr)
|
| 612 |
+
oof_preds[f"{fam}_oof"] = oof
|
| 613 |
+
model_obj.fit(X_sel, y)
|
| 614 |
+
base_models.append({"family": fam, "model": model_obj})
|
| 615 |
+
|
| 616 |
+
if oof_preds.empty:
|
| 617 |
+
st.error("No base models built.")
|
| 618 |
+
st.stop()
|
| 619 |
+
|
| 620 |
+
corr = oof_preds.corr().abs()
|
| 621 |
+
div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
|
| 622 |
+
cv_r2_est = {c: r2_score(y, oof_preds[c]) for c in oof_preds.columns}
|
| 623 |
+
|
| 624 |
+
summary_df = pd.DataFrame({
|
| 625 |
+
"family": [c.replace("_oof","") for c in oof_preds.columns],
|
| 626 |
+
"cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
|
| 627 |
+
"diversity": [div[c] for c in oof_preds.columns]
|
| 628 |
+
}).sort_values(["cv_r2","diversity"], ascending=[False,False])
|
| 629 |
+
|
| 630 |
+
st.dataframe(summary_df.round(4))
|
| 631 |
+
selected = summary_df.head(top_k)["family"].tolist()
|
| 632 |
+
st.markdown(f"Selected for stacking (top {top_k}): {selected}")
|
| 633 |
+
|
| 634 |
+
meta = LinearRegression(positive=True)
|
| 635 |
+
X_stack = oof_preds[[f"{s}_oof" for s in selected]].fillna(0)
|
| 636 |
+
meta.fit(X_stack, y)
|
| 637 |
+
|
| 638 |
+
X_tr, X_val, y_tr, y_val = train_test_split(X_sel, y, test_size=0.2, random_state=42)
|
| 639 |
+
meta_inputs = []
|
| 640 |
+
for fam in selected:
|
| 641 |
+
mdl = next((b["model"] for b in base_models if b["family"] == fam), None)
|
| 642 |
+
preds = mdl.predict(X_val) if mdl else np.full(len(X_val), np.mean(y_tr))
|
| 643 |
+
meta_inputs.append(np.ravel(preds))
|
| 644 |
+
X_meta_val = pd.DataFrame(np.column_stack(meta_inputs), columns=X_stack.columns)
|
| 645 |
+
y_meta_pred = meta.predict(X_meta_val)
|
| 646 |
+
|
| 647 |
+
final_r2 = r2_score(y_val, y_meta_pred)
|
| 648 |
+
final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
|
| 649 |
+
st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")
|
| 650 |
+
|
| 651 |
+
fig, ax = plt.subplots(figsize=(7,4))
|
| 652 |
+
ax.scatter(y_val, y_meta_pred, alpha=0.7)
|
| 653 |
+
ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
|
| 654 |
+
st.pyplot(fig, clear_figure=True)
|
| 655 |
+
|
| 656 |
+
st.session_state["automl_summary"] = {
|
| 657 |
+
"leaderboard": summary_df[["family","cv_r2"]].to_dict(orient="records"),
|
| 658 |
+
"final_r2": float(final_r2),
|
| 659 |
+
"final_rmse": float(final_rmse),
|
| 660 |
+
"target": target,
|
| 661 |
+
"use_case": use_case
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
# --- Operator Advisory System + Llama-3-70B-Instruct ---
st.markdown("---")
st.subheader("Operator Advisory System — Real-Time Shift Recommendations")

try:
    # Pick the top-ranked base model (first family in `selected`) for SHAP analysis.
    top_base = next((b for b in base_models if b["family"] == selected[0]), None)
    if top_base and hasattr(top_base["model"], "predict"):
        # Cap the explainer sample at 300 rows to keep SHAP fast in the UI.
        sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
        model = top_base["model"]
        expl = shap.TreeExplainer(model)
        shap_vals = expl.shap_values(sample_X)
        # Multi-output tree models return a list of arrays; use the first output.
        if isinstance(shap_vals, list):
            shap_vals = shap_vals[0]
        shap_vals = np.array(shap_vals)
        # Magnitude ranks the drivers; mean sign gives the dominant direction.
        mean_abs = np.abs(shap_vals).mean(axis=0)
        mean_sign = np.sign(shap_vals).mean(axis=0)
        importance = pd.DataFrame({
            "Feature": sample_X.columns,
            "Mean |SHAP|": mean_abs,
            "Mean SHAP Sign": mean_sign
        }).sort_values("Mean |SHAP|", ascending=False)

        st.markdown("### Top 5 Operational Drivers")
        st.dataframe(importance.head(5))

        # Translate each driver's SHAP direction into plain-language shift advice.
        # The ±0.05 dead band treats weak/mixed signs as "neutral".
        recommendations = []
        for _, row in importance.head(5).iterrows():
            feat, sign = row["Feature"], row["Mean SHAP Sign"]
            if sign > 0.05:
                recommendations.append(f"Increase `{feat}` likely increases `{target}`")
            elif sign < -0.05:
                recommendations.append(f"Decrease `{feat}` likely increases `{target}`")
            else:
                recommendations.append(f"`{feat}` neutral for `{target}`")
        st.markdown("### Suggested Operator Adjustments")
        st.write("\n".join(recommendations))

        # --- Call HF Llama-3-70B-Instruct API for summary ---
        import requests
        HF_TOKEN = st.secrets.get("HF_TOKEN", os.getenv("HF_TOKEN"))
        if not HF_TOKEN:
            st.error("HF_TOKEN not found in secrets or environment.")
        else:
            API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-70B-Instruct"
            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
            prompt = f"""
You are an expert metallurgical process advisor.
Based on these recommendations:
{recommendations}
Target: {target}
Use case: {use_case}
Summarize in three concise, professional lines what the operator should do this shift.
"""
            payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
            with st.spinner("Generating operator note (Llama-3-70B)…"):
                resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
                # FIX: the original did resp.json()[0].get(...) unconditionally.
                # The HF inference API returns a dict (e.g. {"error": ...}) on
                # failure, which made that indexing raise TypeError/KeyError and
                # abort the whole advisory. Check the HTTP status and the
                # response shape before extracting the generated text.
                resp.raise_for_status()
                body = resp.json()
                if isinstance(body, list) and body:
                    text = body[0].get("generated_text", "").strip()
                else:
                    text = str(body)
                st.info(text)
    else:
        st.info("No suitable base model found.")
except Exception as e:
    # Advisory is best-effort: never let an explainability/API failure crash the app.
    st.warning(f"Operator advisory skipped: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
|
| 721 |
+
# ----- Business Impact tab
with tabs[5]:
    st.subheader("Business Impact Metrics")
    # Static reference table: which target variable each use case optimizes
    # and the estimated annual business leverage for that lever.
    impact_rows = [
        ["EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable", "₹20–60 L/year"],
        ["Casting Optimization", "surface_temp / cooling_water_temp", "Controls billet quality", "₹50 L/year"],
        ["Rolling Mill", "energy_efficiency", "Energy optimization", "₹5–10 L/year"],
        ["Refractory Loss Prediction", "lining_thickness / heat_loss_rate", "Wear and downtime", "₹40 L/year"],
    ]
    impact_cols = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    target_table = pd.DataFrame(impact_rows, columns=impact_cols)
    st.dataframe(target_table, width="stretch")
|
| 731 |
+
|
| 732 |
+
# ----- Bibliography tab
with tabs[6]:
    st.subheader("Annotated Bibliography")
    # Each entry is (title, authors, one-line relevance note, URL).
    refs = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)",
         "Soft sensors validate `furnace_temp` and `tap_temp`.",
         "https://doi.org/10.1021/acsomega.4c01254"),
        ("Optimisation of Operator Support Systems", "Ojeda Roldán et al. (2022)",
         "Reinforcement learning for endpoint control.",
         "https://doi.org/10.3390/jmmp6020034"),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking", "Zhuo et al. (2024)",
         "Links arc power and energy KPIs.",
         "https://doi.org/10.3390/met15010113"),
        ("Dynamic EAF Modeling and Slag Foaming Index Prediction", "MacRosty et al.",
         "Supports refractory wear modeling.",
         "https://www.sciencedirect.com/science/article/pii/S0921883123004019"),
    ]
    # Render each reference as a linked title, authors in italics, note beneath.
    for title, authors, note, url in refs:
        st.markdown(f"**[{title}]({url})** — *{authors}* \n_{note}_")
|
| 743 |
+
|
| 744 |
+
# ----- Download tab
with tabs[7]:
    st.subheader("Download Saved Files")
    # Offer every regular file in the log/artifact directory for download.
    files = [f for f in os.listdir(LOG_DIR) if os.path.isfile(os.path.join(LOG_DIR, f))]
    if not files:
        st.info("No files yet — run AutoML first.")
    else:
        for f in sorted(files):
            path = os.path.join(LOG_DIR, f)
            # FIX: the original handed Streamlit the open file object and then
            # left the `with` block, closing the handle the widget still held.
            # Read the bytes up front instead, and give each button an explicit
            # unique key so repeated renders can never collide.
            with open(path, "rb") as fp:
                data = fp.read()
            st.download_button(f"Download {f}", data, file_name=f, key=f"dl_{f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
|
| 755 |
+
# ----- Logs tab
with tabs[8]:
    st.subheader("Master Log")
    if os.path.exists(LOG_PATH):
        # FIX: the original used open(LOG_PATH).read(), which leaks the file
        # handle (closed only by GC) and relies on the platform default
        # encoding. Use a context manager and explicit UTF-8; `errors="replace"`
        # keeps the viewer working even if a log line is mis-encoded.
        with open(LOG_PATH, "r", encoding="utf-8", errors="replace") as log_file:
            txt = log_file.read()
        st.text_area("Log Output", txt, height=400)
        st.download_button("Download Log", txt, file_name="run_master.log")
    else:
        st.info("No logs yet — run AutoML once.")

st.markdown("---")
st.markdown("**Note:** Synthetic demo dataset for educational use only. Real deployment requires plant data, NDA, and safety validation.")
|