singhn9 committed on
Commit
bfb18ef
·
verified ·
1 Parent(s): b2f829a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +20 -88
src/streamlit_app.py CHANGED
@@ -473,43 +473,31 @@ with tabs[4]:
473
 
474
  features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
475
  st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
 
476
  # --- Sampling configuration ---
477
  max_rows = min(df.shape[0], 20000)
478
  sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
479
 
480
  # ---------- SAFE target & X preparation ----------
481
- # Ensure target is a single column name (string). If it's a list, pick the first and warn.
482
  if isinstance(target, (list, tuple)):
483
  st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
484
  target = target[0]
485
 
486
- # Select only valid feature columns
487
- cols_needed = [c for c in features if c in df.columns]
488
- # Match exact name first
489
- if isinstance(target, (list, tuple)):
490
- st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
491
- target = target[0]
492
-
493
- # Select only valid feature columns
494
  cols_needed = [c for c in features if c in df.columns]
495
 
496
- # --- Force single exact target column ---
497
  if target in df.columns:
498
  target_col = target
499
  else:
500
- # Case-insensitive exact match
501
  matches = [c for c in df.columns if c.lower() == target.lower()]
502
  if matches:
503
  target_col = matches[0]
504
  st.info(f"Auto-corrected to exact match: `{target_col}`")
505
  else:
506
- # Partial substring match (e.g., 'furnace_temp' vs 'furnace_temp_next')
507
  matches = [c for c in df.columns if target.lower() in c.lower()]
508
  if len(matches) == 1:
509
  target_col = matches[0]
510
  st.info(f"Auto-corrected to closest match: `{target_col}`")
511
  elif len(matches) > 1:
512
- # Prefer '_temp', '_ratio', or exact substring equality
513
  preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
514
  if preferred:
515
  target_col = preferred[0]
@@ -521,7 +509,6 @@ with tabs[4]:
521
  st.error(f"Target `{target}` not found in dataframe columns.")
522
  st.stop()
523
 
524
- # --- Build sub_df safely — ensure unique and valid target ---
525
  valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
526
  if not valid_features:
527
  st.error("No valid feature columns remain after cleaning. Check feature selection.")
@@ -530,22 +517,14 @@ with tabs[4]:
530
  sub_df = df.loc[:, valid_features + [target_col]].copy()
531
  sub_df = sub_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
532
 
533
- # --- Construct clean X and y ---
534
  X = sub_df.drop(columns=[target_col])
535
  y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
536
 
537
-
538
-
539
-
540
-
541
-
542
- # Drop known leak or identifier columns
543
  leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
544
  for lc in leak_cols:
545
  if lc in X.columns:
546
  X.drop(columns=[lc], inplace=True)
547
 
548
- # Remove constant or near-constant columns
549
  nunique = X.nunique(dropna=False)
550
  const_cols = nunique[nunique <= 1].index.tolist()
551
  if const_cols:
@@ -555,7 +534,6 @@ with tabs[4]:
555
  st.error("No valid feature columns remain after cleaning. Check feature selection.")
556
  st.stop()
557
 
558
-
559
  st.markdown("### Ensemble & AutoML Settings")
560
  max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
561
  top_k = st.slider("Max base models in ensemble", 2, 8, 5)
@@ -653,20 +631,16 @@ with tabs[4]:
653
  st.caption(f"Tuning family: {fam}")
654
  result = tune_family(fam, X, y, n_trials=max_trials)
655
  model_obj = result.get("model_obj")
656
-
657
- # Fix: ensure model is safe to access before fitting
658
  if hasattr(model_obj, "estimators_"):
659
- delattr(model_obj, "estimators_") # clear stale ref if any
660
  result["model_obj"] = model_obj
661
  tuned_results.append(result)
662
 
663
- # --- Leaderboard
664
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
665
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
666
  st.markdown("### Tuning Leaderboard (by CV R²)")
667
  st.dataframe(lb[["family","cv_r2"]].round(4))
668
 
669
- # --- Enhanced Ensemble Stacking ---
670
  from sklearn.feature_selection import SelectKBest, f_regression
671
  from sklearn.linear_model import LinearRegression
672
  from sklearn.model_selection import KFold
@@ -683,26 +657,21 @@ with tabs[4]:
683
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
684
  base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
685
 
686
- # Prevent premature __len__() access on unfitted ensemble models
687
  for r in tuned_results:
688
  m = r.get("model_obj")
689
- # Avoid implicit truth check that calls __len__
690
  if m is not None:
691
  try:
692
- # If model defines __len__, override before fit
693
  if "__len__" in dir(m) and not hasattr(m, "estimators_"):
694
  setattr(m, "__len__", lambda self=m: 0)
695
  except Exception:
696
  pass
697
 
698
-
699
  for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj") is not None]:
700
  model_obj = entry["model_obj"]
701
  oof = np.zeros(X_sel.shape[0])
702
  for tr_idx, val_idx in kf.split(X_sel):
703
  X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
704
- y_tr = y[tr_idx] if not hasattr(y, "iloc") else y.iloc[tr_idx]
705
-
706
  try:
707
  model_obj.fit(X_tr, y_tr)
708
  preds = model_obj.predict(X_val)
@@ -753,69 +722,44 @@ with tabs[4]:
753
  ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
754
  st.pyplot(fig, clear_figure=True)
755
 
756
- st.session_state["automl_summary"] = {
757
- "leaderboard": summary_df[["family","cv_r2"]].to_dict(orient="records"),
758
- "final_r2": float(final_r2),
759
- "final_rmse": float(final_rmse),
760
- "target": target,
761
- "use_case": use_case
762
- }
763
-
764
- # --- Operator Advisory System + Llama-3-70B-Instruct ---
765
  st.markdown("---")
766
  st.subheader("Operator Advisory System — Real-Time Shift Recommendations")
767
 
768
  try:
769
  top_base = next((b for b in base_models if b["family"] == selected[0]), None)
770
  if top_base and hasattr(top_base["model"], "predict"):
771
- # --- Ensure numeric dtypes for SHAP ---
772
  sample_X = X_val.sample(min(300, len(X_val)), random_state=42).copy()
773
-
774
  def _clean_to_float(x):
775
- """Safely convert any numeric-looking string (even '[1.55E3]') to float."""
776
  if isinstance(x, (int, float, np.floating)):
777
  return float(x)
778
  try:
779
  x_str = str(x).replace("[", "").replace("]", "").replace(",", "").strip()
780
- # handle common non-numeric tokens
781
  if x_str.lower() in ("nan", "none", "", "null", "na", "n/a"):
782
  return 0.0
783
  return float(x_str.replace("E", "e"))
784
  except Exception:
785
  return 0.0
786
-
787
- # Apply cleaning to every column
788
  for col in sample_X.columns:
789
  sample_X[col] = sample_X[col].map(_clean_to_float)
790
-
791
- # Verify numeric dtype and replace NaN
792
  sample_X = sample_X.apply(pd.to_numeric, errors="coerce").fillna(0)
793
-
794
- # Optional diagnostic
795
- non_numeric_cols = [c for c in sample_X.columns if not np.issubdtype(sample_X[c].dtype, np.number)]
796
- if non_numeric_cols:
797
- st.warning(f"Cleaned {len(non_numeric_cols)} potential non-numeric columns: {non_numeric_cols}")
798
-
799
 
800
-
801
- # --- SHAP computation ---
802
  model = top_base["model"]
803
  expl = shap.TreeExplainer(model)
804
  shap_vals = expl.shap_values(sample_X)
805
- if isinstance(shap_vals, list):
806
- shap_vals = shap_vals[0]
807
  shap_vals = np.array(shap_vals)
808
- mean_abs = np.abs(shap_vals).mean(axis=0)
809
- mean_sign = np.sign(shap_vals).mean(axis=0)
810
  importance = pd.DataFrame({
811
  "Feature": sample_X.columns,
812
- "Mean |SHAP|": mean_abs,
813
- "Mean SHAP Sign": mean_sign
814
  }).sort_values("Mean |SHAP|", ascending=False)
815
-
816
  st.markdown("### Top 5 Operational Drivers")
817
  st.dataframe(importance.head(5))
818
-
819
  recommendations = []
820
  for _, row in importance.head(5).iterrows():
821
  f, s = row["Feature"], row["Mean SHAP Sign"]
@@ -825,22 +769,17 @@ with tabs[4]:
825
  recommendations.append(f"Decrease `{f}` likely increases `{target}`")
826
  else:
827
  recommendations.append(f"`{f}` neutral for `{target}`")
828
-
829
  st.markdown("### Suggested Operator Adjustments")
830
  st.write("\n".join(recommendations))
831
 
832
-
833
- # --- Call HF Llama-3-70B-Instruct API for summary ---
834
- # --- Call HF Llama-3-70B-Instruct API for summary (robust + debug-safe) ---
835
  import requests, json, textwrap
836
-
837
- HF_TOKEN = os.getenv("HF_TOKEN") # Works on Hugging Face Spaces
838
  if not HF_TOKEN:
839
- st.error("HF_TOKEN not detected. Check the Secrets tab in your Space settings.")
840
  else:
841
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-8B-Instruct"
842
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
843
-
844
  prompt = textwrap.dedent(f"""
845
  You are an expert metallurgical process advisor.
846
  Based on these SHAP-derived recommendations:
@@ -849,16 +788,9 @@ with tabs[4]:
849
  Use case: {use_case}
850
  Summarize in three concise, professional lines what the operator should do this shift.
851
  """)
852
-
853
- payload = {
854
- "inputs": prompt,
855
- "parameters": {"max_new_tokens": 150, "temperature": 0.6}
856
- }
857
-
858
  with st.spinner("Generating operator note (Llama-3-8B)…"):
859
  resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
860
-
861
- # --- Debug section (safe, no secrets printed) ---
862
  try:
863
  data = resp.json()
864
  st.caption("Raw HF response:")
@@ -867,8 +799,7 @@ with tabs[4]:
867
  st.warning(f"HF raw response parse error: {ex}")
868
  st.text(resp.text)
869
  data = None
870
-
871
- # --- Extract generated text robustly ---
872
  text = ""
873
  if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
874
  text = data[0]["generated_text"].strip()
@@ -876,13 +807,14 @@ with tabs[4]:
876
  text = data["generated_text"].strip()
877
  elif isinstance(data, str):
878
  text = data.strip()
879
-
880
  if text:
881
- st.success(" Operator Advisory Generated:")
882
  st.info(text)
883
  else:
884
  st.warning("Operator advisory skipped: no text returned from model.")
885
-
 
886
 
887
 
888
  # ----- Business Impact tab
 
473
 
474
  features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
475
  st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
476
+
477
  # --- Sampling configuration ---
478
  max_rows = min(df.shape[0], 20000)
479
  sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
480
 
481
  # ---------- SAFE target & X preparation ----------
 
482
  if isinstance(target, (list, tuple)):
483
  st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
484
  target = target[0]
485
 
 
 
 
 
 
 
 
 
486
  cols_needed = [c for c in features if c in df.columns]
487
 
 
488
  if target in df.columns:
489
  target_col = target
490
  else:
 
491
  matches = [c for c in df.columns if c.lower() == target.lower()]
492
  if matches:
493
  target_col = matches[0]
494
  st.info(f"Auto-corrected to exact match: `{target_col}`")
495
  else:
 
496
  matches = [c for c in df.columns if target.lower() in c.lower()]
497
  if len(matches) == 1:
498
  target_col = matches[0]
499
  st.info(f"Auto-corrected to closest match: `{target_col}`")
500
  elif len(matches) > 1:
 
501
  preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
502
  if preferred:
503
  target_col = preferred[0]
 
509
  st.error(f"Target `{target}` not found in dataframe columns.")
510
  st.stop()
511
 
 
512
  valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
513
  if not valid_features:
514
  st.error("No valid feature columns remain after cleaning. Check feature selection.")
 
517
  sub_df = df.loc[:, valid_features + [target_col]].copy()
518
  sub_df = sub_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
519
 
 
520
  X = sub_df.drop(columns=[target_col])
521
  y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
522
 
 
 
 
 
 
 
523
  leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
524
  for lc in leak_cols:
525
  if lc in X.columns:
526
  X.drop(columns=[lc], inplace=True)
527
 
 
528
  nunique = X.nunique(dropna=False)
529
  const_cols = nunique[nunique <= 1].index.tolist()
530
  if const_cols:
 
534
  st.error("No valid feature columns remain after cleaning. Check feature selection.")
535
  st.stop()
536
 
 
537
  st.markdown("### Ensemble & AutoML Settings")
538
  max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
539
  top_k = st.slider("Max base models in ensemble", 2, 8, 5)
 
631
  st.caption(f"Tuning family: {fam}")
632
  result = tune_family(fam, X, y, n_trials=max_trials)
633
  model_obj = result.get("model_obj")
 
 
634
  if hasattr(model_obj, "estimators_"):
635
+ delattr(model_obj, "estimators_")
636
  result["model_obj"] = model_obj
637
  tuned_results.append(result)
638
 
 
639
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
640
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
641
  st.markdown("### Tuning Leaderboard (by CV R²)")
642
  st.dataframe(lb[["family","cv_r2"]].round(4))
643
 
 
644
  from sklearn.feature_selection import SelectKBest, f_regression
645
  from sklearn.linear_model import LinearRegression
646
  from sklearn.model_selection import KFold
 
657
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
658
  base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
659
 
 
660
  for r in tuned_results:
661
  m = r.get("model_obj")
 
662
  if m is not None:
663
  try:
 
664
  if "__len__" in dir(m) and not hasattr(m, "estimators_"):
665
  setattr(m, "__len__", lambda self=m: 0)
666
  except Exception:
667
  pass
668
 
 
669
  for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj") is not None]:
670
  model_obj = entry["model_obj"]
671
  oof = np.zeros(X_sel.shape[0])
672
  for tr_idx, val_idx in kf.split(X_sel):
673
  X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
674
+ y_tr = y.iloc[tr_idx]
 
675
  try:
676
  model_obj.fit(X_tr, y_tr)
677
  preds = model_obj.predict(X_val)
 
722
  ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
723
  st.pyplot(fig, clear_figure=True)
724
 
725
+ # --- Operator Advisory ---
 
 
 
 
 
 
 
 
726
  st.markdown("---")
727
  st.subheader("Operator Advisory System — Real-Time Shift Recommendations")
728
 
729
  try:
730
  top_base = next((b for b in base_models if b["family"] == selected[0]), None)
731
  if top_base and hasattr(top_base["model"], "predict"):
 
732
  sample_X = X_val.sample(min(300, len(X_val)), random_state=42).copy()
733
+
734
  def _clean_to_float(x):
 
735
  if isinstance(x, (int, float, np.floating)):
736
  return float(x)
737
  try:
738
  x_str = str(x).replace("[", "").replace("]", "").replace(",", "").strip()
 
739
  if x_str.lower() in ("nan", "none", "", "null", "na", "n/a"):
740
  return 0.0
741
  return float(x_str.replace("E", "e"))
742
  except Exception:
743
  return 0.0
744
+
 
745
  for col in sample_X.columns:
746
  sample_X[col] = sample_X[col].map(_clean_to_float)
 
 
747
  sample_X = sample_X.apply(pd.to_numeric, errors="coerce").fillna(0)
 
 
 
 
 
 
748
 
 
 
749
  model = top_base["model"]
750
  expl = shap.TreeExplainer(model)
751
  shap_vals = expl.shap_values(sample_X)
752
+ if isinstance(shap_vals, list): shap_vals = shap_vals[0]
 
753
  shap_vals = np.array(shap_vals)
 
 
754
  importance = pd.DataFrame({
755
  "Feature": sample_X.columns,
756
+ "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
757
+ "Mean SHAP Sign": np.sign(shap_vals).mean(axis=0)
758
  }).sort_values("Mean |SHAP|", ascending=False)
759
+
760
  st.markdown("### Top 5 Operational Drivers")
761
  st.dataframe(importance.head(5))
762
+
763
  recommendations = []
764
  for _, row in importance.head(5).iterrows():
765
  f, s = row["Feature"], row["Mean SHAP Sign"]
 
769
  recommendations.append(f"Decrease `{f}` likely increases `{target}`")
770
  else:
771
  recommendations.append(f"`{f}` neutral for `{target}`")
772
+
773
  st.markdown("### Suggested Operator Adjustments")
774
  st.write("\n".join(recommendations))
775
 
 
 
 
776
  import requests, json, textwrap
777
+ HF_TOKEN = os.getenv("HF_TOKEN")
 
778
  if not HF_TOKEN:
779
+ st.error("HF_TOKEN not detected. Check the Secrets tab.")
780
  else:
781
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-8B-Instruct"
782
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
 
783
  prompt = textwrap.dedent(f"""
784
  You are an expert metallurgical process advisor.
785
  Based on these SHAP-derived recommendations:
 
788
  Use case: {use_case}
789
  Summarize in three concise, professional lines what the operator should do this shift.
790
  """)
791
+ payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
 
 
 
 
 
792
  with st.spinner("Generating operator note (Llama-3-8B)…"):
793
  resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
 
 
794
  try:
795
  data = resp.json()
796
  st.caption("Raw HF response:")
 
799
  st.warning(f"HF raw response parse error: {ex}")
800
  st.text(resp.text)
801
  data = None
802
+
 
803
  text = ""
804
  if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
805
  text = data[0]["generated_text"].strip()
 
807
  text = data["generated_text"].strip()
808
  elif isinstance(data, str):
809
  text = data.strip()
810
+
811
  if text:
812
+ st.success(" Operator Advisory Generated:")
813
  st.info(text)
814
  else:
815
  st.warning("Operator advisory skipped: no text returned from model.")
816
+ except Exception as e:
817
+ st.warning(f"Operator advisory skipped: {e}")
818
 
819
 
820
  # ----- Business Impact tab