Spaces:

singhn9
/

SteelAI_Module2_EAF_Intelligence_Explorer

Sleeping

App Files Community

singhn9 committed on Nov 10, 2025

Commit

4d9b97c

verified ·

1 Parent(s): 4605aa4

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +52 -90

src/streamlit_app.py CHANGED Viewed

@@ -432,13 +432,14 @@ with tabs[3]:
     st.subheader("Summary statistics (numeric features)")
     st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
 # ----- AutoML + SHAP tab (Expanded)
 with tabs[4]:
     st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
-    # --- Universal numeric cleaner (runs once per tab) ---
     def clean_entire_df(df):
-        """Cleans dataframe of any bracketed/scientific string numbers like '[1.551E3]'."""
         df_clean = df.copy()
         for col in df_clean.columns:
             if df_clean[col].dtype == object:
@@ -456,7 +457,7 @@ with tabs[4]:
         return df_clean
     df = clean_entire_df(df)
-    st.caption(" Dataset cleaned globally — all numeric-like values converted safely.")
     # --- Use Case Selection ---
     use_case = st.selectbox(
@@ -541,49 +542,23 @@ with tabs[4]:
     # --- Family tuner ---
     def tune_family(fam, X_local, y_local, n_trials=20):
-        import optuna
-        from sklearn.model_selection import cross_val_score
-        from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
         def obj(trial):
             if fam == "RandomForest":
                 m = RandomForestRegressor(
                     n_estimators=trial.suggest_int("n_estimators", 100, 800),
                     max_depth=trial.suggest_int("max_depth", 4, 30),
-                    random_state=42, n_jobs=-1,
-                )
             elif fam == "ExtraTrees":
                 m = ExtraTreesRegressor(
                     n_estimators=trial.suggest_int("n_estimators", 100, 800),
                     max_depth=trial.suggest_int("max_depth", 4, 30),
-                    random_state=42, n_jobs=-1,
-                )
-            elif fam == "XGBoost" and optional_families.get("XGBoost"):
-                m = xgb.XGBRegressor(
-                    n_estimators=trial.suggest_int("n_estimators", 100, 800),
-                    max_depth=trial.suggest_int("max_depth", 3, 12),
-                    learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True),
-                    tree_method="hist", verbosity=0
-                )
-            elif fam == "LightGBM" and optional_families.get("LightGBM"):
-                m = lgb.LGBMRegressor(
-                    n_estimators=trial.suggest_int("n_estimators", 100, 800),
-                    max_depth=trial.suggest_int("max_depth", 3, 16),
-                    learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True)
-                )
-            elif fam == "CatBoost" and optional_families.get("CatBoost"):
-                m = cb.CatBoostRegressor(
-                    iterations=trial.suggest_int("iterations", 200, 800),
-                    depth=trial.suggest_int("depth", 4, 10),
-                    learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True),
-                    verbose=0
-                )
             else:
                 m = RandomForestRegressor(random_state=42)
             try:
                 return np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2"))
             except Exception:
-                return -999
         study = optuna.create_study(direction="maximize")
         study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
@@ -603,33 +578,34 @@ with tabs[4]:
             for fam in families:
                 tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
-            lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]).sort_values("cv_r2", ascending=False)
             st.dataframe(lb.round(4))
             # --- Stacking ---
             from sklearn.feature_selection import SelectKBest, f_regression
             from sklearn.linear_model import LinearRegression
-            from sklearn.model_selection import KFold, train_test_split
             from sklearn.metrics import r2_score
             scaler = StandardScaler()
             X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
             selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
-            X_sel = pd.DataFrame(selector.fit_transform(X_scaled, y), columns=[X.columns[i] for i in selector.get_support(indices=True)])
-            # --- Safe stacking ensemble build ---
             kf = KFold(n_splits=5, shuffle=True, random_state=42)
             oof_preds = pd.DataFrame(index=X_sel.index)
             base_models = []
-            # Explicitly filter valid models (no truthiness eval)
-            valid_results = []
-            for r in tuned_results:
-                m = r.get("model_obj", None)
-                if isinstance(m, object) and hasattr(m, "fit") and callable(getattr(m, "fit", None)):
-                    valid_results.append((r["family"], r))
-            # Train each base model safely
             for fam, entry in valid_results:
                 model = entry["model_obj"]
                 preds = np.zeros(X_sel.shape[0])
@@ -645,14 +621,6 @@ with tabs[4]:
                     base_models.append({"family": fam, "model": model})
                 except Exception as e:
                     st.warning(f"⚠️ {fam} full-fit failed: {e}")
-            # Meta model on OOF predictions
-            meta = LinearRegression(positive=True)
-            meta.fit(oof_preds, y)
-            y_pred = meta.predict(oof_preds)
-            final_r2 = r2_score(y, y_pred)
-            st.success(f"Stacked Ensemble R² = {final_r2:.4f}")
             meta = LinearRegression(positive=True)
             meta.fit(oof_preds, y)
@@ -669,8 +637,7 @@ with tabs[4]:
                 sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
                 expl = shap.TreeExplainer(top_base)
                 shap_vals = expl.shap_values(sample_X)
-                if isinstance(shap_vals, list):
-                    shap_vals = shap_vals[0]
                 imp = pd.DataFrame({
                     "Feature": sample_X.columns,
                     "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
@@ -688,71 +655,66 @@ with tabs[4]:
                         recs.append(f"`{r['Feature']}` neutral for `{target}`")
                 st.write("\n".join(recs))
-                # --- Hugging Face advisory ---
-                import requests, json, textwrap
                 HF_TOKEN = os.getenv("HF_TOKEN")
                 if not HF_TOKEN:
                     st.error("HF_TOKEN not detected in environment or secrets.toml.")
                 else:
-                    #  Correct endpoint per Hugging Face Router API
-                    API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct"
                     headers = {
                         "Authorization": f"Bearer {HF_TOKEN}",
                         "Content-Type": "application/json",
                     }
-                    # Prepare prompt
                     prompt = textwrap.dedent(f"""
                         You are an expert metallurgical process advisor.
                         Analyze these SHAP-based operator recommendations and rewrite them
                         as a concise 3-line professional advisory note.
                         Recommendations: {recs}
                         Target variable: {target}
                         Use case: {use_case}
                     """)
-                    #  HF Router supports both "inputs" and OpenAI-style "messages"
                     payload = {
-                        "inputs": prompt,
-                        "parameters": {
-                            "max_new_tokens": 200,
-                            "temperature": 0.5,
-                            "top_p": 0.95,
-                            "return_full_text": False
-                        }
                     }
                     with st.spinner("Generating operator advisory (Llama 3-8B)…"):
                         try:
                             resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
                             if resp.status_code != 200:
                                 st.warning(f"HF API error {resp.status_code}: {resp.text}")
                             else:
                                 try:
                                     data = resp.json()
-                                    text = ""
-                                    # The router returns list-based structure for text generation
-                                    if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
-                                        text = data[0]["generated_text"].strip()
-                                    elif isinstance(data, dict) and "generated_text" in data:
-                                        text = data["generated_text"].strip()
-                                    if text:
-                                        st.success(" Operator Advisory Generated:")
-                                        st.info(text)
                                     else:
-                                        st.warning(f"Operator advisory skipped: no text returned.\nRaw response:\n{data}")
-                                except json.JSONDecodeError:
-                                    st.warning(f"Operator advisory skipped: invalid JSON.\nRaw response:\n{resp.text}")
                         except Exception as e:
                             st.warning(f"Operator advisory skipped: {e}")
 # ----- Business Impact tab

     st.subheader("Summary statistics (numeric features)")
     st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
+# ----- AutoML + SHAP tab (Expanded)
 # ----- AutoML + SHAP tab (Expanded)
 with tabs[4]:
     st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
+    # --- Global numeric cleaner ---
     def clean_entire_df(df):
+        """Cleans dataframe of bracketed/scientific string numbers like '[1.551E3]'."""
         df_clean = df.copy()
         for col in df_clean.columns:
             if df_clean[col].dtype == object:
         return df_clean
     df = clean_entire_df(df)
+    st.caption("✅ Dataset cleaned globally — all numeric-like values converted safely.")
     # --- Use Case Selection ---
     use_case = st.selectbox(
     # --- Family tuner ---
     def tune_family(fam, X_local, y_local, n_trials=20):
         def obj(trial):
             if fam == "RandomForest":
                 m = RandomForestRegressor(
                     n_estimators=trial.suggest_int("n_estimators", 100, 800),
                     max_depth=trial.suggest_int("max_depth", 4, 30),
+                    random_state=42, n_jobs=-1)
             elif fam == "ExtraTrees":
                 m = ExtraTreesRegressor(
                     n_estimators=trial.suggest_int("n_estimators", 100, 800),
                     max_depth=trial.suggest_int("max_depth", 4, 30),
+                    random_state=42, n_jobs=-1)
             else:
                 m = RandomForestRegressor(random_state=42)
             try:
                 return np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2"))
             except Exception:
+                return -999.0
         study = optuna.create_study(direction="maximize")
         study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
             for fam in families:
                 tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
+            lb = pd.DataFrame(
+                [{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]
+            ).sort_values("cv_r2", ascending=False)
             st.dataframe(lb.round(4))
             # --- Stacking ---
             from sklearn.feature_selection import SelectKBest, f_regression
             from sklearn.linear_model import LinearRegression
+            from sklearn.model_selection import KFold
             from sklearn.metrics import r2_score
             scaler = StandardScaler()
             X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
             selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
+            X_sel = pd.DataFrame(
+                selector.fit_transform(X_scaled, y),
+                columns=[X.columns[i] for i in selector.get_support(indices=True)]
+            )
             kf = KFold(n_splits=5, shuffle=True, random_state=42)
             oof_preds = pd.DataFrame(index=X_sel.index)
             base_models = []
+            valid_results = [
+                (r["family"], r) for r in tuned_results
+                if r.get("model_obj") is not None and hasattr(r["model_obj"], "fit")
+            ]
             for fam, entry in valid_results:
                 model = entry["model_obj"]
                 preds = np.zeros(X_sel.shape[0])
                     base_models.append({"family": fam, "model": model})
                 except Exception as e:
                     st.warning(f"⚠️ {fam} full-fit failed: {e}")
             meta = LinearRegression(positive=True)
             meta.fit(oof_preds, y)
                 sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
                 expl = shap.TreeExplainer(top_base)
                 shap_vals = expl.shap_values(sample_X)
+                if isinstance(shap_vals, list): shap_vals = shap_vals[0]
                 imp = pd.DataFrame({
                     "Feature": sample_X.columns,
                     "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
                         recs.append(f"`{r['Feature']}` neutral for `{target}`")
                 st.write("\n".join(recs))
+                # --- Hugging Face Router Chat API (OpenAI-Compatible Format) ---
+                import requests, textwrap
                 HF_TOKEN = os.getenv("HF_TOKEN")
                 if not HF_TOKEN:
                     st.error("HF_TOKEN not detected in environment or secrets.toml.")
                 else:
+                    API_URL = "https://router.huggingface.co/v1/chat/completions"
                     headers = {
                         "Authorization": f"Bearer {HF_TOKEN}",
                         "Content-Type": "application/json",
                     }
                     prompt = textwrap.dedent(f"""
                         You are an expert metallurgical process advisor.
                         Analyze these SHAP-based operator recommendations and rewrite them
                         as a concise 3-line professional advisory note.
                         Recommendations: {recs}
                         Target variable: {target}
                         Use case: {use_case}
                     """)
                     payload = {
+                        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+                        "messages": [
+                            {"role": "system", "content": "You are a concise metallurgical advisor."},
+                            {"role": "user", "content": prompt}
+                        ],
+                        "temperature": 0.5,
+                        "max_tokens": 200,
+                        "stream": False
                     }
                     with st.spinner("Generating operator advisory (Llama 3-8B)…"):
                         try:
                             resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
                             if resp.status_code != 200:
                                 st.warning(f"HF API error {resp.status_code}: {resp.text}")
                             else:
                                 try:
                                     data = resp.json()
+                                    msg = (
+                                        data.get("choices", [{}])[0]
+                                        .get("message", {})
+                                        .get("content", "")
+                                        .strip()
+                                    )
+                                    if msg:
+                                        st.success("✅ Operator Advisory Generated:")
+                                        st.info(msg)
                                     else:
+                                        st.warning(f"Operator advisory skipped: empty response.\nRaw: {data}")
+                                except Exception as e:
+                                    st.warning(f"Operator advisory skipped: JSON parse error — {e}")
                         except Exception as e:
                             st.warning(f"Operator advisory skipped: {e}")
+            except Exception as e:
+                st.warning(f"Operator advisory skipped: {e}")
 # ----- Business Impact tab