singhn9 committed on
Commit
a2c0d56
·
verified ·
1 Parent(s): 09c46b2

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +161 -279
src/streamlit_app.py CHANGED
@@ -436,6 +436,29 @@ with tabs[3]:
436
  with tabs[4]:
437
  st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  use_case = st.selectbox(
440
  "Select Use Case",
441
  [
@@ -446,9 +469,9 @@ with tabs[4]:
446
  "Surface Defect Detection (Vision AI)",
447
  "Material Composition & Alloy Mix AI",
448
  "Inventory & Yield Optimization",
449
- "Refractory & Cooling Loss Prediction"
450
  ],
451
- index=1
452
  )
453
 
454
  use_case_config = {
@@ -461,13 +484,13 @@ with tabs[4]:
461
  "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
462
  "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
463
  }
 
464
  cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
465
- target = cfg["target"]
466
- model_hint = cfg["model_hint"]
467
 
468
- suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
469
  if len(suggested) < 6:
470
- suggested = [c for c in numeric_cols if any(k in c for k in ["temp","power","energy","pressure","yield"])]
471
  if len(suggested) < 6:
472
  suggested = numeric_cols[:50]
473
 
@@ -478,62 +501,24 @@ with tabs[4]:
478
  max_rows = min(df.shape[0], 20000)
479
  sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
480
 
481
- # ---------- SAFE target & X preparation ----------
482
- if isinstance(target, (list, tuple)):
483
- st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
484
- target = target[0]
485
-
486
- cols_needed = [c for c in features if c in df.columns]
487
-
488
- if target in df.columns:
489
- target_col = target
490
- else:
491
- matches = [c for c in df.columns if c.lower() == target.lower()]
492
- if matches:
493
- target_col = matches[0]
494
- st.info(f"Auto-corrected to exact match: `{target_col}`")
495
- else:
496
- matches = [c for c in df.columns if target.lower() in c.lower()]
497
- if len(matches) == 1:
498
- target_col = matches[0]
499
- st.info(f"Auto-corrected to closest match: `{target_col}`")
500
- elif len(matches) > 1:
501
- preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
502
- if preferred:
503
- target_col = preferred[0]
504
- st.warning(f"Multiple matches found {matches}. Using `{target_col}`.")
505
- else:
506
- target_col = matches[0]
507
- st.warning(f"Multiple matches found {matches}. Using first: `{target_col}`.")
508
- else:
509
- st.error(f"Target `{target}` not found in dataframe columns.")
510
- st.stop()
511
-
512
- valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
513
- if not valid_features:
514
- st.error("No valid feature columns remain after cleaning. Check feature selection.")
515
  st.stop()
516
 
517
- sub_df = df.loc[:, valid_features + [target_col]].copy()
518
- sub_df = sub_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
519
 
520
  X = sub_df.drop(columns=[target_col])
521
  y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
522
 
 
523
  leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
524
- for lc in leak_cols:
525
- if lc in X.columns:
526
- X.drop(columns=[lc], inplace=True)
527
-
528
- nunique = X.nunique(dropna=False)
529
- const_cols = nunique[nunique <= 1].index.tolist()
530
- if const_cols:
531
- X.drop(columns=const_cols, inplace=True)
532
-
533
- if X.shape[1] == 0:
534
- st.error("No valid feature columns remain after cleaning. Check feature selection.")
535
- st.stop()
536
 
 
537
  st.markdown("### Ensemble & AutoML Settings")
538
  max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
539
  top_k = st.slider("Max base models in ensemble", 2, 8, 5)
@@ -552,267 +537,164 @@ with tabs[4]:
552
  import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
553
  except Exception: optional_families["CatBoost"] = False
554
 
555
- st.markdown(f"Available model families: {', '.join(available_models)}")
 
 
 
 
 
 
556
 
557
- def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
558
- """Tune one model family using Optuna."""
559
  def obj(trial):
560
- if family_name == "RandomForest":
561
- n_estimators = trial.suggest_int("n_estimators", 100, 800)
562
- max_depth = trial.suggest_int("max_depth", 4, 30)
563
- m = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1, random_state=random_state)
564
- elif family_name == "ExtraTrees":
565
- n_estimators = trial.suggest_int("n_estimators", 100, 800)
566
- max_depth = trial.suggest_int("max_depth", 4, 30)
567
- m = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1, random_state=random_state)
568
- elif family_name == "XGBoost" and optional_families.get("XGBoost"):
569
- n_estimators = trial.suggest_int("n_estimators", 100, 1000)
570
- max_depth = trial.suggest_int("max_depth", 3, 12)
571
- lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
572
- m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0)
573
- elif family_name == "LightGBM" and optional_families.get("LightGBM"):
574
- n_estimators = trial.suggest_int("n_estimators", 100, 1000)
575
- max_depth = trial.suggest_int("max_depth", 3, 16)
576
- lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
577
- m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1)
578
- elif family_name == "CatBoost" and optional_families.get("CatBoost"):
579
- iterations = trial.suggest_int("iterations", 200, 1000)
580
- depth = trial.suggest_int("depth", 4, 10)
581
- lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
582
- m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0)
 
 
 
 
 
 
 
 
 
583
  else:
584
- m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state)
585
  try:
586
- scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3)
587
- return float(np.mean(scores))
588
  except Exception:
589
- return -999.0
590
 
591
  study = optuna.create_study(direction="maximize")
592
  study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
593
- best = study.best_trial.params if study.trials else {}
594
- try:
595
- if family_name == "RandomForest":
596
- model = RandomForestRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
597
- elif family_name == "ExtraTrees":
598
- model = ExtraTreesRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
599
- elif family_name == "XGBoost" and optional_families.get("XGBoost"):
600
- model = xgb.XGBRegressor(**{**{"verbosity":0,"tree_method":"hist"}, **best})
601
- elif family_name == "LightGBM" and optional_families.get("LightGBM"):
602
- model = lgb.LGBMRegressor(**{**{"n_jobs":1}, **best})
603
- elif family_name == "CatBoost" and optional_families.get("CatBoost"):
604
- model = cb.CatBoostRegressor(**{**{"verbose":0}, **best})
605
- else:
606
- model = RandomForestRegressor(random_state=42)
607
- except Exception:
608
- model = RandomForestRegressor(random_state=42)
609
 
610
- try:
611
- score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3)))
612
- except Exception:
613
- score = -999.0
614
- return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name}
615
-
616
- if st.button("Run expanded AutoML + Stacking"):
617
- st.session_state["run_automl_clicked"] = True
618
-
619
- if st.session_state["run_automl_clicked"]:
620
- log("AutoML + Stacking initiated.")
621
- with st.spinner("Tuning multiple families..."):
622
- families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
623
  if allow_advanced:
624
- if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
625
- if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
626
- if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
627
 
628
- tuned_results = []
629
- for fam in families_to_try:
630
- log(f"Tuning family: {fam}")
631
- st.caption(f"Tuning family: {fam}")
632
- result = tune_family(fam, X, y, n_trials=max_trials)
633
- model_obj = result.get("model_obj")
634
- if hasattr(model_obj, "estimators_"):
635
- delattr(model_obj, "estimators_")
636
- result["model_obj"] = model_obj
637
- tuned_results.append(result)
638
-
639
- lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
640
- lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
641
- st.markdown("### Tuning Leaderboard (by CV R²)")
642
- st.dataframe(lb[["family","cv_r2"]].round(4))
643
 
 
644
  from sklearn.feature_selection import SelectKBest, f_regression
645
  from sklearn.linear_model import LinearRegression
646
- from sklearn.model_selection import KFold
647
-
648
- st.markdown("### Building base models & out-of-fold predictions for stacking")
649
 
650
  scaler = StandardScaler()
651
  X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
652
  selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
653
- X_sel = selector.fit_transform(X_scaled, y)
654
- selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
655
- X_sel = pd.DataFrame(X_sel, columns=selected_feature_names)
656
 
657
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
658
- base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
659
-
660
- for r in tuned_results:
661
- m = r.get("model_obj")
662
- if m is not None:
663
- try:
664
- if "__len__" in dir(m) and not hasattr(m, "estimators_"):
665
- setattr(m, "__len__", lambda self=m: 0)
666
- except Exception:
667
- pass
668
-
669
- for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj") is not None]:
670
- model_obj = entry["model_obj"]
671
- oof = np.zeros(X_sel.shape[0])
672
- for tr_idx, val_idx in kf.split(X_sel):
673
- X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
674
- y_tr = y.iloc[tr_idx]
675
- try:
676
- model_obj.fit(X_tr, y_tr)
677
- preds = model_obj.predict(X_val)
678
- oof[val_idx] = preds
679
- except Exception:
680
- oof[val_idx] = np.mean(y_tr)
681
- oof_preds[f"{fam}_oof"] = oof
682
- model_obj.fit(X_sel, y)
683
- base_models.append({"family": fam, "model": model_obj})
684
-
685
- if oof_preds.empty:
686
- st.error("No base models built.")
687
- st.stop()
688
-
689
- corr = oof_preds.corr().abs()
690
- div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
691
- cv_r2_est = {c: r2_score(y, oof_preds[c]) for c in oof_preds.columns}
692
-
693
- summary_df = pd.DataFrame({
694
- "family": [c.replace("_oof","") for c in oof_preds.columns],
695
- "cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
696
- "diversity": [div[c] for c in oof_preds.columns]
697
- }).sort_values(["cv_r2","diversity"], ascending=[False,False])
698
-
699
- st.dataframe(summary_df.round(4))
700
- selected = summary_df.head(top_k)["family"].tolist()
701
- st.markdown(f"Selected for stacking (top {top_k}): {selected}")
702
 
703
  meta = LinearRegression(positive=True)
704
- X_stack = oof_preds[[f"{s}_oof" for s in selected]].fillna(0)
705
- meta.fit(X_stack, y)
706
-
707
- X_tr, X_val, y_tr, y_val = train_test_split(X_sel, y, test_size=0.2, random_state=42)
708
- meta_inputs = []
709
- for fam in selected:
710
- mdl = next((b["model"] for b in base_models if b["family"] == fam), None)
711
- preds = mdl.predict(X_val) if mdl else np.full(len(X_val), np.mean(y_tr))
712
- meta_inputs.append(np.ravel(preds))
713
- X_meta_val = pd.DataFrame(np.column_stack(meta_inputs), columns=X_stack.columns)
714
- y_meta_pred = meta.predict(X_meta_val)
715
-
716
- final_r2 = r2_score(y_val, y_meta_pred)
717
- final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
718
- st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")
719
-
720
- fig, ax = plt.subplots(figsize=(7,4))
721
- ax.scatter(y_val, y_meta_pred, alpha=0.7)
722
- ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
723
- st.pyplot(fig, clear_figure=True)
724
 
725
  # --- Operator Advisory ---
726
  st.markdown("---")
727
- st.subheader("Operator Advisory System — Real-Time Shift Recommendations")
728
 
729
  try:
730
- top_base = next((b for b in base_models if b["family"] == selected[0]), None)
731
- if top_base and hasattr(top_base["model"], "predict"):
732
- sample_X = X_val.sample(min(300, len(X_val)), random_state=42).copy()
733
-
734
- def _clean_to_float(x):
735
- if isinstance(x, (int, float, np.floating)):
736
- return float(x)
737
- try:
738
- x_str = str(x).replace("[", "").replace("]", "").replace(",", "").strip()
739
- if x_str.lower() in ("nan", "none", "", "null", "na", "n/a"):
740
- return 0.0
741
- return float(x_str.replace("E", "e"))
742
- except Exception:
743
- return 0.0
744
-
745
- for col in sample_X.columns:
746
- sample_X[col] = sample_X[col].map(_clean_to_float)
747
- sample_X = sample_X.apply(pd.to_numeric, errors="coerce").fillna(0)
748
-
749
- model = top_base["model"]
750
- expl = shap.TreeExplainer(model)
751
- shap_vals = expl.shap_values(sample_X)
752
- if isinstance(shap_vals, list): shap_vals = shap_vals[0]
753
- shap_vals = np.array(shap_vals)
754
- importance = pd.DataFrame({
755
- "Feature": sample_X.columns,
756
- "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
757
- "Mean SHAP Sign": np.sign(shap_vals).mean(axis=0)
758
- }).sort_values("Mean |SHAP|", ascending=False)
759
-
760
- st.markdown("### Top 5 Operational Drivers")
761
- st.dataframe(importance.head(5))
762
-
763
- recommendations = []
764
- for _, row in importance.head(5).iterrows():
765
- f, s = row["Feature"], row["Mean SHAP Sign"]
766
- if s > 0.05:
767
- recommendations.append(f"Increase `{f}` likely increases `{target}`")
768
- elif s < -0.05:
769
- recommendations.append(f"Decrease `{f}` likely increases `{target}`")
770
- else:
771
- recommendations.append(f"`{f}` neutral for `{target}`")
772
-
773
- st.markdown("### Suggested Operator Adjustments")
774
- st.write("\n".join(recommendations))
775
-
776
- import requests, json, textwrap
777
- HF_TOKEN = os.getenv("HF_TOKEN")
778
- if not HF_TOKEN:
779
- st.error("HF_TOKEN not detected. Check the Secrets tab.")
780
  else:
781
- API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct"
782
- headers = {"Authorization": f"Bearer {HF_TOKEN}"}
783
- prompt = textwrap.dedent(f"""
784
- You are an expert metallurgical process advisor.
785
- Based on these SHAP-derived recommendations:
786
- {recommendations}
787
- Target: {target}
788
- Use case: {use_case}
789
- Summarize in three concise, professional lines what the operator should do this shift.
790
- """)
791
- payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
792
- with st.spinner("Generating operator note (Llama-3-8B)…"):
793
- resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
794
- try:
795
- data = resp.json()
796
- st.caption("Raw HF response:")
797
- st.json(data)
798
- except Exception as ex:
799
- st.warning(f"HF raw response parse error: {ex}")
800
- st.text(resp.text)
801
- data = None
802
-
 
803
  text = ""
804
  if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
805
  text = data[0]["generated_text"].strip()
806
  elif isinstance(data, dict) and "generated_text" in data:
807
  text = data["generated_text"].strip()
808
- elif isinstance(data, str):
809
- text = data.strip()
810
-
811
  if text:
812
- st.success(" Operator Advisory Generated:")
813
  st.info(text)
814
  else:
815
- st.warning("Operator advisory skipped: no text returned from model.")
 
 
816
  except Exception as e:
817
  st.warning(f"Operator advisory skipped: {e}")
818
 
 
436
  with tabs[4]:
437
  st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
438
 
439
# --- Universal numeric cleaner (runs once per tab) ---
def clean_entire_df(df):
    """Clean a dataframe of bracketed/scientific string numbers like '[1.551E3]'.

    Object columns are stripped of brackets and thousands-separators and
    coerced to numeric; missing values are filled with 0.0 and the column is
    cast to float. Columns that cannot be represented as floats (e.g.
    datetimes) are left untouched — the previous blanket
    ``df_clean.astype(float)`` raised on any such column and took down the
    whole tab.

    Returns a cleaned copy; the input dataframe is not modified.
    """
    df_clean = df.copy()
    for col in df_clean.columns:
        if df_clean[col].dtype == object:
            df_clean[col] = (
                df_clean[col]
                .astype(str)
                .str.replace("[", "", regex=False)
                .str.replace("]", "", regex=False)
                .str.replace(",", "", regex=False)
                .str.strip()
                .replace(["nan", "NaN", "None", "null", "N/A", "", " "], np.nan)
            )
            df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")
        # Cast column-by-column so one non-convertible column (datetime,
        # arbitrary text already NaN-coerced above is fine) does not fail
        # the entire dataframe.
        try:
            df_clean[col] = df_clean[col].fillna(0.0).astype(float)
        except (TypeError, ValueError):
            pass
    return df_clean
457
+
458
+ df = clean_entire_df(df)
459
+ st.caption("✅ Dataset cleaned globally — all numeric-like values converted safely.")
460
+
461
+ # --- Use Case Selection ---
462
  use_case = st.selectbox(
463
  "Select Use Case",
464
  [
 
469
  "Surface Defect Detection (Vision AI)",
470
  "Material Composition & Alloy Mix AI",
471
  "Inventory & Yield Optimization",
472
+ "Refractory & Cooling Loss Prediction",
473
  ],
474
+ index=1,
475
  )
476
 
477
  use_case_config = {
 
484
  "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
485
  "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
486
  }
487
+
488
  cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
489
+ target, model_hint = cfg["target"], cfg["model_hint"]
 
490
 
491
+ suggested = [c for c in numeric_cols if any(k in c for k in target.split("_"))]
492
  if len(suggested) < 6:
493
+ suggested = [c for c in numeric_cols if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
494
  if len(suggested) < 6:
495
  suggested = numeric_cols[:50]
496
 
 
501
  max_rows = min(df.shape[0], 20000)
502
  sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
503
 
504
+ # --- Prepare data ---
505
+ target_col = target if target in df.columns else next((c for c in df.columns if target.lower() in c.lower()), None)
506
+ if not target_col:
507
+ st.error(f"Target `{target}` not found in dataframe.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  st.stop()
509
 
510
+ cols_needed = [c for c in features if c in df.columns and c != target_col]
511
+ sub_df = df.loc[:, cols_needed + [target_col]].sample(n=sample_size, random_state=42).reset_index(drop=True)
512
 
513
  X = sub_df.drop(columns=[target_col])
514
  y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
515
 
516
+ # --- Drop constant or leak columns ---
517
  leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
518
+ X = X.drop(columns=[c for c in leak_cols if c in X.columns], errors="ignore")
519
+ X = X.loc[:, X.nunique() > 1]
 
 
 
 
 
 
 
 
 
 
520
 
521
+ # --- AutoML Settings ---
522
  st.markdown("### Ensemble & AutoML Settings")
523
  max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
524
  top_k = st.slider("Max base models in ensemble", 2, 8, 5)
 
537
  import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
538
  except Exception: optional_families["CatBoost"] = False
539
 
540
+ st.markdown(f"Available families: {', '.join(available_models)}")
541
+
542
# --- Family tuner ---
def tune_family(fam, X_local, y_local, n_trials=20):
    """Tune one model family with Optuna and return the best-configured model.

    Parameters: ``fam`` is the family name ("RandomForest", "ExtraTrees",
    "XGBoost", "LightGBM", "CatBoost"); ``X_local``/``y_local`` are the
    training features/target; ``n_trials`` is the Optuna budget.

    Returns a dict with ``family``, ``model_obj`` (an estimator rebuilt from
    the best trial's parameters — the previous version discarded the tuned
    parameters and always returned a default RandomForest), ``best_params``
    and ``cv_score`` (best CV R², -999.0 when every trial failed).
    """
    import optuna
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

    def build(params):
        # Construct this family's estimator from trial params; an empty
        # dict yields sensible defaults. Note the trial key "lr" maps to
        # the estimators' `learning_rate` keyword.
        if fam == "RandomForest":
            return RandomForestRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth"),
                random_state=42,
                n_jobs=-1,
            )
        if fam == "ExtraTrees":
            return ExtraTreesRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth"),
                random_state=42,
                n_jobs=-1,
            )
        if fam == "XGBoost" and optional_families.get("XGBoost"):
            return xgb.XGBRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", 6),
                learning_rate=params.get("lr", 0.1),
                tree_method="hist",
                verbosity=0,
            )
        if fam == "LightGBM" and optional_families.get("LightGBM"):
            return lgb.LGBMRegressor(
                n_estimators=params.get("n_estimators", 200),
                max_depth=params.get("max_depth", -1),
                learning_rate=params.get("lr", 0.1),
            )
        if fam == "CatBoost" and optional_families.get("CatBoost"):
            return cb.CatBoostRegressor(
                iterations=params.get("iterations", 500),
                depth=params.get("depth", 6),
                learning_rate=params.get("lr", 0.1),
                verbose=0,
            )
        # Unknown or unavailable family: plain RandomForest fallback.
        return RandomForestRegressor(random_state=42)

    def obj(trial):
        # Sample this family's search space, then score the candidate.
        if fam in ("RandomForest", "ExtraTrees"):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 4, 30),
            }
        elif fam == "XGBoost" and optional_families.get("XGBoost"):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        elif fam == "LightGBM" and optional_families.get("LightGBM"):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 3, 16),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        elif fam == "CatBoost" and optional_families.get("CatBoost"):
            params = {
                "iterations": trial.suggest_int("iterations", 200, 800),
                "depth": trial.suggest_int("depth", 4, 10),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        else:
            params = {}
        m = build(params)
        try:
            return np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2"))
        except Exception:
            return -999

    study = optuna.create_study(direction="maximize")
    study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
    # best_trial / best_value raise ValueError when no trial completed —
    # fall back to defaults instead of crashing the tab.
    try:
        best_params = study.best_trial.params
        best_score = float(study.best_value)
    except Exception:
        best_params, best_score = {}, -999.0
    # BUG FIX: previously the tuned parameters were thrown away and a
    # default RandomForestRegressor was returned for every family.
    model = build(best_params)
    return {"family": fam, "model_obj": model, "best_params": best_params, "cv_score": best_score}
 
 
 
 
 
 
 
 
 
 
 
 
 
593
 
594
+ # --- Run button ---
595
+ if st.button("Run AutoML + SHAP"):
596
+ with st.spinner("Training and stacking..."):
597
+ tuned_results = []
598
+ families = ["RandomForest", "ExtraTrees"]
 
 
 
 
 
 
 
 
599
  if allow_advanced:
600
+ for f in ["XGBoost", "LightGBM", "CatBoost"]:
601
+ if optional_families.get(f): families.append(f)
 
602
 
603
+ for fam in families:
604
+ tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
605
+
606
+ lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]).sort_values("cv_r2", ascending=False)
607
+ st.dataframe(lb.round(4))
 
 
 
 
 
 
 
 
 
 
608
 
609
+ # --- Stacking ---
610
  from sklearn.feature_selection import SelectKBest, f_regression
611
  from sklearn.linear_model import LinearRegression
612
+ from sklearn.model_selection import KFold, train_test_split
613
+ from sklearn.metrics import r2_score
 
614
 
615
  scaler = StandardScaler()
616
  X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
617
  selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
618
+ X_sel = pd.DataFrame(selector.fit_transform(X_scaled, y), columns=[X.columns[i] for i in selector.get_support(indices=True)])
 
 
619
 
620
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
621
+ oof_preds, base_models = pd.DataFrame(index=X_sel.index), []
622
+ for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj")]:
623
+ model = entry["model_obj"]
624
+ preds = np.zeros(X_sel.shape[0])
625
+ for tr, va in kf.split(X_sel):
626
+ model.fit(X_sel.iloc[tr], y.iloc[tr])
627
+ preds[va] = model.predict(X_sel.iloc[va])
628
+ oof_preds[f"{fam}_oof"] = preds
629
+ model.fit(X_sel, y)
630
+ base_models.append({"family": fam, "model": model})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  meta = LinearRegression(positive=True)
633
+ meta.fit(oof_preds, y)
634
+ y_pred = meta.predict(oof_preds)
635
+ final_r2 = r2_score(y, y_pred)
636
+ st.success(f"Stacked Ensemble = {final_r2:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637
 
638
  # --- Operator Advisory ---
639
  st.markdown("---")
640
+ st.subheader("Operator Advisory — Real-Time Recommendations")
641
 
642
  try:
643
+ top_base = base_models[0]["model"]
644
+ sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
645
+ expl = shap.TreeExplainer(top_base)
646
+ shap_vals = expl.shap_values(sample_X)
647
+ if isinstance(shap_vals, list):
648
+ shap_vals = shap_vals[0]
649
+ imp = pd.DataFrame({
650
+ "Feature": sample_X.columns,
651
+ "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
652
+ "Mean SHAP Sign": np.sign(shap_vals).mean(axis=0)
653
+ }).sort_values("Mean |SHAP|", ascending=False)
654
+
655
+ st.dataframe(imp.head(5))
656
+ recs = []
657
+ for _, r in imp.head(5).iterrows():
658
+ if r["Mean SHAP Sign"] > 0.05:
659
+ recs.append(f"Increase `{r['Feature']}` likely increases `{target}`")
660
+ elif r["Mean SHAP Sign"] < -0.05:
661
+ recs.append(f"Decrease `{r['Feature']}` likely increases `{target}`")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  else:
663
+ recs.append(f"`{r['Feature']}` neutral for `{target}`")
664
+ st.write("\n".join(recs))
665
+
666
+ # --- Hugging Face advisory ---
667
+ import requests, json, textwrap
668
+ HF_TOKEN = os.getenv("HF_TOKEN")
669
+ if not HF_TOKEN:
670
+ st.error("HF_TOKEN not detected.")
671
+ else:
672
+ API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct"
673
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
674
+ prompt = textwrap.dedent(f"""
675
+ You are an expert metallurgical advisor.
676
+ Recommendations: {recs}
677
+ Target: {target}
678
+ Use case: {use_case}
679
+ Summarize in three professional lines for the shift operator.
680
+ """)
681
+ payload = {"inputs": prompt, "parameters": {"max_new_tokens": 120, "temperature": 0.6}}
682
+ with st.spinner("Generating advisory (Llama-3-8B)…"):
683
+ resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
684
+ try:
685
+ data = resp.json()
686
  text = ""
687
  if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
688
  text = data[0]["generated_text"].strip()
689
  elif isinstance(data, dict) and "generated_text" in data:
690
  text = data["generated_text"].strip()
 
 
 
691
  if text:
692
+ st.success(" Operator Advisory Generated:")
693
  st.info(text)
694
  else:
695
+ st.warning("Operator advisory skipped: no text returned.")
696
+ except Exception as e:
697
+ st.warning(f"Operator advisory skipped: {e}")
698
  except Exception as e:
699
  st.warning(f"Operator advisory skipped: {e}")
700