Update src/streamlit_app.py
Browse files- src/streamlit_app.py +161 -279
src/streamlit_app.py
CHANGED
|
@@ -436,6 +436,29 @@ with tabs[3]:
|
|
| 436 |
with tabs[4]:
|
| 437 |
st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
use_case = st.selectbox(
|
| 440 |
"Select Use Case",
|
| 441 |
[
|
|
@@ -446,9 +469,9 @@ with tabs[4]:
|
|
| 446 |
"Surface Defect Detection (Vision AI)",
|
| 447 |
"Material Composition & Alloy Mix AI",
|
| 448 |
"Inventory & Yield Optimization",
|
| 449 |
-
"Refractory & Cooling Loss Prediction"
|
| 450 |
],
|
| 451 |
-
index=1
|
| 452 |
)
|
| 453 |
|
| 454 |
use_case_config = {
|
|
@@ -461,13 +484,13 @@ with tabs[4]:
|
|
| 461 |
"Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
|
| 462 |
"Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
|
| 463 |
}
|
|
|
|
| 464 |
cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
|
| 465 |
-
target = cfg["target"]
|
| 466 |
-
model_hint = cfg["model_hint"]
|
| 467 |
|
| 468 |
-
suggested = [c for c in numeric_cols if any(k in c for k in target.split(
|
| 469 |
if len(suggested) < 6:
|
| 470 |
-
suggested = [c for c in numeric_cols if any(k in c for k in ["temp","power","energy","pressure","yield"])]
|
| 471 |
if len(suggested) < 6:
|
| 472 |
suggested = numeric_cols[:50]
|
| 473 |
|
|
@@ -478,62 +501,24 @@ with tabs[4]:
|
|
| 478 |
max_rows = min(df.shape[0], 20000)
|
| 479 |
sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
|
| 480 |
|
| 481 |
-
#
|
| 482 |
-
if
|
| 483 |
-
|
| 484 |
-
target
|
| 485 |
-
|
| 486 |
-
cols_needed = [c for c in features if c in df.columns]
|
| 487 |
-
|
| 488 |
-
if target in df.columns:
|
| 489 |
-
target_col = target
|
| 490 |
-
else:
|
| 491 |
-
matches = [c for c in df.columns if c.lower() == target.lower()]
|
| 492 |
-
if matches:
|
| 493 |
-
target_col = matches[0]
|
| 494 |
-
st.info(f"Auto-corrected to exact match: `{target_col}`")
|
| 495 |
-
else:
|
| 496 |
-
matches = [c for c in df.columns if target.lower() in c.lower()]
|
| 497 |
-
if len(matches) == 1:
|
| 498 |
-
target_col = matches[0]
|
| 499 |
-
st.info(f"Auto-corrected to closest match: `{target_col}`")
|
| 500 |
-
elif len(matches) > 1:
|
| 501 |
-
preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
|
| 502 |
-
if preferred:
|
| 503 |
-
target_col = preferred[0]
|
| 504 |
-
st.warning(f"Multiple matches found {matches}. Using `{target_col}`.")
|
| 505 |
-
else:
|
| 506 |
-
target_col = matches[0]
|
| 507 |
-
st.warning(f"Multiple matches found {matches}. Using first: `{target_col}`.")
|
| 508 |
-
else:
|
| 509 |
-
st.error(f"Target `{target}` not found in dataframe columns.")
|
| 510 |
-
st.stop()
|
| 511 |
-
|
| 512 |
-
valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
|
| 513 |
-
if not valid_features:
|
| 514 |
-
st.error("No valid feature columns remain after cleaning. Check feature selection.")
|
| 515 |
st.stop()
|
| 516 |
|
| 517 |
-
|
| 518 |
-
sub_df =
|
| 519 |
|
| 520 |
X = sub_df.drop(columns=[target_col])
|
| 521 |
y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
|
| 522 |
|
|
|
|
| 523 |
leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
|
| 524 |
-
for
|
| 525 |
-
|
| 526 |
-
X.drop(columns=[lc], inplace=True)
|
| 527 |
-
|
| 528 |
-
nunique = X.nunique(dropna=False)
|
| 529 |
-
const_cols = nunique[nunique <= 1].index.tolist()
|
| 530 |
-
if const_cols:
|
| 531 |
-
X.drop(columns=const_cols, inplace=True)
|
| 532 |
-
|
| 533 |
-
if X.shape[1] == 0:
|
| 534 |
-
st.error("No valid feature columns remain after cleaning. Check feature selection.")
|
| 535 |
-
st.stop()
|
| 536 |
|
|
|
|
| 537 |
st.markdown("### Ensemble & AutoML Settings")
|
| 538 |
max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
|
| 539 |
top_k = st.slider("Max base models in ensemble", 2, 8, 5)
|
|
@@ -552,267 +537,164 @@ with tabs[4]:
|
|
| 552 |
import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
|
| 553 |
except Exception: optional_families["CatBoost"] = False
|
| 554 |
|
| 555 |
-
st.markdown(f"Available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
|
| 557 |
-
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
|
| 558 |
-
"""Tune one model family using Optuna."""
|
| 559 |
def obj(trial):
|
| 560 |
-
if
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
m = ExtraTreesRegressor(
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
else:
|
| 584 |
-
m = RandomForestRegressor(
|
| 585 |
try:
|
| 586 |
-
|
| 587 |
-
return float(np.mean(scores))
|
| 588 |
except Exception:
|
| 589 |
-
return -999
|
| 590 |
|
| 591 |
study = optuna.create_study(direction="maximize")
|
| 592 |
study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
model = RandomForestRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 597 |
-
elif family_name == "ExtraTrees":
|
| 598 |
-
model = ExtraTreesRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 599 |
-
elif family_name == "XGBoost" and optional_families.get("XGBoost"):
|
| 600 |
-
model = xgb.XGBRegressor(**{**{"verbosity":0,"tree_method":"hist"}, **best})
|
| 601 |
-
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 602 |
-
model = lgb.LGBMRegressor(**{**{"n_jobs":1}, **best})
|
| 603 |
-
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 604 |
-
model = cb.CatBoostRegressor(**{**{"verbose":0}, **best})
|
| 605 |
-
else:
|
| 606 |
-
model = RandomForestRegressor(random_state=42)
|
| 607 |
-
except Exception:
|
| 608 |
-
model = RandomForestRegressor(random_state=42)
|
| 609 |
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
if st.button("Run expanded AutoML + Stacking"):
|
| 617 |
-
st.session_state["run_automl_clicked"] = True
|
| 618 |
-
|
| 619 |
-
if st.session_state["run_automl_clicked"]:
|
| 620 |
-
log("AutoML + Stacking initiated.")
|
| 621 |
-
with st.spinner("Tuning multiple families..."):
|
| 622 |
-
families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
|
| 623 |
if allow_advanced:
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
|
| 627 |
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
model_obj = result.get("model_obj")
|
| 634 |
-
if hasattr(model_obj, "estimators_"):
|
| 635 |
-
delattr(model_obj, "estimators_")
|
| 636 |
-
result["model_obj"] = model_obj
|
| 637 |
-
tuned_results.append(result)
|
| 638 |
-
|
| 639 |
-
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
|
| 640 |
-
lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
|
| 641 |
-
st.markdown("### Tuning Leaderboard (by CV R²)")
|
| 642 |
-
st.dataframe(lb[["family","cv_r2"]].round(4))
|
| 643 |
|
|
|
|
| 644 |
from sklearn.feature_selection import SelectKBest, f_regression
|
| 645 |
from sklearn.linear_model import LinearRegression
|
| 646 |
-
from sklearn.model_selection import KFold
|
| 647 |
-
|
| 648 |
-
st.markdown("### Building base models & out-of-fold predictions for stacking")
|
| 649 |
|
| 650 |
scaler = StandardScaler()
|
| 651 |
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
|
| 652 |
selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
|
| 653 |
-
X_sel = selector.fit_transform(X_scaled, y)
|
| 654 |
-
selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
|
| 655 |
-
X_sel = pd.DataFrame(X_sel, columns=selected_feature_names)
|
| 656 |
|
| 657 |
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj") is not None]:
|
| 670 |
-
model_obj = entry["model_obj"]
|
| 671 |
-
oof = np.zeros(X_sel.shape[0])
|
| 672 |
-
for tr_idx, val_idx in kf.split(X_sel):
|
| 673 |
-
X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
|
| 674 |
-
y_tr = y.iloc[tr_idx]
|
| 675 |
-
try:
|
| 676 |
-
model_obj.fit(X_tr, y_tr)
|
| 677 |
-
preds = model_obj.predict(X_val)
|
| 678 |
-
oof[val_idx] = preds
|
| 679 |
-
except Exception:
|
| 680 |
-
oof[val_idx] = np.mean(y_tr)
|
| 681 |
-
oof_preds[f"{fam}_oof"] = oof
|
| 682 |
-
model_obj.fit(X_sel, y)
|
| 683 |
-
base_models.append({"family": fam, "model": model_obj})
|
| 684 |
-
|
| 685 |
-
if oof_preds.empty:
|
| 686 |
-
st.error("No base models built.")
|
| 687 |
-
st.stop()
|
| 688 |
-
|
| 689 |
-
corr = oof_preds.corr().abs()
|
| 690 |
-
div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
|
| 691 |
-
cv_r2_est = {c: r2_score(y, oof_preds[c]) for c in oof_preds.columns}
|
| 692 |
-
|
| 693 |
-
summary_df = pd.DataFrame({
|
| 694 |
-
"family": [c.replace("_oof","") for c in oof_preds.columns],
|
| 695 |
-
"cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
|
| 696 |
-
"diversity": [div[c] for c in oof_preds.columns]
|
| 697 |
-
}).sort_values(["cv_r2","diversity"], ascending=[False,False])
|
| 698 |
-
|
| 699 |
-
st.dataframe(summary_df.round(4))
|
| 700 |
-
selected = summary_df.head(top_k)["family"].tolist()
|
| 701 |
-
st.markdown(f"Selected for stacking (top {top_k}): {selected}")
|
| 702 |
|
| 703 |
meta = LinearRegression(positive=True)
|
| 704 |
-
|
| 705 |
-
meta.
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
meta_inputs = []
|
| 709 |
-
for fam in selected:
|
| 710 |
-
mdl = next((b["model"] for b in base_models if b["family"] == fam), None)
|
| 711 |
-
preds = mdl.predict(X_val) if mdl else np.full(len(X_val), np.mean(y_tr))
|
| 712 |
-
meta_inputs.append(np.ravel(preds))
|
| 713 |
-
X_meta_val = pd.DataFrame(np.column_stack(meta_inputs), columns=X_stack.columns)
|
| 714 |
-
y_meta_pred = meta.predict(X_meta_val)
|
| 715 |
-
|
| 716 |
-
final_r2 = r2_score(y_val, y_meta_pred)
|
| 717 |
-
final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
|
| 718 |
-
st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")
|
| 719 |
-
|
| 720 |
-
fig, ax = plt.subplots(figsize=(7,4))
|
| 721 |
-
ax.scatter(y_val, y_meta_pred, alpha=0.7)
|
| 722 |
-
ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
|
| 723 |
-
st.pyplot(fig, clear_figure=True)
|
| 724 |
|
| 725 |
# --- Operator Advisory ---
|
| 726 |
st.markdown("---")
|
| 727 |
-
st.subheader("Operator Advisory
|
| 728 |
|
| 729 |
try:
|
| 730 |
-
top_base =
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
model = top_base["model"]
|
| 750 |
-
expl = shap.TreeExplainer(model)
|
| 751 |
-
shap_vals = expl.shap_values(sample_X)
|
| 752 |
-
if isinstance(shap_vals, list): shap_vals = shap_vals[0]
|
| 753 |
-
shap_vals = np.array(shap_vals)
|
| 754 |
-
importance = pd.DataFrame({
|
| 755 |
-
"Feature": sample_X.columns,
|
| 756 |
-
"Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
|
| 757 |
-
"Mean SHAP Sign": np.sign(shap_vals).mean(axis=0)
|
| 758 |
-
}).sort_values("Mean |SHAP|", ascending=False)
|
| 759 |
-
|
| 760 |
-
st.markdown("### Top 5 Operational Drivers")
|
| 761 |
-
st.dataframe(importance.head(5))
|
| 762 |
-
|
| 763 |
-
recommendations = []
|
| 764 |
-
for _, row in importance.head(5).iterrows():
|
| 765 |
-
f, s = row["Feature"], row["Mean SHAP Sign"]
|
| 766 |
-
if s > 0.05:
|
| 767 |
-
recommendations.append(f"Increase `{f}` likely increases `{target}`")
|
| 768 |
-
elif s < -0.05:
|
| 769 |
-
recommendations.append(f"Decrease `{f}` likely increases `{target}`")
|
| 770 |
-
else:
|
| 771 |
-
recommendations.append(f"`{f}` neutral for `{target}`")
|
| 772 |
-
|
| 773 |
-
st.markdown("### Suggested Operator Adjustments")
|
| 774 |
-
st.write("\n".join(recommendations))
|
| 775 |
-
|
| 776 |
-
import requests, json, textwrap
|
| 777 |
-
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 778 |
-
if not HF_TOKEN:
|
| 779 |
-
st.error("HF_TOKEN not detected. Check the Secrets tab.")
|
| 780 |
else:
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
|
|
|
| 803 |
text = ""
|
| 804 |
if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
|
| 805 |
text = data[0]["generated_text"].strip()
|
| 806 |
elif isinstance(data, dict) and "generated_text" in data:
|
| 807 |
text = data["generated_text"].strip()
|
| 808 |
-
elif isinstance(data, str):
|
| 809 |
-
text = data.strip()
|
| 810 |
-
|
| 811 |
if text:
|
| 812 |
-
st.success(" Operator Advisory Generated:")
|
| 813 |
st.info(text)
|
| 814 |
else:
|
| 815 |
-
st.warning("Operator advisory skipped: no text returned
|
|
|
|
|
|
|
| 816 |
except Exception as e:
|
| 817 |
st.warning(f"Operator advisory skipped: {e}")
|
| 818 |
|
|
|
|
| 436 |
with tabs[4]:
|
| 437 |
st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
|
| 438 |
|
| 439 |
+
# --- Universal numeric cleaner (runs once per tab) ---
|
| 440 |
+
def clean_entire_df(df):
    """Return an all-float copy of *df* with numeric-looking strings parsed.

    Text-like columns (``object``, pandas ``string`` and ``category`` dtypes)
    have square brackets and commas removed and surrounding whitespace
    stripped, so values such as ``'[1.551E3]'`` or ``'2,000'`` parse as
    numbers.  Every column is then passed through ``pd.to_numeric`` with
    ``errors="coerce"``; anything that still cannot be parsed, as well as
    missing and infinite values, becomes ``0.0``.

    Args:
        df: Input dataframe.  It is not modified.

    Returns:
        A new dataframe with the same columns and index, all of dtype float.
    """
    null_tokens = ["nan", "NaN", "None", "null", "N/A", "", " ", "<NA>"]
    df_clean = df.copy()
    for col in df_clean.columns:
        series = df_clean[col]
        # Checking only `dtype == object` misses pandas "string" and
        # "category" columns, whose bracketed values would otherwise be
        # coerced straight to NaN (and then 0) without being cleaned.
        is_text = (
            pd.api.types.is_object_dtype(series.dtype)
            or pd.api.types.is_string_dtype(series.dtype)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )
        if is_text:
            series = (
                series.astype(str)
                .str.replace("[", "", regex=False)
                .str.replace("]", "", regex=False)
                .str.replace(",", "", regex=False)
                .str.strip()
                .replace(null_tokens, np.nan)
            )
        df_clean[col] = pd.to_numeric(series, errors="coerce")
    # Strings such as "inf" parse to +/-inf; treat them like missing values so
    # the result contains only finite floats.
    df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
    return df_clean.fillna(0.0).astype(float)
|
| 457 |
+
|
| 458 |
+
df = clean_entire_df(df)
|
| 459 |
+
st.caption("✅ Dataset cleaned globally — all numeric-like values converted safely.")
|
| 460 |
+
|
| 461 |
+
# --- Use Case Selection ---
|
| 462 |
use_case = st.selectbox(
|
| 463 |
"Select Use Case",
|
| 464 |
[
|
|
|
|
| 469 |
"Surface Defect Detection (Vision AI)",
|
| 470 |
"Material Composition & Alloy Mix AI",
|
| 471 |
"Inventory & Yield Optimization",
|
| 472 |
+
"Refractory & Cooling Loss Prediction",
|
| 473 |
],
|
| 474 |
+
index=1,
|
| 475 |
)
|
| 476 |
|
| 477 |
use_case_config = {
|
|
|
|
| 484 |
"Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
|
| 485 |
"Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
|
| 486 |
}
|
| 487 |
+
|
| 488 |
cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
|
| 489 |
+
target, model_hint = cfg["target"], cfg["model_hint"]
|
|
|
|
| 490 |
|
| 491 |
+
suggested = [c for c in numeric_cols if any(k in c for k in target.split("_"))]
|
| 492 |
if len(suggested) < 6:
|
| 493 |
+
suggested = [c for c in numeric_cols if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
|
| 494 |
if len(suggested) < 6:
|
| 495 |
suggested = numeric_cols[:50]
|
| 496 |
|
|
|
|
| 501 |
max_rows = min(df.shape[0], 20000)
|
| 502 |
sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
|
| 503 |
|
| 504 |
+
# --- Prepare data ---
|
| 505 |
+
target_col = target if target in df.columns else next((c for c in df.columns if target.lower() in c.lower()), None)
|
| 506 |
+
if not target_col:
|
| 507 |
+
st.error(f"Target `{target}` not found in dataframe.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
st.stop()
|
| 509 |
|
| 510 |
+
cols_needed = [c for c in features if c in df.columns and c != target_col]
|
| 511 |
+
sub_df = df.loc[:, cols_needed + [target_col]].sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 512 |
|
| 513 |
X = sub_df.drop(columns=[target_col])
|
| 514 |
y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
|
| 515 |
|
| 516 |
+
# --- Drop constant or leak columns ---
|
| 517 |
leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
|
| 518 |
+
X = X.drop(columns=[c for c in leak_cols if c in X.columns], errors="ignore")
|
| 519 |
+
X = X.loc[:, X.nunique() > 1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
+
# --- AutoML Settings ---
|
| 522 |
st.markdown("### Ensemble & AutoML Settings")
|
| 523 |
max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
|
| 524 |
top_k = st.slider("Max base models in ensemble", 2, 8, 5)
|
|
|
|
| 537 |
import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
|
| 538 |
except Exception: optional_families["CatBoost"] = False
|
| 539 |
|
| 540 |
+
st.markdown(f"Available families: {', '.join(available_models)}")
|
| 541 |
+
|
| 542 |
+
# --- Family tuner ---
|
| 543 |
+
def tune_family(fam, X_local, y_local, n_trials=20):
    """Tune one model family with Optuna and return an unfitted tuned model.

    Reads ``optional_families``, ``xgb``, ``lgb``, ``cb`` and ``np`` from the
    enclosing scope.  A family that is unknown, or whose optional package is
    not flagged in ``optional_families``, falls back to a default
    ``RandomForestRegressor``.

    Args:
        fam: Family name ("RandomForest", "ExtraTrees", "XGBoost",
            "LightGBM" or "CatBoost").
        X_local: Feature matrix passed to ``cross_val_score``.
        y_local: Target values passed to ``cross_val_score``.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict with keys ``family``, ``model_obj`` (an unfitted estimator of
        the requested family built from the best parameters),
        ``best_params`` (the raw Optuna parameter dict) and ``cv_score``
        (best mean 3-fold R^2, or NaN when no trial was run).
    """
    import optuna
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

    def suggest(trial):
        """Sample this family's hyper-parameters, keyed by Optuna names."""
        if fam in ("RandomForest", "ExtraTrees"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 4, 30),
            }
        if fam == "XGBoost" and optional_families.get("XGBoost"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        if fam == "LightGBM" and optional_families.get("LightGBM"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 3, 16),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        if fam == "CatBoost" and optional_families.get("CatBoost"):
            return {
                "iterations": trial.suggest_int("iterations", 200, 800),
                "depth": trial.suggest_int("depth", 4, 10),
                "lr": trial.suggest_float("lr", 0.01, 0.3, log=True),
            }
        return {}

    def build(params):
        """Create an unfitted estimator of family `fam` from Optuna params."""
        kwargs = dict(params)
        # The search space stores the learning rate under the short key "lr".
        if "lr" in kwargs:
            kwargs["learning_rate"] = kwargs.pop("lr")
        if fam == "RandomForest":
            return RandomForestRegressor(random_state=42, n_jobs=-1, **kwargs)
        if fam == "ExtraTrees":
            return ExtraTreesRegressor(random_state=42, n_jobs=-1, **kwargs)
        if fam == "XGBoost" and optional_families.get("XGBoost"):
            return xgb.XGBRegressor(tree_method="hist", verbosity=0, **kwargs)
        if fam == "LightGBM" and optional_families.get("LightGBM"):
            return lgb.LGBMRegressor(**kwargs)
        if fam == "CatBoost" and optional_families.get("CatBoost"):
            return cb.CatBoostRegressor(verbose=0, **kwargs)
        return RandomForestRegressor(random_state=42)

    def obj(trial):
        m = build(suggest(trial))
        try:
            return float(np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2")))
        except Exception:
            # Best-effort: a failing configuration gets a sentinel score so the
            # study keeps going instead of aborting the whole run.
            return -999

    study = optuna.create_study(direction="maximize")
    study.optimize(obj, n_trials=n_trials, show_progress_bar=False)

    # best_trial / best_value raise when no trial exists, so guard both.
    if study.trials:
        params, cv_score = study.best_trial.params, study.best_value
    else:
        params, cv_score = {}, float("nan")

    # Build the estimator that was actually tuned; previously a default
    # RandomForestRegressor was returned for every family, which made all
    # stacked base models identical and discarded the tuned parameters.
    model = build(params)
    return {"family": fam, "model_obj": model, "best_params": params, "cv_score": cv_score}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
+
# --- Run button ---
|
| 595 |
+
if st.button("Run AutoML + SHAP"):
|
| 596 |
+
with st.spinner("Training and stacking..."):
|
| 597 |
+
tuned_results = []
|
| 598 |
+
families = ["RandomForest", "ExtraTrees"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
if allow_advanced:
|
| 600 |
+
for f in ["XGBoost", "LightGBM", "CatBoost"]:
|
| 601 |
+
if optional_families.get(f): families.append(f)
|
|
|
|
| 602 |
|
| 603 |
+
for fam in families:
|
| 604 |
+
tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
|
| 605 |
+
|
| 606 |
+
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]).sort_values("cv_r2", ascending=False)
|
| 607 |
+
st.dataframe(lb.round(4))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
+
# --- Stacking ---
|
| 610 |
from sklearn.feature_selection import SelectKBest, f_regression
|
| 611 |
from sklearn.linear_model import LinearRegression
|
| 612 |
+
from sklearn.model_selection import KFold, train_test_split
|
| 613 |
+
from sklearn.metrics import r2_score
|
|
|
|
| 614 |
|
| 615 |
scaler = StandardScaler()
|
| 616 |
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
|
| 617 |
selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
|
| 618 |
+
X_sel = pd.DataFrame(selector.fit_transform(X_scaled, y), columns=[X.columns[i] for i in selector.get_support(indices=True)])
|
|
|
|
|
|
|
| 619 |
|
| 620 |
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 621 |
+
# Build out-of-fold (OOF) predictions for each tuned family, then refit each
# model on all selected features so it can serve as a stacking base model.
oof_preds, base_models = pd.DataFrame(index=X_sel.index), []
for entry in tuned_results:
    model = entry.get("model_obj")
    # Compare against None explicitly: truth-testing an unfitted scikit-learn
    # ensemble calls its __len__, which reads `estimators_` and raises
    # AttributeError before the model has been fitted.
    if model is None:
        continue
    fam = entry["family"]
    preds = np.zeros(X_sel.shape[0])
    for tr, va in kf.split(X_sel):
        model.fit(X_sel.iloc[tr], y.iloc[tr])
        preds[va] = model.predict(X_sel.iloc[va])
    oof_preds[f"{fam}_oof"] = preds
    # Final fit on the full sample; this fitted model is what gets stored.
    model.fit(X_sel, y)
    base_models.append({"family": fam, "model": model})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
|
| 632 |
meta = LinearRegression(positive=True)
|
| 633 |
+
meta.fit(oof_preds, y)
|
| 634 |
+
y_pred = meta.predict(oof_preds)
|
| 635 |
+
final_r2 = r2_score(y, y_pred)
|
| 636 |
+
st.success(f"Stacked Ensemble R² = {final_r2:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
# --- Operator Advisory ---
|
| 639 |
st.markdown("---")
|
| 640 |
+
st.subheader("Operator Advisory — Real-Time Recommendations")
|
| 641 |
|
| 642 |
try:
|
| 643 |
+
top_base = base_models[0]["model"]
|
| 644 |
+
sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
|
| 645 |
+
expl = shap.TreeExplainer(top_base)
|
| 646 |
+
shap_vals = expl.shap_values(sample_X)
|
| 647 |
+
if isinstance(shap_vals, list):
|
| 648 |
+
shap_vals = shap_vals[0]
|
| 649 |
+
imp = pd.DataFrame({
|
| 650 |
+
"Feature": sample_X.columns,
|
| 651 |
+
"Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
|
| 652 |
+
"Mean SHAP Sign": np.sign(shap_vals).mean(axis=0)
|
| 653 |
+
}).sort_values("Mean |SHAP|", ascending=False)
|
| 654 |
+
|
| 655 |
+
st.dataframe(imp.head(5))
|
| 656 |
+
recs = []
|
| 657 |
+
for _, r in imp.head(5).iterrows():
|
| 658 |
+
if r["Mean SHAP Sign"] > 0.05:
|
| 659 |
+
recs.append(f"Increase `{r['Feature']}` likely increases `{target}`")
|
| 660 |
+
elif r["Mean SHAP Sign"] < -0.05:
|
| 661 |
+
recs.append(f"Decrease `{r['Feature']}` likely increases `{target}`")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
else:
|
| 663 |
+
recs.append(f"`{r['Feature']}` neutral for `{target}`")
|
| 664 |
+
st.write("\n".join(recs))
|
| 665 |
+
|
| 666 |
+
# --- Hugging Face advisory ---
|
| 667 |
+
import requests, json, textwrap
|
| 668 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 669 |
+
if not HF_TOKEN:
|
| 670 |
+
st.error("HF_TOKEN not detected.")
|
| 671 |
+
else:
|
| 672 |
+
API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct"
|
| 673 |
+
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 674 |
+
prompt = textwrap.dedent(f"""
|
| 675 |
+
You are an expert metallurgical advisor.
|
| 676 |
+
Recommendations: {recs}
|
| 677 |
+
Target: {target}
|
| 678 |
+
Use case: {use_case}
|
| 679 |
+
Summarize in three professional lines for the shift operator.
|
| 680 |
+
""")
|
| 681 |
+
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 120, "temperature": 0.6}}
|
| 682 |
+
with st.spinner("Generating advisory (Llama-3-8B)…"):
|
| 683 |
+
resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
|
| 684 |
+
try:
|
| 685 |
+
data = resp.json()
|
| 686 |
text = ""
|
| 687 |
if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
|
| 688 |
text = data[0]["generated_text"].strip()
|
| 689 |
elif isinstance(data, dict) and "generated_text" in data:
|
| 690 |
text = data["generated_text"].strip()
|
|
|
|
|
|
|
|
|
|
| 691 |
if text:
|
| 692 |
+
st.success("✅ Operator Advisory Generated:")
|
| 693 |
st.info(text)
|
| 694 |
else:
|
| 695 |
+
st.warning("Operator advisory skipped: no text returned.")
|
| 696 |
+
except Exception as e:
|
| 697 |
+
st.warning(f"Operator advisory skipped: {e}")
|
| 698 |
except Exception as e:
|
| 699 |
st.warning(f"Operator advisory skipped: {e}")
|
| 700 |
|