Update src/streamlit_app.py
Browse files- src/streamlit_app.py +216 -737
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
|
@@ -12,10 +11,11 @@ import seaborn as sns
|
|
| 12 |
import joblib
|
| 13 |
import zipfile
|
| 14 |
import io
|
|
|
|
| 15 |
|
| 16 |
# ML imports
|
| 17 |
from sklearn.model_selection import train_test_split
|
| 18 |
-
from sklearn.linear_model import LinearRegression
|
| 19 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
|
| 20 |
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
|
| 21 |
from sklearn.decomposition import PCA
|
|
@@ -25,6 +25,10 @@ from sklearn.metrics import mean_squared_error, r2_score
|
|
| 25 |
# SHAP
|
| 26 |
import shap
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# --- Safe defaults for Streamlit session state ---
|
| 30 |
defaults = {
|
|
@@ -38,7 +42,6 @@ defaults = {
|
|
| 38 |
for k, v in defaults.items():
|
| 39 |
st.session_state.setdefault(k, v)
|
| 40 |
|
| 41 |
-
|
| 42 |
if "llm_result" not in st.session_state:
|
| 43 |
st.session_state["llm_result"] = None
|
| 44 |
if "automl_summary" not in st.session_state:
|
|
@@ -51,7 +54,6 @@ if "hf_clicked" not in st.session_state:
|
|
| 51 |
# -------------------------
|
| 52 |
# Config & paths
|
| 53 |
# -------------------------
|
| 54 |
-
|
| 55 |
st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
|
| 56 |
plt.style.use("seaborn-v0_8-muted")
|
| 57 |
sns.set_palette("muted")
|
|
@@ -79,17 +81,13 @@ def log(msg: str):
|
|
| 79 |
f.write(f"[{stamp}] {msg}\n")
|
| 80 |
print(msg)
|
| 81 |
|
| 82 |
-
|
| 83 |
log("=== Streamlit session started ===")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
if os.path.exists("/data"):
|
| 88 |
st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
|
| 89 |
else:
|
| 90 |
st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
|
| 91 |
|
| 92 |
-
|
| 93 |
# -------------------------
|
| 94 |
# Utility: generate advanced dataset if missing
|
| 95 |
# -------------------------
|
|
@@ -104,13 +102,6 @@ def generate_advanced_flatfile(
|
|
| 104 |
Generates a large synthetic, physics-aligned dataset with many engineered features.
|
| 105 |
Allows control of variability per feature (through variance_overrides) or globally
|
| 106 |
(via global_variance_multiplier).
|
| 107 |
-
|
| 108 |
-
Args:
|
| 109 |
-
n_rows: number of samples
|
| 110 |
-
random_seed: RNG seed
|
| 111 |
-
max_polynomial_new: limit on number of polynomial expansion features
|
| 112 |
-
global_variance_multiplier: multiplier applied to all default stddevs
|
| 113 |
-
variance_overrides: dict mapping feature name or substring → stddev multiplier
|
| 114 |
"""
|
| 115 |
np.random.seed(random_seed)
|
| 116 |
os.makedirs(LOG_DIR, exist_ok=True)
|
|
@@ -307,37 +298,7 @@ def generate_advanced_flatfile(
|
|
| 307 |
existing = [meta_entry]
|
| 308 |
json.dump(existing, open(META_PATH, "w"), indent=2)
|
| 309 |
|
| 310 |
-
|
| 311 |
PDF_PATH = None
|
| 312 |
-
# annotated bibliography
|
| 313 |
-
# try:
|
| 314 |
-
# from fpdf import FPDF
|
| 315 |
-
# pdf = FPDF('P','mm','A4')
|
| 316 |
-
# pdf.add_page()
|
| 317 |
-
# pdf.set_font("Helvetica","B",14)
|
| 318 |
-
# pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
|
| 319 |
-
# pdf.ln(2)
|
| 320 |
-
# pdf.set_font("Helvetica","",10)
|
| 321 |
-
# pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
|
| 322 |
-
# pdf.ln(4)
|
| 323 |
-
# bib_items = [
|
| 324 |
-
# ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
|
| 325 |
-
# ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
|
| 326 |
-
# ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
|
| 327 |
-
# ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
|
| 328 |
-
# ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
|
| 329 |
-
# ]
|
| 330 |
-
# for title, auth, note in bib_items:
|
| 331 |
-
# pdf.set_font("Helvetica","B",11)
|
| 332 |
-
# pdf.multi_cell(0,6, f"{title} — {auth}")
|
| 333 |
-
# pdf.set_font("Helvetica","",10)
|
| 334 |
-
# pdf.multi_cell(0,5, f"Notes: {note}")
|
| 335 |
-
# pdf.ln(2)
|
| 336 |
-
# pdf.output(PDF_PATH)
|
| 337 |
-
# except Exception as e:
|
| 338 |
-
# with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
|
| 339 |
-
# tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
|
| 340 |
-
|
| 341 |
return CSV_PATH, META_PATH, PDF_PATH
|
| 342 |
|
| 343 |
# -------------------------
|
|
@@ -359,10 +320,8 @@ def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
|
|
| 359 |
return df_local, pd.DataFrame(meta_local)
|
| 360 |
|
| 361 |
df, meta_df = load_data()
|
| 362 |
-
|
| 363 |
-
|
| 364 |
# -------------------------
|
| 365 |
-
# Sidebar filters & UI
|
| 366 |
# -------------------------
|
| 367 |
st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
| 368 |
|
|
@@ -370,7 +329,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 370 |
"""Ensure metadata dataframe matches feature count & has required columns."""
|
| 371 |
required_cols = ["feature_name", "source_type", "formula", "remarks"]
|
| 372 |
|
| 373 |
-
# If metadata missing or too short, rebuild it entirely
|
| 374 |
if meta_df is None or len(meta_df) < len(df.columns):
|
| 375 |
meta_df = pd.DataFrame({
|
| 376 |
"feature_name": df.columns,
|
|
@@ -383,14 +341,11 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 383 |
})
|
| 384 |
st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
|
| 385 |
else:
|
| 386 |
-
# Ensure required columns exist
|
| 387 |
for col in required_cols:
|
| 388 |
if col not in meta_df.columns:
|
| 389 |
meta_df[col] = None
|
| 390 |
-
# Fill feature_name if blank or NaN
|
| 391 |
if meta_df["feature_name"].isna().all():
|
| 392 |
meta_df["feature_name"] = df.columns
|
| 393 |
-
# Clip to same number of features (safety)
|
| 394 |
if len(meta_df) > len(df.columns):
|
| 395 |
meta_df = meta_df.iloc[: len(df.columns)]
|
| 396 |
|
|
@@ -398,7 +353,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
|
|
| 398 |
|
| 399 |
meta_df = ensure_feature_metadata(df, meta_df)
|
| 400 |
|
| 401 |
-
# Build sidebar safely
|
| 402 |
feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
|
| 403 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 404 |
|
|
@@ -409,11 +363,9 @@ else:
|
|
| 409 |
|
| 410 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 411 |
|
| 412 |
-
|
| 413 |
# -------------------------
|
| 414 |
-
#
|
| 415 |
# -------------------------
|
| 416 |
-
|
| 417 |
tabs = st.tabs([
|
| 418 |
"Features",
|
| 419 |
"Visualization",
|
|
@@ -426,7 +378,7 @@ tabs = st.tabs([
|
|
| 426 |
"View Logs"
|
| 427 |
])
|
| 428 |
|
| 429 |
-
|
| 430 |
with tabs[0]:
|
| 431 |
st.subheader("Feature metadata")
|
| 432 |
st.dataframe(
|
|
@@ -436,24 +388,18 @@ with tabs[0]:
|
|
| 436 |
)
|
| 437 |
st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
|
| 438 |
|
| 439 |
-
|
| 440 |
-
# ----- Visualize tab
|
| 441 |
with tabs[1]:
|
| 442 |
st.subheader("Feature Visualization")
|
| 443 |
col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
|
| 444 |
bins = st.slider("Histogram bins", 10, 200, 50)
|
| 445 |
|
| 446 |
-
# --- Improved Histogram with style ---
|
| 447 |
fig, ax = plt.subplots(figsize=(8, 4))
|
| 448 |
sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
|
| 449 |
-
ax.set_title(f"Distribution of {col
|
| 450 |
-
ax.set_xlabel(col.replace("_", " ").title(), fontsize=10)
|
| 451 |
-
ax.set_ylabel("Frequency", fontsize=10)
|
| 452 |
-
sns.despine()
|
| 453 |
st.pyplot(fig, clear_figure=True)
|
| 454 |
st.write(df[col].describe().to_frame().T)
|
| 455 |
|
| 456 |
-
# --- Add PCA scatter visualization ---
|
| 457 |
if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
|
| 458 |
st.markdown("### PCA Feature Space — Colored by Operating Mode")
|
| 459 |
fig2, ax2 = plt.subplots(figsize=(6, 5))
|
|
@@ -462,14 +408,9 @@ with tabs[1]:
|
|
| 462 |
x="pca_1", y="pca_2", hue="operating_mode",
|
| 463 |
palette="tab10", alpha=0.7, s=40, ax=ax2
|
| 464 |
)
|
| 465 |
-
ax2.set_title("Operating Mode Clusters (PCA Projection)"
|
| 466 |
-
ax2.set_xlabel("PCA 1")
|
| 467 |
-
ax2.set_ylabel("PCA 2")
|
| 468 |
-
ax2.legend(title="Operating Mode", bbox_to_anchor=(1.05, 1), loc="upper left")
|
| 469 |
-
sns.despine()
|
| 470 |
st.pyplot(fig2, clear_figure=True)
|
| 471 |
|
| 472 |
-
|
| 473 |
# ----- Correlations tab
|
| 474 |
with tabs[2]:
|
| 475 |
st.subheader("Correlation explorer")
|
|
@@ -478,14 +419,9 @@ with tabs[2]:
|
|
| 478 |
if len(corr_sel) >= 2:
|
| 479 |
corr = df[corr_sel].corr()
|
| 480 |
fig, ax = plt.subplots(figsize=(10,8))
|
| 481 |
-
sns.heatmap(
|
| 482 |
-
|
| 483 |
-
linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax
|
| 484 |
-
)
|
| 485 |
-
ax.set_title("Feature Correlation Matrix", fontsize=12)
|
| 486 |
-
sns.despine()
|
| 487 |
st.pyplot(fig, clear_figure=True)
|
| 488 |
-
|
| 489 |
else:
|
| 490 |
st.info("Choose at least 2 numeric features to compute correlation.")
|
| 491 |
|
|
@@ -494,13 +430,10 @@ with tabs[3]:
|
|
| 494 |
st.subheader("Summary statistics (numeric features)")
|
| 495 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 496 |
|
| 497 |
-
|
| 498 |
-
# ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) -----
|
| 499 |
with tabs[4]:
|
| 500 |
-
st.subheader("
|
| 501 |
|
| 502 |
-
# --- Step 0: High-level Use Case (keeps previous defaults) ---
|
| 503 |
-
st.markdown("### Choose Industrial Use Case ")
|
| 504 |
use_case = st.selectbox(
|
| 505 |
"Select Use Case",
|
| 506 |
[
|
|
@@ -516,11 +449,10 @@ with tabs[4]:
|
|
| 516 |
index=1
|
| 517 |
)
|
| 518 |
|
| 519 |
-
# Map use-case -> defaults (same as before)
|
| 520 |
use_case_config = {
|
| 521 |
"Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
|
| 522 |
"EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
|
| 523 |
-
"Casting Quality Optimization": {"target": "surface_temp"
|
| 524 |
"Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
|
| 525 |
"Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
|
| 526 |
"Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
|
|
@@ -531,81 +463,44 @@ with tabs[4]:
|
|
| 531 |
target = cfg["target"]
|
| 532 |
model_hint = cfg["model_hint"]
|
| 533 |
|
| 534 |
-
# --- Feature auto-suggestion (keeps your earlier heuristic) ---
|
| 535 |
suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
|
| 536 |
if len(suggested) < 6:
|
| 537 |
-
suggested = [c for c in numeric_cols if any(k in c for k in ["temp",
|
| 538 |
if len(suggested) < 6:
|
| 539 |
suggested = numeric_cols[:50]
|
| 540 |
|
| 541 |
features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
|
| 542 |
st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
|
| 543 |
|
| 544 |
-
# --- Data sampling controls ---
|
| 545 |
max_rows = min(df.shape[0], 20000)
|
| 546 |
-
sample_size = st.slider("Sample rows
|
| 547 |
-
|
| 548 |
sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 549 |
X = sub_df[features].fillna(0)
|
| 550 |
y = sub_df[target].fillna(0)
|
| 551 |
|
| 552 |
-
# --- Ensemble control UI ---
|
| 553 |
st.markdown("### Ensemble & AutoML Settings")
|
| 554 |
-
max_trials = st.slider("Optuna trials per family
|
| 555 |
-
top_k = st.slider("Max base models
|
| 556 |
-
allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost
|
| 557 |
|
| 558 |
-
|
| 559 |
-
available_models = ["RandomForest", "ExtraTrees"] # always available (sklearn)
|
| 560 |
optional_families = {}
|
| 561 |
if allow_advanced:
|
| 562 |
try:
|
| 563 |
-
import xgboost as xgb
|
| 564 |
-
|
| 565 |
-
available_models.append("XGBoost")
|
| 566 |
-
except Exception:
|
| 567 |
-
optional_families["XGBoost"] = False
|
| 568 |
try:
|
| 569 |
-
import lightgbm as lgb
|
| 570 |
-
|
| 571 |
-
available_models.append("LightGBM")
|
| 572 |
-
except Exception:
|
| 573 |
-
optional_families["LightGBM"] = False
|
| 574 |
-
try:
|
| 575 |
-
import catboost as cb
|
| 576 |
-
optional_families["CatBoost"] = True
|
| 577 |
-
available_models.append("CatBoost")
|
| 578 |
-
except Exception:
|
| 579 |
-
optional_families["CatBoost"] = False
|
| 580 |
-
try:
|
| 581 |
-
# TabPFN is often packaged differently; attempt import but it's optional
|
| 582 |
-
import tabpfn
|
| 583 |
-
optional_families["TabPFN"] = True
|
| 584 |
-
available_models.append("TabPFN")
|
| 585 |
-
except Exception:
|
| 586 |
-
optional_families["TabPFN"] = False
|
| 587 |
try:
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
optional_families["FTTransformer"] = True
|
| 591 |
-
available_models.append("FTTransformer")
|
| 592 |
-
except Exception:
|
| 593 |
-
optional_families["FTTransformer"] = False
|
| 594 |
|
| 595 |
st.markdown(f"Available model families: {', '.join(available_models)}")
|
| 596 |
|
| 597 |
-
# --- Optuna tuning routine per family ---
|
| 598 |
-
import optuna
|
| 599 |
-
from sklearn.model_selection import cross_val_score, KFold
|
| 600 |
-
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
|
| 601 |
-
from sklearn.linear_model import Ridge
|
| 602 |
-
from sklearn.neural_network import MLPRegressor
|
| 603 |
-
from sklearn.metrics import r2_score, mean_squared_error
|
| 604 |
-
|
| 605 |
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
|
| 606 |
-
"""Tune one model family using Optuna
|
| 607 |
def obj(trial):
|
| 608 |
-
# sample hyperparams per family
|
| 609 |
if family_name == "RandomForest":
|
| 610 |
n_estimators = trial.suggest_int("n_estimators", 100, 800)
|
| 611 |
max_depth = trial.suggest_int("max_depth", 4, 30)
|
|
@@ -618,33 +513,21 @@ with tabs[4]:
|
|
| 618 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 619 |
max_depth = trial.suggest_int("max_depth", 3, 12)
|
| 620 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 621 |
-
m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0
|
| 622 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 623 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 624 |
max_depth = trial.suggest_int("max_depth", 3, 16)
|
| 625 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 626 |
-
m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1
|
| 627 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 628 |
iterations = trial.suggest_int("iterations", 200, 1000)
|
| 629 |
depth = trial.suggest_int("depth", 4, 10)
|
| 630 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 631 |
-
m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0
|
| 632 |
-
elif family_name == "MLP":
|
| 633 |
-
hidden = trial.suggest_int("hidden_layer_sizes", 32, 512, log=True)
|
| 634 |
-
lr = trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True)
|
| 635 |
-
m = MLPRegressor(hidden_layer_sizes=(hidden,), learning_rate_init=lr, max_iter=500, random_state=random_state)
|
| 636 |
-
elif family_name == "TabPFN" and optional_families.get("TabPFN"):
|
| 637 |
-
# TabPFN often works without hyperparams exposure; return a surrogate score using quick fit
|
| 638 |
-
# We'll call its predict_proba style API if available; as fallback use a mean score to let stacking consider it.
|
| 639 |
-
# For tuning, just return a placeholder; we'll build model object later.
|
| 640 |
-
return 0.0
|
| 641 |
else:
|
| 642 |
-
|
| 643 |
-
m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1)
|
| 644 |
-
|
| 645 |
-
# use negative RMSE if better for our domain? keep R2 for generality
|
| 646 |
try:
|
| 647 |
-
scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3
|
| 648 |
return float(np.mean(scores))
|
| 649 |
except Exception:
|
| 650 |
return -999.0
|
|
@@ -652,636 +535,232 @@ with tabs[4]:
|
|
| 652 |
study = optuna.create_study(direction="maximize")
|
| 653 |
study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
|
| 654 |
best = study.best_trial.params if study.trials else {}
|
| 655 |
-
# instantiate best model
|
| 656 |
try:
|
| 657 |
if family_name == "RandomForest":
|
| 658 |
-
model = RandomForestRegressor(
|
| 659 |
elif family_name == "ExtraTrees":
|
| 660 |
-
model = ExtraTreesRegressor(
|
| 661 |
elif family_name == "XGBoost" and optional_families.get("XGBoost"):
|
| 662 |
-
model = xgb.XGBRegressor(
|
| 663 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 664 |
-
model = lgb.LGBMRegressor(
|
| 665 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 666 |
-
model = cb.CatBoostRegressor(
|
| 667 |
-
elif family_name == "MLP":
|
| 668 |
-
model = MLPRegressor(hidden_layer_sizes=(best.get("hidden_layer_sizes",128),), learning_rate_init=best.get("learning_rate_init",0.001), max_iter=500, random_state=42)
|
| 669 |
-
elif family_name == "TabPFN" and optional_families.get("TabPFN"):
|
| 670 |
-
# We'll create a small wrapper for TabPFN later on train time
|
| 671 |
-
model = "TabPFN_placeholder"
|
| 672 |
else:
|
| 673 |
-
model = RandomForestRegressor(
|
| 674 |
except Exception:
|
| 675 |
-
model = RandomForestRegressor(
|
| 676 |
|
| 677 |
-
# compute cross-validated score for the best model
|
| 678 |
try:
|
| 679 |
-
score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3
|
| 680 |
except Exception:
|
| 681 |
score = -999.0
|
|
|
|
| 682 |
|
| 683 |
-
return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name, "study": study}
|
| 684 |
-
|
| 685 |
-
# --- Run tuning across available families (user triggered) ---
|
| 686 |
-
if "run_automl_clicked" not in st.session_state:
|
| 687 |
-
st.session_state["run_automl_clicked"] = False
|
| 688 |
-
|
| 689 |
if st.button("Run expanded AutoML + Stacking"):
|
| 690 |
st.session_state["run_automl_clicked"] = True
|
| 691 |
-
|
| 692 |
if st.session_state["run_automl_clicked"]:
|
| 693 |
log("AutoML + Stacking initiated.")
|
| 694 |
-
with st.spinner("Tuning multiple families
|
| 695 |
families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
|
| 696 |
if allow_advanced:
|
| 697 |
if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
|
| 698 |
if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
|
| 699 |
if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
|
| 700 |
-
if optional_families.get("TabPFN"): families_to_try.append("TabPFN")
|
| 701 |
-
if optional_families.get("FTTransformer"): families_to_try.append("FTTransformer")
|
| 702 |
|
| 703 |
tuned_results = []
|
| 704 |
for fam in families_to_try:
|
| 705 |
log(f"Tuning family: {fam}")
|
| 706 |
st.caption(f"Tuning family: {fam}")
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
if isinstance(res, dict) and "model_obj" in res:
|
| 710 |
-
tuned_results.append(res)
|
| 711 |
-
else:
|
| 712 |
-
st.warning(f"Family {fam} returned unexpected tune result: {res}")
|
| 713 |
-
log("All families tuned successfully.")
|
| 714 |
-
|
| 715 |
-
# build leaderboard DataFrame
|
| 716 |
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
|
| 717 |
lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
|
| 718 |
st.markdown("### Tuning Leaderboard (by CV R²)")
|
| 719 |
st.dataframe(lb[["family","cv_r2"]].round(4))
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
ax_perf.set_xlabel("Cross-Validated R² Score", fontsize=10)
|
| 727 |
-
ax_perf.set_ylabel("Model Family", fontsize=10)
|
| 728 |
-
ax_perf.set_title("Performance Comparison Across Model Families", fontsize=12)
|
| 729 |
-
ax_perf.invert_yaxis()
|
| 730 |
-
for i, v in enumerate(lb["cv_r2"]):
|
| 731 |
-
ax_perf.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=9)
|
| 732 |
-
sns.despine()
|
| 733 |
-
st.pyplot(fig_perf, clear_figure=True)
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
# --- Build base-models and collect out-of-fold preds for stacking ---
|
| 737 |
st.markdown("### Building base models & out-of-fold predictions for stacking")
|
| 738 |
-
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 739 |
-
base_models = []
|
| 740 |
-
oof_preds = pd.DataFrame(index=X.index)
|
| 741 |
-
|
| 742 |
-
for idx, row in lb.iterrows():
|
| 743 |
-
fam = row["family"]
|
| 744 |
-
model_entry = next((r for r in tuned_results if r["family"] == fam), None)
|
| 745 |
-
if model_entry is None:
|
| 746 |
-
continue
|
| 747 |
-
model_obj = model_entry["model_obj"]
|
| 748 |
-
# train out-of-fold predictions
|
| 749 |
-
oof = np.zeros(X.shape[0])
|
| 750 |
-
for tr_idx, val_idx in kf.split(X):
|
| 751 |
-
X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
|
| 752 |
-
y_tr = y.iloc[tr_idx]
|
| 753 |
-
# fit family-specific wrapper (TabPFN/FTTransformer special-case)
|
| 754 |
-
if model_obj == "TabPFN_placeholder":
|
| 755 |
-
try:
|
| 756 |
-
# TabPFN expects specific API; create a simple fallback: use RandomForest to approximate
|
| 757 |
-
tmp = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
|
| 758 |
-
tmp.fit(X_tr, y_tr)
|
| 759 |
-
oof[val_idx] = tmp.predict(X_val)
|
| 760 |
-
except Exception:
|
| 761 |
-
oof[val_idx] = np.mean(y_tr)
|
| 762 |
-
else:
|
| 763 |
-
try:
|
| 764 |
-
model_obj.fit(X_tr, y_tr)
|
| 765 |
-
oof[val_idx] = model_obj.predict(X_val)
|
| 766 |
-
except Exception:
|
| 767 |
-
# fallback to mean
|
| 768 |
-
oof[val_idx] = np.mean(y_tr)
|
| 769 |
-
oof_preds[f"{fam}_oof"] = oof
|
| 770 |
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
else:
|
| 778 |
-
model_entry["model_obj"].fit(X, y)
|
| 779 |
-
fitted = model_entry["model_obj"]
|
| 780 |
-
except Exception:
|
| 781 |
-
fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
|
| 782 |
-
fitted.fit(X, y)
|
| 783 |
|
| 784 |
-
|
|
|
|
| 785 |
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
diversity = {col: 1 - corr_matrix[col].drop(col).mean() for col in corr_matrix.columns}
|
| 793 |
-
summary = []
|
| 794 |
-
for bm in base_models:
|
| 795 |
-
col = f"{bm['family']}_oof"
|
| 796 |
-
summary.append({"family": bm["family"], "cv_r2": bm["cv_r2"], "diversity": diversity.get(col, 0.0)})
|
| 797 |
-
summary_df = pd.DataFrame(summary).sort_values(["cv_r2", "diversity"], ascending=[False, False]).reset_index(drop=True)
|
| 798 |
-
st.markdown("### Base Model Summary (cv_r2, diversity)")
|
| 799 |
-
st.dataframe(summary_df.round(4))
|
| 800 |
-
|
| 801 |
-
# select top_k by cv_r2 and diversity combined
|
| 802 |
-
selected = summary_df.sort_values(["cv_r2","diversity"], ascending=[False, False]).head(top_k)["family"].tolist()
|
| 803 |
-
st.markdown(f"Selected for stacking (top {top_k}): {selected}")
|
| 804 |
-
|
| 805 |
-
# build stacking training data (OOF preds for selected)
|
| 806 |
-
selected_cols = [f"{s}_oof" for s in selected]
|
| 807 |
-
X_stack = oof_preds[selected_cols].fillna(0)
|
| 808 |
-
meta = Ridge(alpha=1.0)
|
| 809 |
-
meta.fit(X_stack, y)
|
| 810 |
-
|
| 811 |
-
# --- Robust holdout evaluation & SHAP (safe for deployment) ---
|
| 812 |
-
# Split for holdout
|
| 813 |
-
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 814 |
-
|
| 815 |
-
# Helper to always produce scalar-safe mean
|
| 816 |
-
def scalar_mean(arr):
|
| 817 |
try:
|
| 818 |
-
|
|
|
|
|
|
|
| 819 |
except Exception:
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
shap_vals = shap_vals.reshape(shap_vals.shape[0], -1)
|
| 910 |
-
|
| 911 |
-
# Align SHAP features to DataFrame
|
| 912 |
-
if shap_vals.shape[1] != sample_X.shape[1]:
|
| 913 |
-
min_feats = min(shap_vals.shape[1], sample_X.shape[1])
|
| 914 |
-
shap_vals = shap_vals[:, :min_feats]
|
| 915 |
-
sample_X = sample_X.iloc[:, :min_feats]
|
| 916 |
-
|
| 917 |
-
# Compute robust means
|
| 918 |
-
mean_abs = np.abs(shap_vals).mean(axis=0)
|
| 919 |
-
mean_sign = np.sign(shap_vals).mean(axis=0)
|
| 920 |
-
|
| 921 |
-
importance = pd.DataFrame({
|
| 922 |
-
"Feature": sample_X.columns,
|
| 923 |
-
"Mean |SHAP|": mean_abs,
|
| 924 |
-
"Mean SHAP Sign": mean_sign
|
| 925 |
-
}).sort_values("Mean |SHAP|", ascending=False)
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
# Display Top 5 Drivers
|
| 929 |
-
st.markdown("### Top 5 Operational Drivers Influencing Target")
|
| 930 |
-
st.dataframe(importance.head(5).style.format({"Mean |SHAP|": "{:.3f}", "Mean SHAP Sign": "{:.3f}"}))
|
| 931 |
-
|
| 932 |
-
# Direction-based recommendations
|
| 933 |
-
recommendations = []
|
| 934 |
-
for _, row in importance.head(5).iterrows():
|
| 935 |
-
f = row["Feature"]
|
| 936 |
-
s = row["Mean SHAP Sign"]
|
| 937 |
-
if s > 0.05:
|
| 938 |
-
recommendations.append(f"Increase `{f}` likely increases `{target}`")
|
| 939 |
-
elif s < -0.05:
|
| 940 |
-
recommendations.append(f"Decrease `{f}` likely increases `{target}`")
|
| 941 |
-
else:
|
| 942 |
-
recommendations.append(f" `{f}` is neutral or nonlinear for `{target}`")
|
| 943 |
-
|
| 944 |
-
st.markdown("### Suggested Operator Adjustments (Model-Inferred)")
|
| 945 |
-
st.write("\n".join(recommendations))
|
| 946 |
-
|
| 947 |
-
# Delta recommendations vs previous shift
|
| 948 |
-
prev_shift = df.tail(200).mean(numeric_only=True)
|
| 949 |
-
recommended_shift = prev_shift.copy()
|
| 950 |
-
for rec in recommendations:
|
| 951 |
-
if "Increase" in rec:
|
| 952 |
-
name = rec.split('`')[1]
|
| 953 |
-
if name in recommended_shift:
|
| 954 |
-
recommended_shift[name] *= 1.03 # +3%
|
| 955 |
-
elif "Decrease" in rec:
|
| 956 |
-
name = rec.split('`')[1]
|
| 957 |
-
if name in recommended_shift:
|
| 958 |
-
recommended_shift[name] *= 0.97 # -3%
|
| 959 |
-
|
| 960 |
-
# Delta table
|
| 961 |
-
st.markdown("### Shift Adjustment Summary (vs Previous 200 Samples)")
|
| 962 |
-
deltas = pd.DataFrame({
|
| 963 |
-
"Current Avg": prev_shift,
|
| 964 |
-
"Suggested": recommended_shift,
|
| 965 |
-
"Δ (%)": ((recommended_shift - prev_shift) / prev_shift * 100)
|
| 966 |
-
}).loc[[r.split('`')[1] for r in recommendations if '`' in r]].round(2)
|
| 967 |
-
|
| 968 |
-
st.dataframe(deltas.fillna(0).style.format("{:.2f}"))
|
| 969 |
-
log("Operator advisory system executed successfully.")
|
| 970 |
-
|
| 971 |
-
# Optional: LLM-generated human-friendly summary
|
| 972 |
-
st.markdown("### Natural Language Operator Note")
|
| 973 |
-
try:
|
| 974 |
-
import importlib.util
|
| 975 |
-
if importlib.util.find_spec("transformers"):
|
| 976 |
-
from transformers import pipeline
|
| 977 |
-
tiny_llm_path = os.path.join(LOG_DIR, "cached_tiny_llm")
|
| 978 |
-
if os.path.exists(os.path.join(tiny_llm_path, "config.json")):
|
| 979 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 980 |
-
model = AutoModelForCausalLM.from_pretrained(tiny_llm_path)
|
| 981 |
-
tokenizer = AutoTokenizer.from_pretrained(tiny_llm_path)
|
| 982 |
-
assistant = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
| 983 |
-
else:
|
| 984 |
-
assistant = pipeline("text2text-generation", model="google/flan-t5-small")
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
llm_prompt = f"""
|
| 989 |
-
You are a metallurgical process advisor working in a steel manufacturing unit.
|
| 990 |
-
Based on these recommendations:
|
| 991 |
-
{recommendations}
|
| 992 |
-
and these shift averages:
|
| 993 |
-
{deltas.to_dict(orient='index')}
|
| 994 |
-
Write a concise 3-line message to the operator suggesting what to adjust this shift.
|
| 995 |
-
"""
|
| 996 |
-
resp = assistant(llm_prompt, max_new_tokens=120)[0]["generated_text"]
|
| 997 |
-
st.info(resp)
|
| 998 |
-
log("Operator LLM advisory note generated successfully.")
|
| 999 |
-
else:
|
| 1000 |
-
st.warning("Transformers not available — install it for text generation.")
|
| 1001 |
-
except Exception as e:
|
| 1002 |
-
st.warning(f"LLM advisory generation skipped: {e}")
|
| 1003 |
-
|
| 1004 |
-
else:
|
| 1005 |
-
st.info("No suitable model found for operator advisory system.")
|
| 1006 |
-
except Exception as e:
|
| 1007 |
-
st.error(f"Operator advisory system failed: {e}")
|
| 1008 |
-
log(f"Operator advisory error: {e}")
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
c1, c2 = st.columns(2)
|
| 1013 |
-
c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
|
| 1014 |
-
c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")
|
| 1015 |
-
|
| 1016 |
-
# Scatter comparison
|
| 1017 |
-
fig, ax = plt.subplots(figsize=(7, 4))
|
| 1018 |
-
ax.scatter(y_val, y_meta_pred, alpha=0.6)
|
| 1019 |
-
ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
|
| 1020 |
-
ax.set_xlabel("Actual")
|
| 1021 |
-
ax.set_ylabel("Stacked Predicted")
|
| 1022 |
-
st.pyplot(fig)
|
| 1023 |
-
|
| 1024 |
-
# Save trained stack artifacts
|
| 1025 |
-
joblib.dump(meta, ENSEMBLE_PATH)
|
| 1026 |
-
st.caption(f"Stacked ensemble snapshot updated → {ENSEMBLE_PATH}")
|
| 1027 |
-
log(f"Ensemble model updated for use case: {use_case}")
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
# Explainability
|
| 1031 |
-
st.markdown("### Explainability (approximate)")
|
| 1032 |
-
try:
|
| 1033 |
-
top_base = next((b for b in base_models if b["family"] == selected[0]), None)
|
| 1034 |
-
if top_base and hasattr(top_base["model"], "predict"):
|
| 1035 |
-
sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
|
| 1036 |
-
if any(k in top_base["family"] for k in ["XGBoost", "LightGBM", "RandomForest", "ExtraTrees", "CatBoost"]):
|
| 1037 |
-
expl = shap.TreeExplainer(top_base["model"])
|
| 1038 |
-
shap_vals = expl.shap_values(sample_X)
|
| 1039 |
-
fig_sh = plt.figure(figsize=(8, 6))
|
| 1040 |
-
shap.summary_plot(shap_vals, sample_X, show=False)
|
| 1041 |
-
st.pyplot(fig_sh)
|
| 1042 |
-
else:
|
| 1043 |
-
st.info("Top model not tree-based; skipping SHAP summary.")
|
| 1044 |
else:
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
"
|
| 1055 |
-
"
|
| 1056 |
-
"
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
else:
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
"final_r2": float(final_r2),
|
| 1065 |
-
"final_rmse": float(final_rmse),
|
| 1066 |
-
"target": target,
|
| 1067 |
-
"use_case": use_case
|
| 1068 |
-
})
|
| 1069 |
-
|
| 1070 |
-
# Persist SHAP-based recommendations for reuse across reruns
|
| 1071 |
-
if "shap_recommendations" not in st.session_state:
|
| 1072 |
-
st.session_state["shap_recommendations"] = recommendations
|
| 1073 |
-
else:
|
| 1074 |
-
st.session_state["shap_recommendations"] = recommendations
|
| 1075 |
-
|
| 1076 |
-
# --- AI Recommendation Assistant ---
|
| 1077 |
-
st.markdown("---")
|
| 1078 |
-
st.subheader("AI Recommendation Assistant ")
|
| 1079 |
-
st.caption("Generates quick local AI suggestions — no file writes required.")
|
| 1080 |
-
|
| 1081 |
-
# Create or reset button states safely
|
| 1082 |
-
if "hf_clicked" not in st.session_state:
|
| 1083 |
-
st.session_state["hf_clicked"] = False
|
| 1084 |
-
if "llm_result" not in st.session_state:
|
| 1085 |
-
st.session_state["llm_result"] = None
|
| 1086 |
-
|
| 1087 |
-
# --- Buttons ---
|
| 1088 |
-
col1, col2 = st.columns(2)
|
| 1089 |
-
# Click handlers with isolated session flags
|
| 1090 |
-
if col1.button("Get AI Recommendation (via HF API)", key="ai_reco"):
|
| 1091 |
-
st.session_state["hf_clicked"] = True
|
| 1092 |
-
st.session_state["hf_ran_once"] = False # reset internal control
|
| 1093 |
-
|
| 1094 |
-
if col2.button("Reset Recommendation Output"):
|
| 1095 |
-
st.session_state["hf_clicked"] = False
|
| 1096 |
-
st.session_state["llm_result"] = None
|
| 1097 |
-
st.session_state["hf_ran_once"] = False
|
| 1098 |
-
st.info("Recommendation output cleared.")
|
| 1099 |
-
|
| 1100 |
-
# Execute API call only once
|
| 1101 |
-
if st.session_state["hf_clicked"] and not st.session_state.get("hf_ran_once", False):
|
| 1102 |
-
summary = st.session_state.get("automl_summary", {})
|
| 1103 |
-
if not summary:
|
| 1104 |
-
st.warning("Please run AutoML first to generate context.")
|
| 1105 |
-
else:
|
| 1106 |
-
try:
|
| 1107 |
-
import requests, json
|
| 1108 |
-
st.info("Contacting Hugging Face Inference API (Mixtral-8x7B-Instruct)…")
|
| 1109 |
-
|
| 1110 |
-
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
|
| 1111 |
-
headers = {"Authorization": f"Bearer {st.secrets['HF_TOKEN']}"}
|
| 1112 |
-
prompt = f"""
|
| 1113 |
-
You are an ML model tuning advisor.
|
| 1114 |
-
Based on this AutoML summary, suggest 3 concise, actionable steps
|
| 1115 |
-
to improve model performance if overfitting, underfitting, or data-quality issues are observed.
|
| 1116 |
-
|
| 1117 |
-
Use case: {summary.get('use_case')}
|
| 1118 |
-
Target: {summary.get('target')}
|
| 1119 |
-
Final R²: {summary.get('final_r2')}
|
| 1120 |
-
Final RMSE: {summary.get('final_rmse')}
|
| 1121 |
-
Leaderboard: {summary.get('leaderboard')}
|
| 1122 |
-
"""
|
| 1123 |
-
|
| 1124 |
-
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.7}}
|
| 1125 |
-
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
|
| 1126 |
-
response.raise_for_status()
|
| 1127 |
-
result = response.json()
|
| 1128 |
-
|
| 1129 |
-
if isinstance(result, list) and "generated_text" in result[0]:
|
| 1130 |
-
text = result[0]["generated_text"]
|
| 1131 |
-
elif isinstance(result, dict) and "generated_text" in result:
|
| 1132 |
-
text = result["generated_text"]
|
| 1133 |
-
else:
|
| 1134 |
-
text = json.dumps(result, indent=2)
|
| 1135 |
-
|
| 1136 |
-
st.session_state["llm_result"] = text.strip()
|
| 1137 |
-
st.session_state["hf_ran_once"] = True
|
| 1138 |
-
st.success("✅ AI Recommendation (Mixtral-8x7B-Instruct):")
|
| 1139 |
-
st.markdown(st.session_state["llm_result"])
|
| 1140 |
-
|
| 1141 |
-
except Exception as e:
|
| 1142 |
-
st.error(f"HF Inference API call failed: {e}")
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
# --- Always display cached result, even on rerun ---
|
| 1147 |
-
if st.session_state["llm_result"]:
|
| 1148 |
-
st.markdown("### Cached AI Recommendation:")
|
| 1149 |
-
st.markdown(st.session_state["llm_result"])
|
| 1150 |
|
| 1151 |
-
|
| 1152 |
-
# ----- Target & Business Impact tab
|
| 1153 |
with tabs[5]:
|
| 1154 |
-
st.subheader("
|
| 1155 |
-
st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
|
| 1156 |
-
|
| 1157 |
target_table = pd.DataFrame([
|
| 1158 |
-
["
|
| 1159 |
-
["
|
| 1160 |
-
["
|
| 1161 |
-
["
|
| 1162 |
-
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
|
| 1167 |
-
st.dataframe(target_table, width="stretch")
|
| 1168 |
-
|
| 1169 |
-
st.markdown("---")
|
| 1170 |
-
st.subheader("Business Framing for Clients")
|
| 1171 |
-
st.markdown("These metrics show approximate annual benefits from small process improvements.")
|
| 1172 |
-
|
| 1173 |
-
business_table = pd.DataFrame([
|
| 1174 |
-
["Energy consumption", "400 kWh/ton", "₹35–60 L"],
|
| 1175 |
-
["Electrode wear", "1.8 kg/ton", "₹10 L"],
|
| 1176 |
-
["Refractory wear", "3 mm/heat", "₹15 L"],
|
| 1177 |
-
["Oxygen usage", "40 Nm³/ton", "₹20 L"],
|
| 1178 |
-
["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
|
| 1179 |
-
], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
|
| 1180 |
-
|
| 1181 |
-
st.dataframe(business_table, width="stretch")
|
| 1182 |
-
st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
|
| 1183 |
-
|
| 1184 |
-
# ----- Bibliography tab
|
| 1185 |
with tabs[6]:
|
| 1186 |
-
st.subheader("Annotated Bibliography
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
bib_data = [
|
| 1193 |
-
{
|
| 1194 |
-
"title": "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
|
| 1195 |
-
"authors": "Yan et al. (2024)",
|
| 1196 |
-
"notes": "Soft sensors for furnace and tap temperature; validates `furnace_temp` and `tap_temp` targets.",
|
| 1197 |
-
"url": "https://doi.org/10.1021/acsomega.4c01254"
|
| 1198 |
-
},
|
| 1199 |
-
{
|
| 1200 |
-
"title": "Optimisation of Operator Support Systems through Artificial Intelligence for the Cast Steel Industry",
|
| 1201 |
-
"authors": "Ojeda Roldán et al. (2022)",
|
| 1202 |
-
"notes": "Reinforcement learning for oxygen blowing and endpoint control; supports temperature and carbon targets.",
|
| 1203 |
-
"url": "https://doi.org/10.3390/jmmp6020034"
|
| 1204 |
-
},
|
| 1205 |
-
{
|
| 1206 |
-
"title": "Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking",
|
| 1207 |
-
"authors": "Zhuo et al. (2024)",
|
| 1208 |
-
"notes": "Links arc power, temperature, and energy KPIs — validates `energy_efficiency` and `power_density`.",
|
| 1209 |
-
"url": "https://doi.org/10.3390/met15010113"
|
| 1210 |
-
},
|
| 1211 |
-
{
|
| 1212 |
-
"title": "Dynamic EAF Modeling and Slag Foaming Index Prediction",
|
| 1213 |
-
"authors": "MacRosty et al.",
|
| 1214 |
-
"notes": "Supports refractory and heat-flux-based wear prediction — validates `lining_thickness` target.",
|
| 1215 |
-
"url": "https://www.sciencedirect.com/science/article/pii/S0921883123004019"
|
| 1216 |
-
},
|
| 1217 |
-
{
|
| 1218 |
-
"title": "Machine Learning for Yield Optimization in Continuous Casting",
|
| 1219 |
-
"authors": "Springer (2023)",
|
| 1220 |
-
"notes": "ML for yield ratio and defect minimization; supports `yield_ratio` target.",
|
| 1221 |
-
"url": "https://link.springer.com/article/10.1007/s40964-023-00592-7"
|
| 1222 |
-
}
|
| 1223 |
]
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
for
|
| 1231 |
-
|
| 1232 |
-
f"**[{row['title']}]({row['url']})** \n"
|
| 1233 |
-
f"*{row['authors']}* \n"
|
| 1234 |
-
f" _{row['notes']}_ \n",
|
| 1235 |
-
unsafe_allow_html=True
|
| 1236 |
-
)
|
| 1237 |
-
st.info("Click any paper title above to open it in a new tab.")
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
st.markdown("""
|
| 1241 |
-
**Feature ↔ Target Justification**
|
| 1242 |
-
- `furnace_temp`, `tap_temp` → Process temperature (Yan 2024, Ojeda 2022)
|
| 1243 |
-
- `yield_ratio` → Production yield (Springer 2023)
|
| 1244 |
-
- `energy_efficiency`, `power_density` → Energy KPIs (Zhuo 2024)
|
| 1245 |
-
- `lining_thickness`, `slag_foaming_index` → Refractory & process health (MacRosty et al.)
|
| 1246 |
-
""")
|
| 1247 |
-
|
| 1248 |
-
st.info("Click any paper title above to open it in a new tab.")
|
| 1249 |
-
log("Bibliography tab rendered successfully.")
|
| 1250 |
-
|
| 1251 |
-
# -------------------------
|
| 1252 |
-
# Footer / Notes
|
| 1253 |
-
# -------------------------
|
| 1254 |
-
st.markdown("---")
|
| 1255 |
-
st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
# ----- Download tab
|
| 1259 |
-
with tabs[-2]:
|
| 1260 |
-
st.subheader(" Download Saved Files (Flat Log Mode)")
|
| 1261 |
-
|
| 1262 |
-
available_files = [f for f in os.listdir(LOG_DIR) if os.path.isfile(os.path.join(LOG_DIR, f))]
|
| 1263 |
-
if not available_files:
|
| 1264 |
-
st.info("No files found yet — run AutoML once to generate outputs.")
|
| 1265 |
else:
|
| 1266 |
-
for f in sorted(
|
| 1267 |
path = os.path.join(LOG_DIR, f)
|
| 1268 |
-
with open(path,
|
| 1269 |
-
st.download_button(
|
| 1270 |
-
label=f" Download {f}",
|
| 1271 |
-
data=fp,
|
| 1272 |
-
file_name=f,
|
| 1273 |
-
mime="application/octet-stream"
|
| 1274 |
-
)
|
| 1275 |
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
with tabs[-1]:
|
| 1280 |
-
st.subheader(" Master Log (append-in-place)")
|
| 1281 |
if os.path.exists(LOG_PATH):
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
st.
|
| 1285 |
-
st.download_button("Download Log", content, file_name="run_master.log")
|
| 1286 |
else:
|
| 1287 |
-
st.info("No
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sail_modex_stable.py
|
|
|
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import time
|
|
|
|
| 11 |
import joblib
|
| 12 |
import zipfile
|
| 13 |
import io
|
| 14 |
+
import gc
|
| 15 |
|
| 16 |
# ML imports
|
| 17 |
from sklearn.model_selection import train_test_split
|
| 18 |
+
from sklearn.linear_model import LinearRegression, Ridge
|
| 19 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
|
| 20 |
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
|
| 21 |
from sklearn.decomposition import PCA
|
|
|
|
| 25 |
# SHAP
|
| 26 |
import shap
|
| 27 |
|
| 28 |
+
# Optuna (used later)
|
| 29 |
+
import optuna
|
| 30 |
+
from sklearn.model_selection import cross_val_score, KFold
|
| 31 |
+
from sklearn.neural_network import MLPRegressor
|
| 32 |
|
| 33 |
# --- Safe defaults for Streamlit session state ---
|
| 34 |
defaults = {
|
|
|
|
| 42 |
for k, v in defaults.items():
|
| 43 |
st.session_state.setdefault(k, v)
|
| 44 |
|
|
|
|
| 45 |
if "llm_result" not in st.session_state:
|
| 46 |
st.session_state["llm_result"] = None
|
| 47 |
if "automl_summary" not in st.session_state:
|
|
|
|
| 54 |
# -------------------------
|
| 55 |
# Config & paths
|
| 56 |
# -------------------------
|
|
|
|
| 57 |
st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
|
| 58 |
plt.style.use("seaborn-v0_8-muted")
|
| 59 |
sns.set_palette("muted")
|
|
|
|
| 81 |
f.write(f"[{stamp}] {msg}\n")
|
| 82 |
print(msg)
|
| 83 |
|
|
|
|
| 84 |
log("=== Streamlit session started ===")
|
| 85 |
|
|
|
|
|
|
|
| 86 |
if os.path.exists("/data"):
|
| 87 |
st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
|
| 88 |
else:
|
| 89 |
st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
|
| 90 |
|
|
|
|
| 91 |
# -------------------------
|
| 92 |
# Utility: generate advanced dataset if missing
|
| 93 |
# -------------------------
|
|
|
|
| 102 |
Generates a large synthetic, physics-aligned dataset with many engineered features.
|
| 103 |
Allows control of variability per feature (through variance_overrides) or globally
|
| 104 |
(via global_variance_multiplier).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
"""
|
| 106 |
np.random.seed(random_seed)
|
| 107 |
os.makedirs(LOG_DIR, exist_ok=True)
|
|
|
|
| 298 |
existing = [meta_entry]
|
| 299 |
json.dump(existing, open(META_PATH, "w"), indent=2)
|
| 300 |
|
|
|
|
| 301 |
PDF_PATH = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
return CSV_PATH, META_PATH, PDF_PATH
|
| 303 |
|
| 304 |
# -------------------------
|
|
|
|
| 320 |
return df_local, pd.DataFrame(meta_local)
|
| 321 |
|
| 322 |
df, meta_df = load_data()
|
|
|
|
|
|
|
| 323 |
# -------------------------
|
| 324 |
+
# Sidebar filters & UI
|
| 325 |
# -------------------------
|
| 326 |
st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
| 327 |
|
|
|
|
| 329 |
"""Ensure metadata dataframe matches feature count & has required columns."""
|
| 330 |
required_cols = ["feature_name", "source_type", "formula", "remarks"]
|
| 331 |
|
|
|
|
| 332 |
if meta_df is None or len(meta_df) < len(df.columns):
|
| 333 |
meta_df = pd.DataFrame({
|
| 334 |
"feature_name": df.columns,
|
|
|
|
| 341 |
})
|
| 342 |
st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
|
| 343 |
else:
|
|
|
|
| 344 |
for col in required_cols:
|
| 345 |
if col not in meta_df.columns:
|
| 346 |
meta_df[col] = None
|
|
|
|
| 347 |
if meta_df["feature_name"].isna().all():
|
| 348 |
meta_df["feature_name"] = df.columns
|
|
|
|
| 349 |
if len(meta_df) > len(df.columns):
|
| 350 |
meta_df = meta_df.iloc[: len(df.columns)]
|
| 351 |
|
|
|
|
| 353 |
|
| 354 |
meta_df = ensure_feature_metadata(df, meta_df)
|
| 355 |
|
|
|
|
| 356 |
feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
|
| 357 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 358 |
|
|
|
|
| 363 |
|
| 364 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 365 |
|
|
|
|
| 366 |
# -------------------------
|
| 367 |
+
# Tabs layout
|
| 368 |
# -------------------------
|
|
|
|
| 369 |
tabs = st.tabs([
|
| 370 |
"Features",
|
| 371 |
"Visualization",
|
|
|
|
| 378 |
"View Logs"
|
| 379 |
])
|
| 380 |
|
| 381 |
+
# ----- Feature metadata
|
| 382 |
with tabs[0]:
|
| 383 |
st.subheader("Feature metadata")
|
| 384 |
st.dataframe(
|
|
|
|
| 388 |
)
|
| 389 |
st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
|
| 390 |
|
| 391 |
+
# ----- Visualization tab
|
|
|
|
| 392 |
with tabs[1]:
|
| 393 |
st.subheader("Feature Visualization")
|
| 394 |
col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
|
| 395 |
bins = st.slider("Histogram bins", 10, 200, 50)
|
| 396 |
|
|
|
|
| 397 |
fig, ax = plt.subplots(figsize=(8, 4))
|
| 398 |
sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
|
| 399 |
+
ax.set_title(f"Distribution of {col}", fontsize=12)
|
|
|
|
|
|
|
|
|
|
| 400 |
st.pyplot(fig, clear_figure=True)
|
| 401 |
st.write(df[col].describe().to_frame().T)
|
| 402 |
|
|
|
|
| 403 |
if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
|
| 404 |
st.markdown("### PCA Feature Space — Colored by Operating Mode")
|
| 405 |
fig2, ax2 = plt.subplots(figsize=(6, 5))
|
|
|
|
| 408 |
x="pca_1", y="pca_2", hue="operating_mode",
|
| 409 |
palette="tab10", alpha=0.7, s=40, ax=ax2
|
| 410 |
)
|
| 411 |
+
ax2.set_title("Operating Mode Clusters (PCA Projection)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
st.pyplot(fig2, clear_figure=True)
|
| 413 |
|
|
|
|
| 414 |
# ----- Correlations tab
|
| 415 |
with tabs[2]:
|
| 416 |
st.subheader("Correlation explorer")
|
|
|
|
| 419 |
if len(corr_sel) >= 2:
|
| 420 |
corr = df[corr_sel].corr()
|
| 421 |
fig, ax = plt.subplots(figsize=(10,8))
|
| 422 |
+
sns.heatmap(corr, cmap="RdBu_r", center=0, annot=True, fmt=".2f",
|
| 423 |
+
linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
st.pyplot(fig, clear_figure=True)
|
|
|
|
| 425 |
else:
|
| 426 |
st.info("Choose at least 2 numeric features to compute correlation.")
|
| 427 |
|
|
|
|
| 430 |
st.subheader("Summary statistics (numeric features)")
|
| 431 |
st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
|
| 432 |
|
| 433 |
+
# ----- AutoML + SHAP tab (Expanded)
|
|
|
|
| 434 |
with tabs[4]:
|
| 435 |
+
st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
|
| 436 |
|
|
|
|
|
|
|
| 437 |
use_case = st.selectbox(
|
| 438 |
"Select Use Case",
|
| 439 |
[
|
|
|
|
| 449 |
index=1
|
| 450 |
)
|
| 451 |
|
|
|
|
| 452 |
use_case_config = {
|
| 453 |
"Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
|
| 454 |
"EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
|
| 455 |
+
"Casting Quality Optimization": {"target": "surface_temp", "model_hint": "GradientBoosting"},
|
| 456 |
"Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
|
| 457 |
"Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
|
| 458 |
"Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
|
|
|
|
| 463 |
target = cfg["target"]
|
| 464 |
model_hint = cfg["model_hint"]
|
| 465 |
|
|
|
|
| 466 |
suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
|
| 467 |
if len(suggested) < 6:
|
| 468 |
+
suggested = [c for c in numeric_cols if any(k in c for k in ["temp","power","energy","pressure","yield"])]
|
| 469 |
if len(suggested) < 6:
|
| 470 |
suggested = numeric_cols[:50]
|
| 471 |
|
| 472 |
features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
|
| 473 |
st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
|
| 474 |
|
|
|
|
| 475 |
max_rows = min(df.shape[0], 20000)
|
| 476 |
+
sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
|
|
|
|
| 477 |
sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 478 |
X = sub_df[features].fillna(0)
|
| 479 |
y = sub_df[target].fillna(0)
|
| 480 |
|
|
|
|
| 481 |
st.markdown("### Ensemble & AutoML Settings")
|
| 482 |
+
max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
|
| 483 |
+
top_k = st.slider("Max base models in ensemble", 2, 8, 5)
|
| 484 |
+
allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost)", value=True)
|
| 485 |
|
| 486 |
+
available_models = ["RandomForest", "ExtraTrees"]
|
|
|
|
| 487 |
optional_families = {}
|
| 488 |
if allow_advanced:
|
| 489 |
try:
|
| 490 |
+
import xgboost as xgb; optional_families["XGBoost"] = True; available_models.append("XGBoost")
|
| 491 |
+
except Exception: optional_families["XGBoost"] = False
|
|
|
|
|
|
|
|
|
|
| 492 |
try:
|
| 493 |
+
import lightgbm as lgb; optional_families["LightGBM"] = True; available_models.append("LightGBM")
|
| 494 |
+
except Exception: optional_families["LightGBM"] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
try:
|
| 496 |
+
import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
|
| 497 |
+
except Exception: optional_families["CatBoost"] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
st.markdown(f"Available model families: {', '.join(available_models)}")
|
| 500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
|
| 502 |
+
"""Tune one model family using Optuna."""
|
| 503 |
def obj(trial):
|
|
|
|
| 504 |
if family_name == "RandomForest":
|
| 505 |
n_estimators = trial.suggest_int("n_estimators", 100, 800)
|
| 506 |
max_depth = trial.suggest_int("max_depth", 4, 30)
|
|
|
|
| 513 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 514 |
max_depth = trial.suggest_int("max_depth", 3, 12)
|
| 515 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 516 |
+
m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0)
|
| 517 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 518 |
n_estimators = trial.suggest_int("n_estimators", 100, 1000)
|
| 519 |
max_depth = trial.suggest_int("max_depth", 3, 16)
|
| 520 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 521 |
+
m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1)
|
| 522 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 523 |
iterations = trial.suggest_int("iterations", 200, 1000)
|
| 524 |
depth = trial.suggest_int("depth", 4, 10)
|
| 525 |
lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
|
| 526 |
+
m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
else:
|
| 528 |
+
m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state)
|
|
|
|
|
|
|
|
|
|
| 529 |
try:
|
| 530 |
+
scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3)
|
| 531 |
return float(np.mean(scores))
|
| 532 |
except Exception:
|
| 533 |
return -999.0
|
|
|
|
| 535 |
study = optuna.create_study(direction="maximize")
|
| 536 |
study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
|
| 537 |
best = study.best_trial.params if study.trials else {}
|
|
|
|
| 538 |
try:
|
| 539 |
if family_name == "RandomForest":
|
| 540 |
+
model = RandomForestRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 541 |
elif family_name == "ExtraTrees":
|
| 542 |
+
model = ExtraTreesRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
|
| 543 |
elif family_name == "XGBoost" and optional_families.get("XGBoost"):
|
| 544 |
+
model = xgb.XGBRegressor(**{**{"verbosity":0,"tree_method":"hist"}, **best})
|
| 545 |
elif family_name == "LightGBM" and optional_families.get("LightGBM"):
|
| 546 |
+
model = lgb.LGBMRegressor(**{**{"n_jobs":1}, **best})
|
| 547 |
elif family_name == "CatBoost" and optional_families.get("CatBoost"):
|
| 548 |
+
model = cb.CatBoostRegressor(**{**{"verbose":0}, **best})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
else:
|
| 550 |
+
model = RandomForestRegressor(random_state=42)
|
| 551 |
except Exception:
|
| 552 |
+
model = RandomForestRegressor(random_state=42)
|
| 553 |
|
|
|
|
| 554 |
try:
|
| 555 |
+
score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3)))
|
| 556 |
except Exception:
|
| 557 |
score = -999.0
|
| 558 |
+
return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name}
|
| 559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
if st.button("Run expanded AutoML + Stacking"):
|
| 561 |
st.session_state["run_automl_clicked"] = True
|
| 562 |
+
|
| 563 |
if st.session_state["run_automl_clicked"]:
|
| 564 |
log("AutoML + Stacking initiated.")
|
| 565 |
+
with st.spinner("Tuning multiple families..."):
|
| 566 |
families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
|
| 567 |
if allow_advanced:
|
| 568 |
if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
|
| 569 |
if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
|
| 570 |
if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
|
|
|
|
|
|
|
| 571 |
|
| 572 |
tuned_results = []
|
| 573 |
for fam in families_to_try:
|
| 574 |
log(f"Tuning family: {fam}")
|
| 575 |
st.caption(f"Tuning family: {fam}")
|
| 576 |
+
tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
|
| 577 |
+
# --- Leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
|
| 579 |
lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
|
| 580 |
st.markdown("### Tuning Leaderboard (by CV R²)")
|
| 581 |
st.dataframe(lb[["family","cv_r2"]].round(4))
|
| 582 |
+
|
| 583 |
+
# --- Enhanced Ensemble Stacking ---
|
| 584 |
+
from sklearn.feature_selection import SelectKBest, f_regression
|
| 585 |
+
from sklearn.linear_model import LinearRegression
|
| 586 |
+
from sklearn.model_selection import KFold
|
| 587 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
st.markdown("### Building base models & out-of-fold predictions for stacking")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
|
| 590 |
+
scaler = StandardScaler()
|
| 591 |
+
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
|
| 592 |
+
selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
|
| 593 |
+
X_sel = selector.fit_transform(X_scaled, y)
|
| 594 |
+
selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
|
| 595 |
+
X_sel = pd.DataFrame(X_sel, columns=selected_feature_names)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
+
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
| 598 |
+
base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
|
| 599 |
|
| 600 |
+
for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj")]:
|
| 601 |
+
model_obj = entry["model_obj"]
|
| 602 |
+
oof = np.zeros(X_sel.shape[0])
|
| 603 |
+
for tr_idx, val_idx in kf.split(X_sel):
|
| 604 |
+
X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
|
| 605 |
+
y_tr = y.iloc[tr_idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
try:
|
| 607 |
+
model_obj.fit(X_tr, y_tr)
|
| 608 |
+
preds = model_obj.predict(X_val)
|
| 609 |
+
oof[val_idx] = preds
|
| 610 |
except Exception:
|
| 611 |
+
oof[val_idx] = np.mean(y_tr)
|
| 612 |
+
oof_preds[f"{fam}_oof"] = oof
|
| 613 |
+
model_obj.fit(X_sel, y)
|
| 614 |
+
base_models.append({"family": fam, "model": model_obj})
|
| 615 |
+
|
| 616 |
+
if oof_preds.empty:
|
| 617 |
+
st.error("No base models built.")
|
| 618 |
+
st.stop()
|
| 619 |
+
|
| 620 |
+
corr = oof_preds.corr().abs()
|
| 621 |
+
div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
|
| 622 |
+
cv_r2_est = {c: r2_score(y, oof_preds[c]) for c in oof_preds.columns}
|
| 623 |
+
|
| 624 |
+
summary_df = pd.DataFrame({
|
| 625 |
+
"family": [c.replace("_oof","") for c in oof_preds.columns],
|
| 626 |
+
"cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
|
| 627 |
+
"diversity": [div[c] for c in oof_preds.columns]
|
| 628 |
+
}).sort_values(["cv_r2","diversity"], ascending=[False,False])
|
| 629 |
+
|
| 630 |
+
st.dataframe(summary_df.round(4))
|
| 631 |
+
selected = summary_df.head(top_k)["family"].tolist()
|
| 632 |
+
st.markdown(f"Selected for stacking (top {top_k}): {selected}")
|
| 633 |
+
|
| 634 |
+
meta = LinearRegression(positive=True)
|
| 635 |
+
X_stack = oof_preds[[f"{s}_oof" for s in selected]].fillna(0)
|
| 636 |
+
meta.fit(X_stack, y)
|
| 637 |
+
|
| 638 |
+
X_tr, X_val, y_tr, y_val = train_test_split(X_sel, y, test_size=0.2, random_state=42)
|
| 639 |
+
meta_inputs = []
|
| 640 |
+
for fam in selected:
|
| 641 |
+
mdl = next((b["model"] for b in base_models if b["family"] == fam), None)
|
| 642 |
+
preds = mdl.predict(X_val) if mdl else np.full(len(X_val), np.mean(y_tr))
|
| 643 |
+
meta_inputs.append(np.ravel(preds))
|
| 644 |
+
X_meta_val = pd.DataFrame(np.column_stack(meta_inputs), columns=X_stack.columns)
|
| 645 |
+
y_meta_pred = meta.predict(X_meta_val)
|
| 646 |
+
|
| 647 |
+
final_r2 = r2_score(y_val, y_meta_pred)
|
| 648 |
+
final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
|
| 649 |
+
st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")
|
| 650 |
+
|
| 651 |
+
fig, ax = plt.subplots(figsize=(7,4))
|
| 652 |
+
ax.scatter(y_val, y_meta_pred, alpha=0.7)
|
| 653 |
+
ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
|
| 654 |
+
st.pyplot(fig, clear_figure=True)
|
| 655 |
+
|
| 656 |
+
st.session_state["automl_summary"] = {
|
| 657 |
+
"leaderboard": summary_df[["family","cv_r2"]].to_dict(orient="records"),
|
| 658 |
+
"final_r2": float(final_r2),
|
| 659 |
+
"final_rmse": float(final_rmse),
|
| 660 |
+
"target": target,
|
| 661 |
+
"use_case": use_case
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
# --- Operator Advisory System + Llama-3-70B-Instruct ---
st.markdown("---")
st.subheader("Operator Advisory System — Real-Time Shift Recommendations")

try:
    # Pick the top-ranked base model (first family in `selected`) for SHAP analysis.
    top_base = next((b for b in base_models if b["family"] == selected[0]), None)
    if top_base and hasattr(top_base["model"], "predict"):
        # Cap the explainer sample at 300 rows to keep SHAP fast in the UI.
        sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
        model = top_base["model"]
        expl = shap.TreeExplainer(model)
        shap_vals = expl.shap_values(sample_X)
        # Multi-output tree models return a list of arrays; use the first output.
        if isinstance(shap_vals, list):
            shap_vals = shap_vals[0]
        shap_vals = np.array(shap_vals)
        # Magnitude ranks the drivers; mean sign gives the dominant direction.
        mean_abs = np.abs(shap_vals).mean(axis=0)
        mean_sign = np.sign(shap_vals).mean(axis=0)
        importance = pd.DataFrame({
            "Feature": sample_X.columns,
            "Mean |SHAP|": mean_abs,
            "Mean SHAP Sign": mean_sign
        }).sort_values("Mean |SHAP|", ascending=False)

        st.markdown("### Top 5 Operational Drivers")
        st.dataframe(importance.head(5))

        # Translate each driver's SHAP direction into plain-language shift advice.
        # The ±0.05 dead band treats weak/mixed signs as "neutral".
        recommendations = []
        for _, row in importance.head(5).iterrows():
            feat, sign = row["Feature"], row["Mean SHAP Sign"]
            if sign > 0.05:
                recommendations.append(f"Increase `{feat}` likely increases `{target}`")
            elif sign < -0.05:
                recommendations.append(f"Decrease `{feat}` likely increases `{target}`")
            else:
                recommendations.append(f"`{feat}` neutral for `{target}`")
        st.markdown("### Suggested Operator Adjustments")
        st.write("\n".join(recommendations))

        # --- Call HF Llama-3-70B-Instruct API for summary ---
        import requests
        HF_TOKEN = st.secrets.get("HF_TOKEN", os.getenv("HF_TOKEN"))
        if not HF_TOKEN:
            st.error("HF_TOKEN not found in secrets or environment.")
        else:
            API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-70B-Instruct"
            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
            prompt = f"""
You are an expert metallurgical process advisor.
Based on these recommendations:
{recommendations}
Target: {target}
Use case: {use_case}
Summarize in three concise, professional lines what the operator should do this shift.
"""
            payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
            with st.spinner("Generating operator note (Llama-3-70B)…"):
                resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
                # FIX: the original did resp.json()[0].get(...) unconditionally.
                # The HF inference API returns a dict (e.g. {"error": ...}) on
                # failure, which made that indexing raise TypeError/KeyError and
                # abort the whole advisory. Check the HTTP status and the
                # response shape before extracting the generated text.
                resp.raise_for_status()
                body = resp.json()
                if isinstance(body, list) and body:
                    text = body[0].get("generated_text", "").strip()
                else:
                    text = str(body)
                st.info(text)
    else:
        st.info("No suitable base model found.")
except Exception as e:
    # Advisory is best-effort: never let an explainability/API failure crash the app.
    st.warning(f"Operator advisory skipped: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
|
| 721 |
+
# ----- Business Impact tab
with tabs[5]:
    st.subheader("Business Impact Metrics")
    # Static reference table: which target variable each use case optimizes
    # and the estimated annual business leverage for that lever.
    impact_rows = [
        ["EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable", "₹20–60 L/year"],
        ["Casting Optimization", "surface_temp / cooling_water_temp", "Controls billet quality", "₹50 L/year"],
        ["Rolling Mill", "energy_efficiency", "Energy optimization", "₹5–10 L/year"],
        ["Refractory Loss Prediction", "lining_thickness / heat_loss_rate", "Wear and downtime", "₹40 L/year"],
    ]
    impact_cols = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    target_table = pd.DataFrame(impact_rows, columns=impact_cols)
    st.dataframe(target_table, width="stretch")
|
| 731 |
+
|
| 732 |
+
# ----- Bibliography tab
with tabs[6]:
    st.subheader("Annotated Bibliography")
    # Each entry is (title, authors, one-line relevance note, URL).
    refs = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)",
         "Soft sensors validate `furnace_temp` and `tap_temp`.",
         "https://doi.org/10.1021/acsomega.4c01254"),
        ("Optimisation of Operator Support Systems", "Ojeda Roldán et al. (2022)",
         "Reinforcement learning for endpoint control.",
         "https://doi.org/10.3390/jmmp6020034"),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking", "Zhuo et al. (2024)",
         "Links arc power and energy KPIs.",
         "https://doi.org/10.3390/met15010113"),
        ("Dynamic EAF Modeling and Slag Foaming Index Prediction", "MacRosty et al.",
         "Supports refractory wear modeling.",
         "https://www.sciencedirect.com/science/article/pii/S0921883123004019"),
    ]
    # Render each reference as a linked title, authors in italics, note beneath.
    for title, authors, note, url in refs:
        st.markdown(f"**[{title}]({url})** — *{authors}* \n_{note}_")
|
| 743 |
+
|
| 744 |
+
# ----- Download tab
with tabs[7]:
    st.subheader("Download Saved Files")
    # Offer every regular file in the log/artifact directory for download.
    files = [f for f in os.listdir(LOG_DIR) if os.path.isfile(os.path.join(LOG_DIR, f))]
    if not files:
        st.info("No files yet — run AutoML first.")
    else:
        for f in sorted(files):
            path = os.path.join(LOG_DIR, f)
            # FIX: the original handed Streamlit the open file object and then
            # left the `with` block, closing the handle the widget still held.
            # Read the bytes up front instead, and give each button an explicit
            # unique key so repeated renders can never collide.
            with open(path, "rb") as fp:
                data = fp.read()
            st.download_button(f"Download {f}", data, file_name=f, key=f"dl_{f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
|
| 755 |
+
# ----- Logs tab
with tabs[8]:
    st.subheader("Master Log")
    if os.path.exists(LOG_PATH):
        # FIX: the original used open(LOG_PATH).read(), which leaks the file
        # handle (closed only by GC) and relies on the platform default
        # encoding. Use a context manager and explicit UTF-8; `errors="replace"`
        # keeps the viewer working even if a log line is mis-encoded.
        with open(LOG_PATH, "r", encoding="utf-8", errors="replace") as log_file:
            txt = log_file.read()
        st.text_area("Log Output", txt, height=400)
        st.download_button("Download Log", txt, file_name="run_master.log")
    else:
        st.info("No logs yet — run AutoML once.")

st.markdown("---")
st.markdown("**Note:** Synthetic demo dataset for educational use only. Real deployment requires plant data, NDA, and safety validation.")
|