| |
| import os |
| import json |
| import time |
| from datetime import datetime |
| import numpy as np |
| import pandas as pd |
| import streamlit as st |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| import joblib |
| import zipfile |
| import io |
| import gc |
|
|
| |
| from sklearn.model_selection import train_test_split |
| from sklearn.linear_model import LinearRegression, Ridge |
| from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor |
| from sklearn.preprocessing import StandardScaler, PolynomialFeatures |
| from sklearn.decomposition import PCA |
| from sklearn.cluster import KMeans |
| from sklearn.metrics import mean_squared_error, r2_score |
|
|
| |
| import shap |
|
|
| |
| import optuna |
| from sklearn.model_selection import cross_val_score, KFold |
| from sklearn.neural_network import MLPRegressor |
|
|
| |
# Seed Streamlit session state. ``setdefault`` only writes a key when it is
# missing, so values stored by earlier reruns of the script are preserved.
# The per-key ``if key not in st.session_state`` checks that used to follow
# this loop repeated the same initialisation and could never fire, so they
# have been removed.
defaults = {
    "llm_result": None,
    "automl_summary": {},
    "shap_recommendations": [],
    "hf_clicked": False,
    "hf_ran_once": False,
    "run_automl_clicked": False,
}
for k, v in defaults.items():
    st.session_state.setdefault(k, v)
|
|
| |
| |
| |
# Page layout and plotting theme.
st.set_page_config(
    layout="wide",
    page_title="Steel Authority of India Limited (MODEX)",
)
plt.style.use("seaborn-v0_8-muted")
sns.set_palette("muted")
sns.set_style("whitegrid")

# Every artefact produced by the app lives under LOG_DIR.
LOG_DIR = "./logs"
os.makedirs(LOG_DIR, exist_ok=True)

CSV_PATH, META_PATH, ENSEMBLE_PATH, LOG_PATH = (
    os.path.join(LOG_DIR, artefact_name)
    for artefact_name in (
        "flatfile_universe_advanced.csv",
        "feature_metadata_advanced.json",
        "ensemble_models.joblib",
        "run_master.log",
    )
)

# Flipped to True by ``log`` once the session banner has been written.
SESSION_STARTED = False
|
|
def log(msg: str):
    """Append a timestamped line to the master log file and echo it to stdout.

    The first call after ``SESSION_STARTED`` is (re)set to False also writes a
    "New Session Started" banner ahead of the message.

    Args:
        msg: Text to record.
    """
    global SESSION_STARTED
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    needs_banner = not SESSION_STARTED
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        if needs_banner:
            log_file.write(f"\n\n===== New Session Started at {stamp} =====\n")
            # Only mark the session as started once the banner is on disk.
            SESSION_STARTED = True
        log_file.write(f"[{stamp}] {msg}\n")
    print(msg)
|
|
log("=== Streamlit session started ===")

# Tell the user whether a persistent "/data" volume is present.
# NOTE(review): LOG_DIR is "./logs" either way, so the "persistent" message
# does not by itself mean the logs are stored on that volume — confirm.
if not os.path.exists("/data"):
    st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
else:
    st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
|
|
| |
| |
| |
def generate_advanced_flatfile(
    n_rows=3000,
    random_seed=42,
    max_polynomial_new=60,
    global_variance_multiplier=1.0,
    variance_overrides=None,
):
    """Generate a synthetic dataset with engineered features and persist it.

    Raw columns are sampled from per-feature distributions, then ratio,
    rolling, interaction, polynomial, PCA, cluster and surrogate-model columns
    are derived from them. The rows are appended to ``CSV_PATH`` (created with
    a header when absent) and a summary entry is appended to the JSON list in
    ``META_PATH``.

    Args:
        n_rows: Number of rows to generate.
        random_seed: Seed passed to ``np.random.seed`` (global NumPy state).
        max_polynomial_new: Maximum number of degree-2 polynomial columns kept.
        global_variance_multiplier: Factor applied to every base standard
            deviation that has no override.
        variance_overrides: Optional mapping ``name -> standard deviation``.
            An exact key match wins; otherwise the first key that is a
            substring of the feature's variance key is used.

    Returns:
        Tuple ``(CSV_PATH, META_PATH, None)``; the third slot is a PDF path
        placeholder that is always ``None``.
    """
    np.random.seed(random_seed)
    os.makedirs(LOG_DIR, exist_ok=True)
    if variance_overrides is None:
        variance_overrides = {}

    natural_feats = [
        "vibration_x", "vibration_y", "motor_current", "rpm", "bearing_temp", "ambient_temp", "lube_pressure", "power_factor",
        "furnace_temp", "tap_temp", "slag_temp", "offgas_co", "offgas_co2", "o2_probe_pct", "c_feed_rate", "arc_power", "furnace_pressure", "feed_time",
        "mold_temp", "casting_speed", "nozzle_pressure", "cooling_water_temp", "billet_length", "chemical_C", "chemical_Mn", "chemical_Si", "chemical_S",
        "roll_speed", "motor_load", "coolant_flow", "exit_temp", "strip_thickness", "line_tension", "roller_vibration",
        "lighting_intensity", "surface_temp", "image_entropy_proxy",
        "spectro_Fe", "spectro_C", "spectro_Mn", "spectro_Si", "time_since_last_sample",
        "batch_id_numeric", "weight_input", "weight_output", "time_in_queue", "conveyor_speed",
        "shell_temp", "lining_thickness", "water_flow", "cooling_out_temp", "heat_flux",
    ]
    # De-duplicate while keeping the declared order.
    natural_feats = list(dict.fromkeys(natural_feats))

    def effective_sd(feature_name, base_sd):
        """Return the standard deviation to use for ``feature_name``."""
        if feature_name in variance_overrides:
            return float(variance_overrides[feature_name])
        for key, val in variance_overrides.items():
            if key in feature_name:
                return float(val)
        return float(base_sd) * float(global_variance_multiplier)

    def sample_col(name, n):
        """Draw ``n`` samples for the raw column ``name``.

        Specific temperature columns are tested BEFORE the generic ``*_temp``
        fallback. Previously the fallback came first and swallowed every
        temperature column, so the mold/shell/exit/cooling-out and
        bearing-temperature branches could never run.
        """
        name_l = name.lower()
        if "furnace_temp" in name_l or "tap_temp" in name_l:
            sd = effective_sd("furnace_temp", 50)
            return np.random.normal(1550, sd, n)
        if name_l in ("mold_temp", "shell_temp", "cooling_out_temp", "exit_temp"):
            sd = effective_sd(name_l, 30)
            return np.random.normal(1500 if "mold" in name_l else 200, sd, n)
        if "bearing_temp" in name_l:
            sd = effective_sd("bearing_temp", 5)
            return np.random.normal(65, sd, n)
        if name_l.endswith("_temp"):
            # Generic fallback for the remaining temperature columns; keeps
            # the distribution they had before the reordering.
            sd = effective_sd("furnace_temp", 50)
            return np.random.normal(1550, sd, n)
        if "offgas_co2" in name_l:
            sd = effective_sd("offgas_co2", 4)
            return np.abs(np.random.normal(15, sd, n))
        if "offgas_co" in name_l:
            sd = effective_sd("offgas_co", 5)
            return np.abs(np.random.normal(20, sd, n))
        if "o2" in name_l:
            sd = effective_sd("o2_probe_pct", 1)
            return np.clip(np.random.normal(5, sd, n), 0.01, 60)
        if "arc_power" in name_l or "motor_load" in name_l:
            sd = effective_sd("arc_power", 120)
            return np.abs(np.random.normal(600, sd, n))
        if "rpm" in name_l:
            sd = effective_sd("rpm", 30)
            return np.abs(np.random.normal(120, sd, n))
        if "vibration" in name_l:
            sd = effective_sd("vibration", 0.15)
            return np.abs(np.random.normal(0.4, sd, n))
        if "chemical" in name_l or "spectro" in name_l:
            sd = effective_sd("chemical", 0.15)
            return np.random.normal(0.7, sd, n)
        if "weight" in name_l:
            sd = effective_sd("weight", 100)
            return np.random.normal(1000, sd, n)
        if "conveyor_speed" in name_l or "casting_speed" in name_l:
            sd = effective_sd("casting_speed", 0.6)
            return np.random.normal(2.5, sd, n)
        if "power_factor" in name_l:
            sd = effective_sd("power_factor", 0.03)
            return np.clip(np.random.normal(0.92, sd, n), 0.6, 1.0)
        if "image_entropy_proxy" in name_l:
            sd = effective_sd("image_entropy_proxy", 0.25)
            return np.abs(np.random.normal(0.5, sd, n))
        if "batch_id" in name_l:
            return np.random.randint(1000, 9999, n)
        if "time_since" in name_l or "time_in_queue" in name_l:
            sd = effective_sd("time_since", 20)
            return np.abs(np.random.normal(30, sd, n))
        if "heat_flux" in name_l:
            sd = effective_sd("heat_flux", 300)
            return np.abs(np.random.normal(1000, sd, n))
        return np.random.normal(0, effective_sd(name_l, 1), n)

    # Raw columns.
    df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})

    # Time axis and plant metadata.
    start = pd.Timestamp("2025-01-01T00:00:00")
    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")
    df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
    df["meta_plant_name"] = np.random.choice(["Rourkela", "Bhilai", "Durgapur", "Bokaro", "Burnpur", "Salem"], n_rows)
    df["meta_country"] = "India"

    # Ratio features; the additive constants keep the denominators non-zero.
    df["carbon_proxy"] = df["offgas_co"] / (df["offgas_co2"] + 1.0)
    df["oxygen_utilization"] = df["offgas_co2"] / (df["offgas_co"] + 1.0)
    df["power_density"] = df["arc_power"] / (df["weight_input"] + 1.0)
    df["energy_efficiency"] = df["furnace_temp"] / (df["arc_power"] + 1.0)
    df["slag_foaming_index"] = (df["slag_temp"] * df["offgas_co"]) / (df["o2_probe_pct"] + 1.0)
    df["yield_ratio"] = df["weight_output"] / (df["weight_input"] + 1e-9)

    # Rolling statistics, lag and rate of change.
    rolling_cols = ["arc_power", "furnace_temp", "offgas_co", "offgas_co2", "motor_current", "vibration_x", "weight_input"]
    for rc in rolling_cols:
        if rc in df.columns:
            df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
            df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
            df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
            df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)

    # Interaction features.
    df["arc_o2_interaction"] = df["arc_power"] * df["o2_probe_pct"]
    df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
    df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)

    # Degree-2 polynomial expansion of the first 12 numeric columns; the
    # degree-1 terms are dropped because they duplicate the source columns.
    numeric = df.select_dtypes(include=[np.number]).fillna(0)
    poly_source_cols = numeric.columns[:12].tolist()
    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    poly_mat = poly.fit_transform(numeric[poly_source_cols])
    poly_names = poly.get_feature_names_out(poly_source_cols)
    poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
    keep_poly = [c for c in poly_df.columns if c.replace("poly__", "") not in poly_source_cols]
    poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new]
    df = pd.concat([df, poly_df], axis=1)

    # PCA components and k-means operating modes, both computed on the
    # standardised numeric columns as they were before the polynomial step.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numeric)
    pca = PCA(n_components=6, random_state=42)
    pca_cols = pca.fit_transform(scaled)
    for i in range(pca_cols.shape[1]):
        df[f"pca_{i+1}"] = pca_cols[:, i]

    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    df["operating_mode"] = kmeans.fit_predict(scaled)

    # Surrogate model 1: predict the next row's furnace temperature.
    next_temp = df["furnace_temp"].shift(-1).ffill()
    features_for_surrogate = [c for c in ["furnace_temp", "arc_power", "o2_probe_pct", "offgas_co", "offgas_co2"] if c in df.columns]
    if len(features_for_surrogate) >= 2:
        X = df[features_for_surrogate].fillna(0)
        rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
        rf.fit(X, next_temp)
        df["pred_temp_30s"] = rf.predict(X)
    else:
        df["pred_temp_30s"] = df["furnace_temp"]

    # Surrogate model 2: reproduce carbon_proxy from the off-gas readings.
    if all(c in df.columns for c in ["offgas_co", "offgas_co2", "o2_probe_pct"]):
        X2 = df[["offgas_co", "offgas_co2", "o2_probe_pct"]].fillna(0)
        rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
        rf2.fit(X2, df["carbon_proxy"])
        df["pred_carbon_5min"] = rf2.predict(X2)
    else:
        df["pred_carbon_5min"] = df["carbon_proxy"]

    # Constraint-style flags.
    df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
    df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)

    df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
    df["prediction_confidence"] = np.clip(np.random.beta(2, 5, n_rows), 0.05, 0.99)

    # Remove infinities and remaining gaps.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.bfill(inplace=True)
    df.fillna(0, inplace=True)

    # Persist the rows.
    # NOTE(review): appending without a header assumes the existing file has
    # the same columns in the same order; a different ``max_polynomial_new``
    # on a later run would misalign the appended rows.
    df["run_timestamp"] = datetime.now().strftime("%Y%m%d_%H%M%S")
    if os.path.exists(CSV_PATH):
        df.to_csv(CSV_PATH, mode="a", index=False, header=False)
    else:
        df.to_csv(CSV_PATH, index=False)

    # Append a summary entry to the metadata list; ``with`` guarantees the
    # handles are closed and the JSON is flushed to disk.
    meta_entry = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "features": len(df.columns),
        "rows_added": len(df),
        "note": "auto-generated block appended",
    }
    if os.path.exists(META_PATH):
        with open(META_PATH) as meta_in:
            existing = json.load(meta_in)
        existing.append(meta_entry)
    else:
        existing = [meta_entry]
    with open(META_PATH, "w") as meta_out:
        json.dump(existing, meta_out, indent=2)

    PDF_PATH = None
    return CSV_PATH, META_PATH, PDF_PATH
|
|
| |
| |
| |
# First run: build the synthetic dataset when either artefact is missing.
if not (os.path.exists(CSV_PATH) and os.path.exists(META_PATH)):
    with st.spinner("Generating synthetic features (this may take ~20-60s)..."):
        CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
            n_rows=3000,
            random_seed=42,
            max_polynomial_new=80,
        )
    st.success(f"Generated dataset and metadata: {CSV_PATH}")
|
|
| |
| |
| |
@st.cache_data
def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
    """Read the dataset CSV and the JSON metadata file.

    Args:
        csv_path: Location of the CSV file.
        meta_path: Location of the JSON metadata file.

    Returns:
        Tuple ``(data_frame, metadata_frame)``; the second item is the parsed
        JSON wrapped in a ``pd.DataFrame``.
    """
    dataset = pd.read_csv(csv_path)
    with open(meta_path, "r") as meta_file:
        metadata = pd.DataFrame(json.load(meta_file))
    return dataset, metadata
|
|
# Load the (cached) dataset and its metadata, then label the sidebar.
_loaded_frames = load_data()
df, meta_df = _loaded_frames[0], _loaded_frames[1]

st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
|
def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataFrame:
    """Return feature-level metadata with one row per column of ``df``.

    If ``meta_df`` is ``None`` or has fewer rows than ``df`` has columns, a
    fresh frame is inferred from the column names and a sidebar warning is
    shown. Otherwise a copy of ``meta_df`` is returned with any missing
    required column added, truncated to ``len(df.columns)`` rows, and with
    ``feature_name`` filled from ``df.columns`` when it is entirely empty.

    The caller's ``meta_df`` is never modified.

    Args:
        df: Dataset whose columns are the features.
        meta_df: Existing metadata, or ``None``.

    Returns:
        Frame containing at least ``feature_name``, ``source_type``,
        ``formula`` and ``remarks``.
    """
    required_cols = ["feature_name", "source_type", "formula", "remarks"]
    n_features = len(df.columns)

    if meta_df is None or len(meta_df) < n_features:
        rebuilt = pd.DataFrame({
            "feature_name": df.columns,
            "source_type": [
                "engineered" if any(x in c for x in ["poly", "pca", "roll", "lag"]) else "measured"
                for c in df.columns
            ],
            "formula": ["" for _ in df.columns],
            "remarks": ["auto-inferred synthetic feature metadata" for _ in df.columns],
        })
        st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
        return rebuilt

    # Work on a copy so the (possibly cached) input frame stays untouched.
    meta_df = meta_df.copy()
    for col in required_cols:
        if col not in meta_df.columns:
            meta_df[col] = None

    # Truncate BEFORE filling feature_name: assigning len(df.columns) names to
    # a longer frame raises a length-mismatch ValueError.
    if len(meta_df) > n_features:
        meta_df = meta_df.iloc[:n_features].copy()
    if meta_df["feature_name"].isna().all():
        meta_df["feature_name"] = list(df.columns)

    return meta_df
|
|
meta_df = ensure_feature_metadata(df, meta_df)

# Sidebar filter on the metadata "source_type" column.
feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)

_has_source_types = (
    "source_type" in meta_df.columns
    and not meta_df["source_type"].dropna().empty
)
if _has_source_types:
    _type_mask = meta_df["source_type"].isin(selected_types)
    filtered_meta = meta_df[_type_mask]
else:
    filtered_meta = meta_df.copy()

# Names of all numeric columns, used by the plots and the AutoML tab.
_numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = _numeric_frame.columns.tolist()
|
|
| |
| |
| |
# Tab order matters: later sections address the tabs by index.
_TAB_LABELS = [
    "Features",
    "Visualization",
    "Correlations",
    "Statistics",
    "AutoML + SHAP",
    "Business Impact",
    "Bibliography",
    "Download Saved Files",
    "View Logs",
]
tabs = st.tabs(_TAB_LABELS)

# Tab 0: feature metadata table.
with tabs[0]:
    st.subheader("Feature metadata")
    _meta_view = filtered_meta[["feature_name", "source_type", "formula", "remarks"]]
    _meta_view = _meta_view.rename(columns={"feature_name": "Feature"})
    st.dataframe(_meta_view, height=400)
    _row_count, _col_count = df.shape
    st.markdown(f"Total features loaded: **{_col_count}** | Rows: **{_row_count}**")
|
|
| |
# Tab 1: histogram of one numeric feature plus a PCA scatter plot.
with tabs[1]:
    st.subheader("Feature Visualization")
    chosen_feature = st.selectbox("Choose numeric feature", numeric_cols, index=0)
    n_bins = st.slider("Histogram bins", 10, 200, 50)

    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(
        df[chosen_feature],
        bins=n_bins,
        kde=True,
        ax=hist_ax,
        color="#2C6E91",
        alpha=0.8,
    )
    hist_ax.set_title(f"Distribution of {chosen_feature}", fontsize=12)
    st.pyplot(hist_fig, clear_figure=True)
    st.write(df[chosen_feature].describe().to_frame().T)

    # The scatter plot needs the PCA and cluster columns to be present.
    if {"pca_1", "pca_2", "operating_mode"}.issubset(df.columns):
        st.markdown("### PCA Feature Space — Colored by Operating Mode")
        pca_fig, pca_ax = plt.subplots(figsize=(6, 5))
        plot_sample = df.sample(min(1000, len(df)), random_state=42)
        sns.scatterplot(
            data=plot_sample,
            x="pca_1",
            y="pca_2",
            hue="operating_mode",
            palette="tab10",
            alpha=0.7,
            s=40,
            ax=pca_ax,
        )
        pca_ax.set_title("Operating Mode Clusters (PCA Projection)")
        st.pyplot(pca_fig, clear_figure=True)
|
|
| |
# Tab 2: correlation heatmap for a user-selected set of numeric features.
with tabs[2]:
    st.subheader("Correlation explorer")
    default_corr = numeric_cols[:20]
    corr_sel = st.multiselect("Select features (min 2)", numeric_cols, default=default_corr)
    if len(corr_sel) < 2:
        st.info("Choose at least 2 numeric features to compute correlation.")
    else:
        corr_matrix = df[corr_sel].corr()
        heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            corr_matrix,
            cmap="RdBu_r",
            center=0,
            annot=True,
            fmt=".2f",
            linewidths=0.5,
            cbar_kws={"shrink": 0.7},
            ax=heat_ax,
        )
        st.pyplot(heat_fig, clear_figure=True)

# Tab 3: describe() table.
with tabs[3]:
    st.subheader("Summary statistics (numeric features)")
    _stats_table = df.describe().T
    st.dataframe(_stats_table.style.format("{:.3f}"), height=500)
|
|
| |
# Tab 4 (part 1): use-case selection, feature/target preparation and the
# AutoML settings widgets.
with tabs[4]:
    st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")

    # Use case -> target column and a model-family hint shown to the user.
    use_case_config = {
        "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
        "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
        "Casting Quality Optimization": {"target": "surface_temp", "model_hint": "GradientBoosting"},
        "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
        "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
        "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
        "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
        "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
    }
    # The options come straight from the config keys so the two cannot drift.
    use_case = st.selectbox("Select Use Case", list(use_case_config), index=1)
    cfg = use_case_config[use_case]
    target = cfg["target"]
    model_hint = cfg["model_hint"]

    if target not in df.columns:
        st.error(f"Target `{target}` not found in dataframe columns.")
        st.stop()

    # The target must never be offered as one of its own inputs. Besides being
    # direct leakage, it used to put a duplicate column into the sampled frame,
    # so selecting the target returned two columns and the run aborted with
    # "Multi-output target detected".
    candidate_cols = [c for c in numeric_cols if c != target]

    suggested = [c for c in candidate_cols if any(k in c for k in target.split("_"))]
    if len(suggested) < 6:
        suggested = [c for c in candidate_cols if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
    if len(suggested) < 6:
        suggested = candidate_cols[:50]

    features = st.multiselect("Model input features (auto-suggested)", candidate_cols, default=suggested)
    st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")

    max_rows = min(df.shape[0], 20000)
    if max_rows > 500:
        sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
    else:
        # Too few rows for a 500..max slider; use everything available.
        sample_size = max_rows

    cols_needed = [c for c in features if c in df.columns and c != target]
    sub_df = df[cols_needed + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)

    X = sub_df[cols_needed].copy()
    y = sub_df[target].copy()

    # Drop columns that encode the answer or are pure identifiers.
    leak_cols = ["furnace_temp_next", "pred_temp_30s", "run_timestamp", "timestamp", "batch_id_numeric", "batch_id"]
    X.drop(columns=[lc for lc in leak_cols if lc in X.columns], inplace=True)

    # Drop columns that are constant within the sample.
    nunique = X.nunique(dropna=False)
    const_cols = nunique[nunique <= 1].index.tolist()
    if const_cols:
        X.drop(columns=const_cols, inplace=True)

    if X.shape[1] == 0:
        st.error("No valid feature columns remain after cleaning. Check feature selection.")
        st.stop()

    st.markdown("### Ensemble & AutoML Settings")
    max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
    top_k = st.slider("Max base models in ensemble", 2, 8, 5)
    allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost)", value=True)

    # Optional boosting libraries. The imports bind the module-level names
    # ``xgb`` / ``lgb`` / ``cb`` that ``tune_family`` uses. ``Exception`` is
    # caught on purpose: a broken native install can raise more than
    # ImportError, and the family should simply be skipped in that case.
    available_models = ["RandomForest", "ExtraTrees"]
    optional_families = {}
    if allow_advanced:
        try:
            import xgboost as xgb
            optional_families["XGBoost"] = True
            available_models.append("XGBoost")
        except Exception:
            optional_families["XGBoost"] = False
        try:
            import lightgbm as lgb
            optional_families["LightGBM"] = True
            available_models.append("LightGBM")
        except Exception:
            optional_families["LightGBM"] = False
        try:
            import catboost as cb
            optional_families["CatBoost"] = True
            available_models.append("CatBoost")
        except Exception:
            optional_families["CatBoost"] = False

    st.markdown(f"Available model families: {', '.join(available_models)}")
|
|
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
    """Tune one model family with Optuna and return the best configuration.

    Each trial is scored by the mean 3-fold cross-validated R². A trial whose
    cross-validation raises is scored -999.0 so the study can continue.

    Supported families: "RandomForest", "ExtraTrees", "MLP", and — when the
    module-level ``optional_families`` marks them as importable — "XGBoost",
    "LightGBM" and "CatBoost" (via the module-level ``xgb`` / ``lgb`` / ``cb``
    imports). Any other name falls back to a fixed RandomForest.

    Args:
        family_name: Name of the model family.
        X_local: Feature matrix.
        y_local: Target values.
        n_trials: Number of Optuna trials.
        random_state: Seed for the Optuna sampler and for the models.

    Returns:
        Dict with keys ``model_obj`` (unfitted estimator built from the best
        parameters), ``cv_score`` (best mean CV R²), ``best_params`` and
        ``family``.
    """

    def suggest(trial):
        """Sample hyper-parameters for ``family_name`` from ``trial``."""
        if family_name in ("RandomForest", "ExtraTrees"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 4, 30),
            }
        if family_name == "MLP":
            return {
                "hidden_units": trial.suggest_int("hidden_units", 16, 256),
                "alpha": trial.suggest_float("alpha", 1e-6, 1e-2, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-2, log=True),
            }
        if family_name == "XGBoost" and optional_families.get("XGBoost"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if family_name == "LightGBM" and optional_families.get("LightGBM"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 16),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if family_name == "CatBoost" and optional_families.get("CatBoost"):
            return {
                "iterations": trial.suggest_int("iterations", 200, 1000),
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        return {}

    def build(params):
        """Create an unfitted estimator for ``family_name`` from ``params``.

        Used for both the trials and the final model so the reported score
        always belongs to the returned configuration.
        """
        if family_name == "RandomForest":
            return RandomForestRegressor(**{"random_state": random_state, "n_jobs": -1, **params})
        if family_name == "ExtraTrees":
            return ExtraTreesRegressor(**{"random_state": random_state, "n_jobs": -1, **params})
        if family_name == "MLP":
            # Local import: the file does not import sklearn.pipeline at the top.
            from sklearn.pipeline import make_pipeline
            # The network is scaled inside the pipeline because the caller may
            # pass unscaled features.
            return make_pipeline(
                StandardScaler(),
                MLPRegressor(
                    hidden_layer_sizes=(params.get("hidden_units", 64),),
                    alpha=params.get("alpha", 1e-4),
                    learning_rate_init=params.get("learning_rate_init", 1e-3),
                    max_iter=500,
                    early_stopping=True,
                    random_state=random_state,
                ),
            )
        if family_name == "XGBoost" and optional_families.get("XGBoost"):
            return xgb.XGBRegressor(**{"verbosity": 0, "tree_method": "hist", "random_state": random_state, **params})
        if family_name == "LightGBM" and optional_families.get("LightGBM"):
            return lgb.LGBMRegressor(**{"n_jobs": 1, "random_state": random_state, **params})
        if family_name == "CatBoost" and optional_families.get("CatBoost"):
            return cb.CatBoostRegressor(**{"verbose": 0, "random_seed": random_state, **params})
        # Unknown or unavailable family: fixed fallback model.
        return RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state)

    def obj(trial):
        """Optuna objective: mean 3-fold CV R² of the sampled configuration."""
        try:
            model = build(suggest(trial))
            scores = cross_val_score(model, X_local, y_local, scoring="r2", cv=3)
            return float(np.mean(scores))
        except Exception:
            # Deliberate best-effort: one bad configuration must not abort
            # the whole study.
            return -999.0

    # Seeding the sampler makes the search repeatable for a given random_state.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(obj, n_trials=n_trials, show_progress_bar=False)

    if study.trials:
        best = study.best_trial.params
        score = float(study.best_value)
    else:
        best, score = {}, -999.0

    try:
        model = build(best)
    except Exception:
        model = RandomForestRegressor(random_state=random_state)

    return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name}
|
|
# Tab 4 (part 2): run tuning, build the stacked ensemble, evaluate it on a
# hold-out split and produce the operator advisory. Streamlit containers can
# be entered more than once, so re-entering ``tabs[4]`` keeps this output in
# the "AutoML + SHAP" tab.
with tabs[4]:
    if st.button("Run expanded AutoML + Stacking"):
        st.session_state["run_automl_clicked"] = True

    # NOTE: the flag is never reset, so while it is True the pipeline below
    # runs again on every rerun of the script.
    if st.session_state["run_automl_clicked"]:
        from sklearn.feature_selection import SelectKBest, f_regression

        log("AutoML + Stacking initiated.")

        # Split FIRST. Previously the scaler, feature selector, base models and
        # meta-learner were all fitted on the full sample and then "validated"
        # on a subset of those same rows, which inflated the reported R².
        X_fit, X_hold, y_fit, y_hold = train_test_split(X, y, test_size=0.2, random_state=42)
        y_tr = y_fit.reset_index(drop=True)
        y_val = y_hold.reset_index(drop=True)

        families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
        if allow_advanced:
            families_to_try += [
                name for name in ("XGBoost", "LightGBM", "CatBoost")
                if optional_families.get(name)
            ]

        with st.spinner("Tuning multiple families..."):
            tuned_results = []
            for fam in families_to_try:
                log(f"Tuning family: {fam}")
                st.caption(f"Tuning family: {fam}")
                tuned_results.append(tune_family(fam, X_fit.reset_index(drop=True), y_tr, n_trials=max_trials))

        lb = pd.DataFrame([
            {"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]}
            for r in tuned_results
        ])
        lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
        st.markdown("### Tuning Leaderboard (by CV R²)")
        st.dataframe(lb[["family", "cv_r2"]].round(4))

        st.markdown("### Building base models & out-of-fold predictions for stacking")

        # Scaling and feature selection are fitted on the training part only
        # and merely applied to the hold-out part.
        scaler = StandardScaler()
        X_fit_scaled = pd.DataFrame(scaler.fit_transform(X_fit), columns=X.columns)
        X_hold_scaled = pd.DataFrame(scaler.transform(X_hold), columns=X.columns)
        selector = SelectKBest(f_regression, k=min(40, X_fit_scaled.shape[1]))
        selector.fit(X_fit_scaled, y_tr)
        selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
        X_sel = X_fit_scaled[selected_feature_names]
        X_val = X_hold_scaled[selected_feature_names]

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)

        for entry in tuned_results:
            fam, model_obj = entry["family"], entry.get("model_obj")
            # Compare with None explicitly: truth-testing an unfitted ensemble
            # calls its __len__, which reads ``estimators_`` and raises.
            if model_obj is None:
                continue
            oof = np.zeros(X_sel.shape[0])
            for tr_idx, val_idx in kf.split(X_sel):
                fold_y = y_tr.iloc[tr_idx]
                try:
                    model_obj.fit(X_sel.iloc[tr_idx], fold_y)
                    oof[val_idx] = model_obj.predict(X_sel.iloc[val_idx])
                except Exception as fold_err:
                    # Best effort: fall back to the fold mean, but leave a trace.
                    log(f"OOF fold failed for {fam}: {fold_err}")
                    oof[val_idx] = np.mean(fold_y)
            oof_preds[f"{fam}_oof"] = oof
            model_obj.fit(X_sel, y_tr)
            base_models.append({"family": fam, "model": model_obj})

        if oof_preds.empty:
            st.error("No base models built.")
            st.stop()

        # Rank by out-of-fold R², then by diversity (1 - mean |correlation|
        # with the other models' out-of-fold predictions).
        corr = oof_preds.corr().abs()
        div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
        cv_r2_est = {c: r2_score(y_tr, oof_preds[c]) for c in oof_preds.columns}

        summary_df = pd.DataFrame({
            "family": [c.replace("_oof", "") for c in oof_preds.columns],
            "cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
            "diversity": [div[c] for c in oof_preds.columns],
        }).sort_values(["cv_r2", "diversity"], ascending=[False, False])

        st.dataframe(summary_df.round(4))
        selected = summary_df.head(top_k)["family"].tolist()
        st.markdown(f"Selected for stacking (top {top_k}): {selected}")

        # Non-negative linear blend of the selected base models.
        meta = LinearRegression(positive=True)
        X_stack = oof_preds[[f"{s}_oof" for s in selected]].fillna(0)
        meta.fit(X_stack, y_tr)

        # Hold-out evaluation on rows none of the fitted objects have seen.
        fitted_by_family = {b["family"]: b["model"] for b in base_models}
        X_meta_val = pd.DataFrame({
            f"{fam}_oof": np.ravel(fitted_by_family[fam].predict(X_val))
            for fam in selected
        })[list(X_stack.columns)]
        y_meta_pred = meta.predict(X_meta_val)

        final_r2 = r2_score(y_val, y_meta_pred)
        final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
        st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")

        fig, ax = plt.subplots(figsize=(7, 4))
        ax.scatter(y_val, y_meta_pred, alpha=0.7)
        ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
        st.pyplot(fig, clear_figure=True)

        st.session_state["automl_summary"] = {
            "leaderboard": summary_df[["family", "cv_r2"]].to_dict(orient="records"),
            "final_r2": float(final_r2),
            "final_rmse": float(final_rmse),
            "target": target,
            "use_case": use_case,
        }

        st.markdown("---")
        st.subheader("Operator Advisory System — Real-Time Shift Recommendations")

        # The whole advisory is best-effort: SHAP's TreeExplainer only accepts
        # tree models and the LLM call needs network access and a token.
        try:
            top_base = next((b for b in base_models if b["family"] == selected[0]), None)
            if top_base and hasattr(top_base["model"], "predict"):
                sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
                model = top_base["model"]
                expl = shap.TreeExplainer(model)
                shap_vals = expl.shap_values(sample_X)
                if isinstance(shap_vals, list):
                    shap_vals = shap_vals[0]
                shap_vals = np.array(shap_vals)
                mean_abs = np.abs(shap_vals).mean(axis=0)
                mean_sign = np.sign(shap_vals).mean(axis=0)
                importance = pd.DataFrame({
                    "Feature": sample_X.columns,
                    "Mean |SHAP|": mean_abs,
                    "Mean SHAP Sign": mean_sign,
                }).sort_values("Mean |SHAP|", ascending=False)
                st.markdown("### Top 5 Operational Drivers")
                st.dataframe(importance.head(5))

                recommendations = []
                for _, row in importance.head(5).iterrows():
                    f, s = row["Feature"], row["Mean SHAP Sign"]
                    if s > 0.05:
                        recommendations.append(f"Increase `{f}` likely increases `{target}`")
                    elif s < -0.05:
                        recommendations.append(f"Decrease `{f}` likely increases `{target}`")
                    else:
                        recommendations.append(f"`{f}` neutral for `{target}`")
                st.session_state["shap_recommendations"] = recommendations
                st.markdown("### Suggested Operator Adjustments")
                # Markdown collapses single newlines; render one bullet per item.
                st.markdown("\n".join(f"- {rec}" for rec in recommendations))

                import requests

                # Check the environment first: reading st.secrets can raise
                # when no secrets file exists, which used to skip the
                # environment fallback entirely.
                HF_TOKEN = os.getenv("HF_TOKEN")
                if not HF_TOKEN:
                    try:
                        HF_TOKEN = st.secrets.get("HF_TOKEN")
                    except Exception:
                        HF_TOKEN = None

                if not HF_TOKEN:
                    st.error("HF_TOKEN not found in secrets or environment.")
                else:
                    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-70B-Instruct"
                    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
                    prompt = "\n".join([
                        "",
                        "You are an expert metallurgical process advisor.",
                        "Based on these recommendations:",
                        f"{recommendations}",
                        f"Target: {target}",
                        f"Use case: {use_case}",
                        "Summarize in three concise, professional lines what the operator should do this shift.",
                        "",
                    ])
                    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
                    with st.spinner("Generating operator note (Llama-3-70B)…"):
                        resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
                        # Surface HTTP errors instead of failing later on an
                        # unexpected body.
                        resp.raise_for_status()
                        body = resp.json()
                    text = ""
                    if isinstance(body, list) and body and isinstance(body[0], dict):
                        text = str(body[0].get("generated_text", "")).strip()
                    if text:
                        st.info(text)
                    else:
                        st.warning("The language model returned no text.")
            else:
                st.info("No suitable base model found.")
        except Exception as e:
            st.warning(f"Operator advisory skipped: {e}")
|
|
| |
# Tab 5: static table of use cases and their stated business leverage.
with tabs[5]:
    st.subheader("Business Impact Metrics")
    _impact_columns = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    _impact_rows = [
        ["EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable", "₹20–60 L/year"],
        ["Casting Optimization", "surface_temp / cooling_water_temp", "Controls billet quality", "₹50 L/year"],
        ["Rolling Mill", "energy_efficiency", "Energy optimization", "₹5–10 L/year"],
        ["Refractory Loss Prediction", "lining_thickness / heat_loss_rate", "Wear and downtime", "₹40 L/year"],
    ]
    target_table = pd.DataFrame(_impact_rows, columns=_impact_columns)
    st.dataframe(target_table, width="stretch")

# Tab 6: reference list as (title, authors, note, url) tuples.
with tabs[6]:
    st.subheader("Annotated Bibliography")
    refs = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Soft sensors validate `furnace_temp` and `tap_temp`.", "https://doi.org/10.1021/acsomega.4c01254"),
        ("Optimisation of Operator Support Systems", "Ojeda Roldán et al. (2022)", "Reinforcement learning for endpoint control.", "https://doi.org/10.3390/jmmp6020034"),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking", "Zhuo et al. (2024)", "Links arc power and energy KPIs.", "https://doi.org/10.3390/met15010113"),
        ("Dynamic EAF Modeling and Slag Foaming Index Prediction", "MacRosty et al.", "Supports refractory wear modeling.", "https://www.sciencedirect.com/science/article/pii/S0921883123004019"),
    ]
    for ref_title, ref_authors, ref_note, ref_url in refs:
        st.markdown(f"**[{ref_title}]({ref_url})** — *{ref_authors}* \n_{ref_note}_")
|
|
| |
# Tab 7: one download button per regular file found in LOG_DIR.
with tabs[7]:
    st.subheader("Download Saved Files")
    files = [
        entry for entry in os.listdir(LOG_DIR)
        if os.path.isfile(os.path.join(LOG_DIR, entry))
    ]
    if files:
        for file_name in sorted(files):
            file_path = os.path.join(LOG_DIR, file_name)
            with open(file_path, "rb") as fp:
                st.download_button(f"Download {file_name}", fp, file_name=file_name)
    else:
        st.info("No files yet — run AutoML first.")
|
|
| |
# Tab 8: show the master log and offer it for download.
with tabs[8]:
    st.subheader("Master Log")
    if os.path.exists(LOG_PATH):
        # The log is written as UTF-8, so read it the same way. ``with``
        # closes the handle; undecodable bytes are replaced rather than
        # crashing the page.
        with open(LOG_PATH, encoding="utf-8", errors="replace") as log_file:
            txt = log_file.read()
        st.text_area("Log Output", txt, height=400)
        st.download_button("Download Log", txt, file_name="run_master.log")
    else:
        st.info("No logs yet — run AutoML once.")

# Page footer.
st.markdown("---")
st.markdown("**Note:** Synthetic demo dataset for educational use only. Real deployment requires plant data, NDA, and safety validation.")