Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import time | |
| from datetime import datetime | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import joblib | |
| # ML imports | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor | |
| from sklearn.preprocessing import StandardScaler, PolynomialFeatures | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| # SHAP | |
| import shap | |
| # ------------------------- | |
| # Config & paths | |
| # ------------------------- | |
# Configure the Streamlit page (title shown in the browser tab, wide layout).
st.set_page_config(
    page_title="AI Feature Universe Explorer — Advanced + SHAP",
    layout="wide",
)

# Every artifact is stored under DATA_DIR, which can be redirected through the
# DATA_DIR environment variable; the directory is created if it is missing.
DATA_DIR = os.getenv("DATA_DIR", "./data")
os.makedirs(DATA_DIR, exist_ok=True)

# Dataset CSV, feature metadata, bibliography PDF and ensemble model file.
CSV_PATH, META_PATH, PDF_PATH, ENSEMBLE_ARTIFACT = (
    os.path.join(DATA_DIR, artifact_name)
    for artifact_name in (
        "flatfile_universe_advanced.csv",
        "feature_metadata_advanced.json",
        "annotated_bibliography.pdf",
        "ensemble_models.joblib",
    )
)
| # ------------------------- | |
| # Utility: generate advanced dataset if missing | |
| # ------------------------- | |
def generate_advanced_flatfile(
    n_rows=3000,
    random_seed=42,
    max_polynomial_new=60,
    global_variance_multiplier=1.0,
    variance_overrides=None,
):
    """
    Generate a synthetic, physics-aligned dataset with many engineered features.

    The resulting frame is written to CSV_PATH, one metadata record per column
    is written to META_PATH, and an annotated bibliography is written to
    PDF_PATH (or to a ``.txt`` file next to it when the PDF cannot be built).

    Args:
        n_rows: number of samples.
        random_seed: seed passed to ``np.random.seed``.
        max_polynomial_new: upper limit on the number of polynomial expansion
            columns that are kept.
        global_variance_multiplier: multiplier applied to every default
            standard deviation that has no override.
        variance_overrides: dict mapping a sampling key (or a substring of one)
            to the standard deviation to use. The value replaces the default
            standard deviation outright and is NOT scaled by
            ``global_variance_multiplier``. Keys are matched against the key
            each sampling rule looks up (e.g. ``"furnace_temp"``,
            ``"vibration"``, ``"chemical"``), which is not always the column
            name itself.

    Returns:
        Tuple ``(CSV_PATH, META_PATH, PDF_PATH)``. PDF_PATH is returned even
        when only the ``.txt`` fallback could be written.
    """
    # Local import: only needed for the timezone-aware PDF timestamp.
    from datetime import timezone

    np.random.seed(random_seed)
    os.makedirs(DATA_DIR, exist_ok=True)
    if variance_overrides is None:
        variance_overrides = {}

    # --- base natural features across 8 use cases (expanded)
    natural_feats = [
        "vibration_x","vibration_y","motor_current","rpm","bearing_temp","ambient_temp","lube_pressure","power_factor",
        "furnace_temp","tap_temp","slag_temp","offgas_co","offgas_co2","o2_probe_pct","c_feed_rate","arc_power","furnace_pressure","feed_time",
        "mold_temp","casting_speed","nozzle_pressure","cooling_water_temp","billet_length","chemical_C","chemical_Mn","chemical_Si","chemical_S",
        "roll_speed","motor_load","coolant_flow","exit_temp","strip_thickness","line_tension","roller_vibration",
        "lighting_intensity","surface_temp","image_entropy_proxy",
        "spectro_Fe","spectro_C","spectro_Mn","spectro_Si","time_since_last_sample",
        "batch_id_numeric","weight_input","weight_output","time_in_queue","conveyor_speed",
        "shell_temp","lining_thickness","water_flow","cooling_out_temp","heat_flux"
    ]
    natural_feats = list(dict.fromkeys(natural_feats))  # dedupe, keeping order
    natural_set = set(natural_feats)

    # helper: compute adjusted stddev
    def effective_sd(feature_name, base_sd):
        # exact name override
        if feature_name in variance_overrides:
            return float(variance_overrides[feature_name])
        # substring override
        for key, val in variance_overrides.items():
            if key in feature_name:
                return float(val)
        # fallback: scaled base
        return float(base_sd) * float(global_variance_multiplier)

    # helper sampling heuristics
    def sample_col(name, n):
        name_l = name.lower()
        # The specific temperature sensors are matched BEFORE the generic
        # "*_temp" rule; in the reverse order that rule shadows them and every
        # temperature column is drawn around 1550.
        if "bearing_temp" in name_l:
            sd = effective_sd("bearing_temp", 5)
            return np.random.normal(65, sd, n)
        # tap_temp is deliberately left to the furnace-level rule below, which
        # names it explicitly.
        if name_l in ("mold_temp", "shell_temp", "cooling_out_temp", "exit_temp"):
            sd = effective_sd(name_l, 30)
            return np.random.normal(1500 if "mold" in name_l else 200, sd, n)
        if "furnace_temp" in name_l or "tap_temp" in name_l or name_l.endswith("_temp"):
            sd = effective_sd("furnace_temp", 50)
            return np.random.normal(1550, sd, n)
        # offgas_co2 must be tested before offgas_co (substring of it)
        if "offgas_co2" in name_l:
            sd = effective_sd("offgas_co2", 4)
            return np.abs(np.random.normal(15, sd, n))
        if "offgas_co" in name_l:
            sd = effective_sd("offgas_co", 5)
            return np.abs(np.random.normal(20, sd, n))
        if "o2" in name_l:
            sd = effective_sd("o2_probe_pct", 1)
            return np.clip(np.random.normal(5, sd, n), 0.01, 60)
        if "arc_power" in name_l or "motor_load" in name_l:
            sd = effective_sd("arc_power", 120)
            return np.abs(np.random.normal(600, sd, n))
        if "rpm" in name_l:
            sd = effective_sd("rpm", 30)
            return np.abs(np.random.normal(120, sd, n))
        if "vibration" in name_l:
            sd = effective_sd("vibration", 0.15)
            return np.abs(np.random.normal(0.4, sd, n))
        if "chemical" in name_l or "spectro" in name_l:
            sd = effective_sd("chemical", 0.15)
            return np.random.normal(0.7, sd, n)
        if "weight" in name_l:
            sd = effective_sd("weight", 100)
            return np.random.normal(1000, sd, n)
        if "conveyor_speed" in name_l or "casting_speed" in name_l:
            sd = effective_sd("casting_speed", 0.6)
            return np.random.normal(2.5, sd, n)
        if "power_factor" in name_l:
            sd = effective_sd("power_factor", 0.03)
            return np.clip(np.random.normal(0.92, sd, n), 0.6, 1.0)
        if "image_entropy_proxy" in name_l:
            sd = effective_sd("image_entropy_proxy", 0.25)
            return np.abs(np.random.normal(0.5, sd, n))
        if "batch_id" in name_l:
            return np.random.randint(1000, 9999, n)
        if "time_since" in name_l or "time_in_queue" in name_l:
            sd = effective_sd("time_since", 20)
            return np.abs(np.random.normal(30, sd, n))
        if "heat_flux" in name_l:
            sd = effective_sd("heat_flux", 300)
            return np.abs(np.random.normal(1000, sd, n))
        return np.random.normal(0, effective_sd(name_l, 1), n)

    # build DataFrame
    df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})

    # timestamps & metadata ("min" is the non-deprecated spelling of the
    # one-minute frequency alias "T")
    start = pd.Timestamp("2025-01-01T00:00:00")
    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")
    df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
    df["meta_plant_name"] = np.random.choice(["Rourkela","Jamshedpur","VSP","Bokaro","Kalinganagar","Salem"], n_rows)
    df["meta_country"] = "India"

    # --- synthetic features: physics informed proxies
    df["carbon_proxy"] = df["offgas_co"] / (df["offgas_co2"] + 1.0)
    df["oxygen_utilization"] = df["offgas_co2"] / (df["offgas_co"] + 1.0)
    df["power_density"] = df["arc_power"] / (df["weight_input"] + 1.0)
    df["energy_efficiency"] = df["furnace_temp"] / (df["arc_power"] + 1.0)
    df["slag_foaming_index"] = (df["slag_temp"] * df["offgas_co"]) / (df["o2_probe_pct"] + 1.0)
    df["yield_ratio"] = df["weight_output"] / (df["weight_input"] + 1e-9)

    # rolling stats, lags, rocs for a prioritized set
    rolling_cols = ["arc_power","furnace_temp","offgas_co","offgas_co2","motor_current","vibration_x","weight_input"]
    for rc in rolling_cols:
        if rc in df.columns:
            df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
            df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
            # .bfill() replaces the deprecated fillna(method="bfill")
            df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
            df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)

    # interaction & polynomial-lite
    df["arc_o2_interaction"] = df["arc_power"] * df["o2_probe_pct"]
    df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
    df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)

    # polynomial features limited to first 12 numeric columns
    numeric = df.select_dtypes(include=[np.number]).fillna(0)
    poly_source_cols = numeric.columns[:12].tolist()
    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    poly_mat = poly.fit_transform(numeric[poly_source_cols])
    poly_names = poly.get_feature_names_out(poly_source_cols)
    poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
    # drop the degree-1 terms (they duplicate the source columns), then cap
    keep_poly = [c for c in poly_df.columns if c.replace("poly__", "") not in poly_source_cols]
    poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new]
    df = pd.concat([df, poly_df], axis=1)

    # PCA embeddings across numeric sensors (computed on the numeric snapshot
    # taken before the polynomial columns were appended)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numeric)
    pca = PCA(n_components=6, random_state=42)
    pca_cols = pca.fit_transform(scaled)
    pca_df = pd.DataFrame(
        pca_cols,
        columns=[f"pca_{i + 1}" for i in range(pca_cols.shape[1])],
        index=df.index,
    )
    df = pd.concat([df, pca_df], axis=1)

    # KMeans cluster label for operating mode
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    df["operating_mode"] = kmeans.fit_predict(scaled)

    # surrogate models
    surrogate_df = df.copy()
    # .ffill() replaces the deprecated fillna(method="ffill")
    surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).ffill()
    features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
    if len(features_for_surrogate) >= 2:
        X = surrogate_df[features_for_surrogate].fillna(0)
        y = surrogate_df["furnace_temp_next"]
        rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
        rf.fit(X, y)
        df["pred_temp_30s"] = rf.predict(X)
    else:
        df["pred_temp_30s"] = df["furnace_temp"]

    if all(c in df.columns for c in ["offgas_co","offgas_co2","o2_probe_pct"]):
        X2 = df[["offgas_co","offgas_co2","o2_probe_pct"]].fillna(0)
        rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
        rf2.fit(X2, df["carbon_proxy"])
        df["pred_carbon_5min"] = rf2.predict(X2)
    else:
        df["pred_carbon_5min"] = df["carbon_proxy"]

    # safety indices & flags
    df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
    df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)

    # rule-based target
    df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
    df["prediction_confidence"] = np.clip(np.random.beta(2, 5, n_rows), 0.05, 0.99)

    # clean NaN and infinite: back-fill first, then zero whatever is left
    df = df.replace([np.inf, -np.inf], np.nan).bfill().fillna(0)

    # save CSV & metadata
    df.to_csv(CSV_PATH, index=False)

    def source_of(col):
        if col in natural_set:
            return "natural"
        if col.startswith("poly__") or col.startswith("pca_") or col in ["operating_mode"]:
            return "advanced_synthetic"
        return "synthetic"

    meta = []
    for col in df.columns:
        source = source_of(col)
        meta.append({
            "feature_name": col,
            "source_type": source,
            "linked_use_cases": ["All" if source != "natural" else "Mapped"],
            "units": "-",
            "formula": "see generator logic",
            "remarks": "auto-generated or simulated"
        })
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    # annotated bibliography (best effort: any failure falls back to a .txt note)
    try:
        from fpdf import FPDF
        pdf = FPDF('P','mm','A4')
        pdf.add_page()
        pdf.set_font("Helvetica","B",14)
        pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
        pdf.ln(2)
        pdf.set_font("Helvetica","",10)
        pdf.cell(0,6,"Generated: " + datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), ln=True)
        pdf.ln(4)
        bib_items = [
            ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
            ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
            ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
            ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
            ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
        ]
        for title, auth, note in bib_items:
            pdf.set_font("Helvetica","B",11)
            # Plain ASCII hyphen: the built-in Helvetica font is Latin-1 only,
            # so an em dash here makes the PDF generation fail.
            pdf.multi_cell(0,6, f"{title} - {auth}")
            pdf.set_font("Helvetica","",10)
            pdf.multi_cell(0,5, f"Notes: {note}")
            pdf.ln(2)
        pdf.output(PDF_PATH)
    except Exception as e:
        with open(PDF_PATH.replace(".pdf",".txt"), "w", encoding="utf-8") as tf:
            tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
            # Record the real cause; it is not always a missing fpdf package.
            tf.write(f"Reason: {e!r}\n")

    return CSV_PATH, META_PATH, PDF_PATH
| # ------------------------- | |
| # Ensure dataset exists | |
| # ------------------------- | |
# Build the dataset on first run: both the CSV and its metadata must be present,
# otherwise everything is regenerated.
if not (os.path.exists(CSV_PATH) and os.path.exists(META_PATH)):
    with st.spinner("Generating synthetic features (this may take ~20-60s)..."):
        CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
            n_rows=3000,
            random_seed=42,
            max_polynomial_new=80,
        )
        st.success(f"Generated dataset and metadata: {CSV_PATH}")
| # ------------------------- | |
| # Load data & metadata (cached) | |
| # ------------------------- | |
def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
    """Read the feature table and its metadata from disk.

    Args:
        csv_path: path of the CSV holding the feature table.
        meta_path: path of the JSON file holding the metadata records.

    Returns:
        Tuple ``(features, metadata)``: the CSV as a DataFrame and the JSON
        records wrapped in a DataFrame.
    """
    feature_frame = pd.read_csv(csv_path)
    with open(meta_path, "r") as meta_file:
        meta_records = json.load(meta_file)
    meta_frame = pd.DataFrame(meta_records)
    return feature_frame, meta_frame
df, meta_df = load_data()

# -------------------------
# Sidebar filters & UI
# -------------------------
st.sidebar.title("Feature Explorer - Advanced + SHAP")
feat_types = sorted(meta_df["source_type"].unique().tolist())
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# -------------------------
# Main tabs
# -------------------------
st.title("Steel Authority of India Limited (SHAP-enabled)")
tabs = st.tabs([
    "Features",
    "Visualize",
    "Correlations",
    "Stats",
    "Ensemble + SHAP",
    "Target & Business Impact",
    "Bibliography"
])

# ----- Features tab: metadata table filtered by the sidebar selection
with tabs[0]:
    st.subheader("Feature metadata")
    filtered_meta = meta_df[meta_df["source_type"].isin(selected_types)]
    st.dataframe(filtered_meta[["feature_name","source_type","formula","remarks"]].rename(columns={"feature_name":"Feature"}), height=400)
    st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")

# ----- Visualize tab: histogram + summary of one numeric column
with tabs[1]:
    st.subheader("Feature visualization")
    col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
    bins = st.slider("Histogram bins", 10, 200, 50)
    fig, ax = plt.subplots(figsize=(8,4))
    sns.histplot(df[col], bins=bins, kde=True, ax=ax)
    ax.set_title(col)
    st.pyplot(fig)
    # Streamlit re-executes this script on every interaction; figures created
    # through pyplot stay registered until closed, so release it explicitly.
    plt.close(fig)
    st.write(df[col].describe().to_frame().T)

# ----- Correlations tab: heatmap over a user-chosen set of columns
with tabs[2]:
    st.subheader("Correlation explorer")
    default_corr = numeric_cols[:20]
    corr_sel = st.multiselect("Select features (min 2)", numeric_cols, default=default_corr)
    if len(corr_sel) >= 2:
        corr = df[corr_sel].corr()
        fig, ax = plt.subplots(figsize=(10,8))
        sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
        st.pyplot(fig)
        plt.close(fig)  # same reason as above: avoid accumulating open figures
    else:
        st.info("Choose at least 2 numeric features to compute correlation.")

# ----- Stats tab
with tabs[3]:
    st.subheader("Summary statistics (numeric features)")
    st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
| # ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) ----- | |
with tabs[4]:
    st.subheader(" AutoML Ensemble — Expanded Families + Stacking + SHAP")

    # --- Step 0: High-level Use Case (keeps previous defaults) ---
    st.markdown("### Choose Industrial Use Case ")
    use_case = st.selectbox(
        "Select Use Case",
        [
            "Predictive Maintenance",
            "EAF Data Intelligence",
            "Casting Quality Optimization",
            "Rolling Mill Energy Optimization",
            "Surface Defect Detection (Vision AI)",
            "Material Composition & Alloy Mix AI",
            "Inventory & Yield Optimization",
            "Refractory & Cooling Loss Prediction"
        ],
        index=1
    )

    # Map use-case -> default target column and suggested model family
    use_case_config = {
        "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
        "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
        "Casting Quality Optimization": {"target": "surface_temp" if "surface_temp" in numeric_cols else "furnace_temp", "model_hint": "GradientBoosting"},
        "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
        "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
        "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
        "Inventory & Yield Optimization": {"target": "yield_ratio", "model_hint": "GradientBoosting"},
        "Refractory & Cooling Loss Prediction": {"target": "lining_thickness", "model_hint": "ExtraTrees"},
    }
    cfg = use_case_config.get(use_case, {"target": numeric_cols[0], "model_hint": "RandomForest"})
    target = cfg["target"]
    model_hint = cfg["model_hint"]

    # --- Feature auto-suggestion ---
    # The target itself must never be offered as an input: it would leak the
    # answer into X, and df[features + [target]] would then contain the column
    # twice, which turns y into a two-column DataFrame.
    candidate_features = [c for c in numeric_cols if c != target]
    # NOTE(review): columns derived from the target (lags, rolling means,
    # polynomial terms) can still be selected and remain strong proxies for it.
    target_tokens = target.split('_')
    suggested = [c for c in candidate_features if any(k in c for k in target_tokens)]
    if len(suggested) < 6:
        suggested = [c for c in candidate_features if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
    if len(suggested) < 6:
        suggested = [c for c in numeric_cols[:50] if c != target]
    features = st.multiselect("Model input features (auto-suggested)", candidate_features, default=suggested)
    st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")

    # --- Data sampling controls ---
    max_rows = min(df.shape[0], 20000)
    sample_size = st.slider("Sample rows (train speed vs fidelity)", 500, max_rows, min(1500, max_rows), step=100)
    sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
    X = sub_df[features].fillna(0)
    y = sub_df[target].fillna(0)

    # --- Ensemble control UI ---
    st.markdown("### Ensemble & AutoML Settings")
    max_trials = st.slider("Optuna trials per family (total trials grow with families)", 5, 80, 20, step=5)
    top_k = st.slider("Max base models to keep in final ensemble", 2, 8, 5)
    allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost, TabPFN if installed)", value=True)

    # --- Conditional imports (graceful fallbacks) ---
    # Each optional package is probed on its own; any import failure simply
    # marks the family as unavailable instead of stopping the app.
    available_models = ["RandomForest", "ExtraTrees"]  # always available (sklearn)
    optional_families = {}
    if allow_advanced:
        try:
            import xgboost as xgb
            optional_families["XGBoost"] = True
            available_models.append("XGBoost")
        except Exception:
            optional_families["XGBoost"] = False
        try:
            import lightgbm as lgb
            optional_families["LightGBM"] = True
            available_models.append("LightGBM")
        except Exception:
            optional_families["LightGBM"] = False
        try:
            import catboost as cb
            optional_families["CatBoost"] = True
            available_models.append("CatBoost")
        except Exception:
            optional_families["CatBoost"] = False
        try:
            # TabPFN is often packaged differently; attempt import but it's optional
            import tabpfn
            optional_families["TabPFN"] = True
            available_models.append("TabPFN")
        except Exception:
            optional_families["TabPFN"] = False
        try:
            # FT-Transformer optional
            from pytorch_tabular.models import transformers  # may not be installed
            optional_families["FTTransformer"] = True
            available_models.append("FTTransformer")
        except Exception:
            optional_families["FTTransformer"] = False
    st.markdown(f"Available model families: {', '.join(available_models)}")

    # --- Optuna tuning routine per family ---
    import optuna
    from sklearn.model_selection import cross_val_score, KFold
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
    from sklearn.linear_model import Ridge
    from sklearn.neural_network import MLPRegressor
    from sklearn.metrics import r2_score, mean_squared_error
def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
    """Tune one model family using Optuna.

    Args:
        family_name: one of "RandomForest", "ExtraTrees", "XGBoost",
            "LightGBM", "CatBoost", "MLP" or "TabPFN". Any other name, or an
            optional family whose package is not flagged in
            ``optional_families``, is tuned as a fixed fallback RandomForest.
        X_local: feature matrix used for cross-validation.
        y_local: target values aligned with ``X_local``.
        n_trials: number of Optuna trials.
        random_state: seed for the Optuna sampler and for every model built
            here, so repeated runs give the same result.

    Returns:
        dict with keys ``model_obj`` (unfitted estimator, or the string
        ``"TabPFN_placeholder"``), ``cv_score`` (mean 3-fold R², -999.0 when
        scoring failed), ``best_params``, ``family`` and ``study``.
    """
    family_enabled = bool(optional_families.get(family_name))

    def fallback_model():
        # small RandomForest used whenever the requested family cannot be built
        return RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1)

    def suggest_params(trial):
        """Sample one hyper-parameter set for ``family_name`` (empty for fallbacks)."""
        if family_name in ("RandomForest", "ExtraTrees"):
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 800),
                "max_depth": trial.suggest_int("max_depth", 4, 30),
            }
        if family_name == "XGBoost" and family_enabled:
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if family_name == "LightGBM" and family_enabled:
            return {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 16),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if family_name == "CatBoost" and family_enabled:
            return {
                "iterations": trial.suggest_int("iterations", 200, 1000),
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            }
        if family_name == "MLP":
            return {
                "hidden_layer_sizes": trial.suggest_int("hidden_layer_sizes", 32, 512, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True),
            }
        return {}

    def build_model(params):
        """Instantiate ``family_name`` from ``params``; missing keys use defaults.

        Shared by the objective and the final instantiation so both always
        construct the estimator the same way.
        """
        if family_name == "RandomForest":
            return RandomForestRegressor(n_estimators=params.get("n_estimators", 200), max_depth=params.get("max_depth", 8), n_jobs=-1, random_state=random_state)
        if family_name == "ExtraTrees":
            return ExtraTreesRegressor(n_estimators=params.get("n_estimators", 200), max_depth=params.get("max_depth", 8), n_jobs=-1, random_state=random_state)
        if family_name == "XGBoost" and family_enabled:
            return xgb.XGBRegressor(n_estimators=params.get("n_estimators", 200), max_depth=params.get("max_depth", 6), learning_rate=params.get("learning_rate", 0.1), tree_method="hist", verbosity=0, random_state=random_state, n_jobs=1)
        if family_name == "LightGBM" and family_enabled:
            return lgb.LGBMRegressor(n_estimators=params.get("n_estimators", 200), max_depth=params.get("max_depth", 8), learning_rate=params.get("learning_rate", 0.1), n_jobs=1, random_state=random_state)
        if family_name == "CatBoost" and family_enabled:
            return cb.CatBoostRegressor(iterations=params.get("iterations", 200), depth=params.get("depth", 6), learning_rate=params.get("learning_rate", 0.1), verbose=0, random_state=random_state)
        if family_name == "MLP":
            return MLPRegressor(hidden_layer_sizes=(params.get("hidden_layer_sizes", 128),), learning_rate_init=params.get("learning_rate_init", 0.001), max_iter=500, random_state=random_state)
        if family_name == "TabPFN" and family_enabled:
            # No estimator is built here; the caller recognises this marker
            # and substitutes its own model at training time.
            return "TabPFN_placeholder"
        return fallback_model()

    def obj(trial):
        if family_name == "TabPFN" and family_enabled:
            # nothing to tune for the placeholder; report a neutral score
            return 0.0
        candidate = build_model(suggest_params(trial))
        # R2 is kept as the objective for generality
        try:
            scores = cross_val_score(candidate, X_local, y_local, scoring="r2", cv=3, n_jobs=1)
            return float(np.mean(scores))
        except Exception:
            # best effort: a failing configuration is scored as very bad
            # instead of aborting the whole study
            return -999.0

    # Seed the sampler so the search itself is reproducible.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
    best = study.best_trial.params if study.trials else {}

    # instantiate best model
    try:
        model = build_model(best)
    except Exception:
        model = fallback_model()

    # compute cross-validated score for the best model (the placeholder string
    # cannot be scored and therefore ends up with -999.0)
    try:
        score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3, n_jobs=1)))
    except Exception:
        score = -999.0

    return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name, "study": study}
| # --- Run tuning across available families (user triggered) --- | |
| run_btn = st.button(" Run expanded AutoML + Stacking") | |
| if run_btn: | |
| with st.spinner("Tuning multiple families (this may take a while depending on choices)..."): | |
| families_to_try = ["RandomForest", "ExtraTrees", "MLP"] | |
| if allow_advanced: | |
| if optional_families.get("XGBoost"): families_to_try.append("XGBoost") | |
| if optional_families.get("LightGBM"): families_to_try.append("LightGBM") | |
| if optional_families.get("CatBoost"): families_to_try.append("CatBoost") | |
| if optional_families.get("TabPFN"): families_to_try.append("TabPFN") | |
| if optional_families.get("FTTransformer"): families_to_try.append("FTTransformer") | |
| tuned_results = [] | |
| for fam in families_to_try: | |
| st.caption(f"Tuning family: {fam}") | |
| res = tune_family(fam, X, y, n_trials=max_trials) | |
| # res can be dict or single-run result; ensure consistent format | |
| if isinstance(res, dict) and "model_obj" in res: | |
| tuned_results.append(res) | |
| else: | |
| st.warning(f"Family {fam} returned unexpected tune result: {res}") | |
| # build leaderboard DataFrame | |
| lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results]) | |
| lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True) | |
| st.markdown("### Tuning Leaderboard (by CV R²)") | |
| st.dataframe(lb[["family","cv_r2"]].round(4)) | |
| # --- Build base-models and collect out-of-fold preds for stacking --- | |
| st.markdown("### Building base models & out-of-fold predictions for stacking") | |
| kf = KFold(n_splits=5, shuffle=True, random_state=42) | |
| base_models = [] | |
| oof_preds = pd.DataFrame(index=X.index) | |
| for idx, row in lb.iterrows(): | |
| fam = row["family"] | |
| model_entry = next((r for r in tuned_results if r["family"] == fam), None) | |
| if model_entry is None: | |
| continue | |
| model_obj = model_entry["model_obj"] | |
| # train out-of-fold predictions | |
| oof = np.zeros(X.shape[0]) | |
| for tr_idx, val_idx in kf.split(X): | |
| X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx] | |
| y_tr = y.iloc[tr_idx] | |
| # fit family-specific wrapper (TabPFN/FTTransformer special-case) | |
| if model_obj == "TabPFN_placeholder": | |
| try: | |
| # TabPFN expects specific API; create a simple fallback: use RandomForest to approximate | |
| tmp = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1) | |
| tmp.fit(X_tr, y_tr) | |
| oof[val_idx] = tmp.predict(X_val) | |
| except Exception: | |
| oof[val_idx] = np.mean(y_tr) | |
| else: | |
| try: | |
| model_obj.fit(X_tr, y_tr) | |
| oof[val_idx] = model_obj.predict(X_val) | |
| except Exception: | |
| # fallback to mean | |
| oof[val_idx] = np.mean(y_tr) | |
| oof_preds[f"{fam}_oof"] = oof | |
| # finally fit model on full data | |
| try: | |
| if model_entry["model_obj"] == "TabPFN_placeholder": | |
| # fallback full-model: RandomForest | |
| fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1) | |
| fitted.fit(X, y) | |
| else: | |
| model_entry["model_obj"].fit(X, y) | |
| fitted = model_entry["model_obj"] | |
| except Exception: | |
| fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1) | |
| fitted.fit(X, y) | |
| base_models.append({"family": fam, "model": fitted, "cv_r2": model_entry["cv_score"]}) | |
| # --- prune highly correlated OOF preds and keep top_k diverse models --- | |
| if oof_preds.shape[1] == 0: | |
| st.error("No base models created — aborting stacking.") | |
| else: | |
| corr_matrix = oof_preds.corr().abs() | |
| # compute diversity score = (1 - mean correlation with others) | |
| diversity = {col: 1 - corr_matrix[col].drop(col).mean() for col in corr_matrix.columns} | |
| summary = [] | |
| for bm in base_models: | |
| col = f"{bm['family']}_oof" | |
| summary.append({"family": bm["family"], "cv_r2": bm["cv_r2"], "diversity": diversity.get(col, 0.0)}) | |
| summary_df = pd.DataFrame(summary).sort_values(["cv_r2", "diversity"], ascending=[False, False]).reset_index(drop=True) | |
| st.markdown("### Base Model Summary (cv_r2, diversity)") | |
| st.dataframe(summary_df.round(4)) | |
| # select top_k by cv_r2 and diversity combined | |
| selected = summary_df.sort_values(["cv_r2","diversity"], ascending=[False, False]).head(top_k)["family"].tolist() | |
| st.markdown(f"Selected for stacking (top {top_k}): {selected}") | |
| # build stacking training data (OOF preds for selected) | |
| selected_cols = [f"{s}_oof" for s in selected] | |
| X_stack = oof_preds[selected_cols].fillna(0) | |
| meta = Ridge(alpha=1.0) | |
| meta.fit(X_stack, y) | |
# evaluate stacked ensemble on a holdout split
# NOTE(review): the base models and the meta-learner were fitted on all of
# X / y before this split, so the "holdout" rows are not unseen and the
# scores below are optimistic.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# predict with base models -> create meta inputs (one column per selected
# family; a family that cannot predict contributes the training mean)
meta_inputs = []
for fam in selected:
    bm = next((b for b in base_models if b["family"] == fam), None)
    if bm is not None:
        try:
            meta_inputs.append(bm["model"].predict(X_val))
        except Exception:
            meta_inputs.append(np.full(len(X_val), y_tr.mean()))
    else:
        meta_inputs.append(np.full(len(X_val), y_tr.mean()))
X_meta_val = np.column_stack(meta_inputs)
y_meta_pred = meta.predict(X_meta_val)
final_r2 = r2_score(y_val, y_meta_pred)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn releases.
final_rmse = float(np.sqrt(mean_squared_error(y_val, y_meta_pred)))
c1, c2 = st.columns(2)
c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")
| # scatter plot | |
| fig, ax = plt.subplots(figsize=(7,4)) | |
| ax.scatter(y_val, y_meta_pred, alpha=0.6) | |
| ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--") | |
| ax.set_xlabel("Actual"); ax.set_ylabel("Stacked Predicted") | |
| st.pyplot(fig) | |
| # save artifacts: base models list + meta learner | |
| stack_artifact = os.path.join(DATA_DIR, f"stacked_{use_case.replace(' ','_')}.joblib") | |
| to_save = {"base_models": {bm["family"]: bm["model"] for bm in base_models if bm["family"] in selected}, "meta": meta, "features": features, "selected": selected, "target": target} | |
| joblib.dump(to_save, stack_artifact) | |
| st.caption(f"Stacked ensemble saved: {stack_artifact}") | |
| # --- SHAP on final stack: approximate by SHAP of top base model or meta contributions --- | |
| st.markdown("### Explainability (approximate)") | |
| try: | |
| # Prefer SHAP on top base model (tree) for interpretability | |
| top_base = next((b for b in base_models if b["family"] == selected[0]), None) | |
| if top_base is not None and hasattr(top_base["model"], "predict"): | |
| # sample for speed | |
| sample_X = X_val.sample(min(300, len(X_val)), random_state=42) | |
| if hasattr(top_base["model"], "predict") and ("XGBoost" in top_base["family"] or "LightGBM" in top_base["family"] or "RandomForest" in top_base["family"] or "ExtraTrees" in top_base["family"] or "CatBoost" in top_base["family"]): | |
| expl = None | |
| # safe tree explainer creation | |
| try: | |
| expl = shap.TreeExplainer(top_base["model"]) | |
| shap_vals = expl.shap_values(sample_X) | |
| fig_sh = plt.figure(figsize=(8,6)) | |
| shap.summary_plot(shap_vals, sample_X, show=False) | |
| st.pyplot(fig_sh) | |
| except Exception as e: | |
| st.warning(f"SHAP tree explainer unavailable: {e}") | |
| else: | |
| st.info("Top base model not tree-based; SHAP summary skipped. You can inspect per-base feature importances above.") | |
| else: | |
| st.info("No suitable base model for SHAP explanation found.") | |
| except Exception as e: | |
| st.warning(f"SHAP step failed gracefully: {e}") | |
| st.success("AutoML + Stacking complete. Review metrics and saved artifacts.") | |
# ----- Target & Business Impact tab
with tabs[5]:
    # Static reference content: one table mapping use cases to suggested
    # target variables, and one table of indicative business metrics.
    st.subheader("Recommended Target Variables by Use Case")
    st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")

    target_columns = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    target_rows = [
        ["Predictive Maintenance (Mills, Motors, Compressors)", "bearing_temp / time_to_failure", "Rises before mechanical failure; early warning", "₹10–30 L per asset/year"],
        ["Blast Furnace / EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable, linked to energy and quality", "₹20–60 L/year"],
        ["Casting Quality Optimization", "defect_probability / solidification_rate", "Determines billet quality; control nozzle & cooling", "₹50 L/year yield gain"],
        ["Rolling Mill Energy Optimization", "energy_per_ton / exit_temp", "Directly tied to energy efficiency", "₹5–10 L/year per kWh/t"],
        ["Surface Defect Detection (Vision AI)", "defect_probability", "Quality metric from CNN", "1–2 % yield gain"],
        ["Material Composition & Alloy Mix AI", "deviation_from_target_grade", "Predict deviation, suggest corrections", "₹20 L/year raw material savings"],
        ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
        ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"],
    ]
    target_table = pd.DataFrame(target_rows, columns=target_columns)
    st.dataframe(target_table, use_container_width=True)

    st.markdown("---")

    st.subheader("Business Framing for Clients")
    st.markdown("These metrics show approximate annual benefits from small process improvements.")

    business_columns = ["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"]
    business_rows = [
        ["Energy consumption", "400 kWh/ton", "₹35–60 L"],
        ["Electrode wear", "1.8 kg/ton", "₹10 L"],
        ["Refractory wear", "3 mm/heat", "₹15 L"],
        ["Oxygen usage", "40 Nm³/ton", "₹20 L"],
        ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
    ]
    business_table = pd.DataFrame(business_rows, columns=business_columns)
    st.dataframe(business_table, use_container_width=True)

    st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
# ----- Bibliography tab
with tabs[6]:
    # Static reference content: a table of supporting papers followed by a
    # bullet list mapping research themes to engineered feature names.
    st.subheader("Annotated Bibliography & Feature Justification")
    st.markdown("""
This section summarizes published research supporting the feature design and modeling choices.
""")

    bib_columns = ["Paper Title", "Authors / Year", "Relevance to Feature Engineering"]
    # Each entry: (title, authors/year, relevance note).
    bib_data = [
        (
            "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
            "Yan et al. (2024)",
            "Supports gas proxies, lags, PCA for off-gas and temperature correlation.",
        ),
        (
            "Optimisation of Oxygen Blowing Process using RL",
            "Ojeda Roldan et al. (2022)",
            "Reinforcement learning for oxygen control; motivates surrogate predicted states & safety indices.",
        ),
        (
            "Analyzing the Energy Efficiency of Electric Arc Furnace",
            "Zhuo et al. (2024)",
            "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features.",
        ),
        (
            "BOF/Endpoint Prediction Techniques",
            "Springer (2024)",
            "Endpoint prediction; supports temporal lags and cycle encoding.",
        ),
        (
            "Dynamic EAF Modeling & Slag Foaming",
            "MacRosty et al.",
            "Physics priors for slag_foaming_index and refractory health modeling.",
        ),
    ]
    bib_df = pd.DataFrame(bib_data, columns=bib_columns)
    st.dataframe(bib_df, use_container_width=True)

    st.markdown("""
**Feature-to-Research Mapping Summary:**
- Gas probes & soft-sensing → `carbon_proxy`, `oxygen_utilization`
- Power & energy proxies → `power_density`, `energy_efficiency`
- Temporal features → rolling means, lags, cycle progress indicators
- Surrogate features → `pred_temp_30s`, `pred_carbon_5min`
- PCA / clustering → operating mode compression
""")
# -------------------------
# Footer / Notes
# -------------------------
# Page-wide footer: a horizontal rule followed by a disclaimer about the data.
footer_note = (
    "**Notes:** This dataset is synthetic and for demo/prototyping. "
    "Real plant integration requires NDA, data on-boarding, sensor mapping, "
    "and plant safety checks before any control actions."
)
st.markdown("---")
st.markdown(footer_note)