singhn9 committed on
Commit
f944dac
·
verified ·
1 Parent(s): a1eb3a8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +518 -36
src/streamlit_app.py CHANGED
@@ -1,40 +1,522 @@
1
- import altair as alt
 
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+
2
+
3
+ import os
4
+ import json
5
+ import time
6
+ from datetime import datetime
7
  import numpy as np
8
  import pandas as pd
9
  import streamlit as st
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
+ import joblib
13
+
14
+ # ML imports
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.linear_model import LinearRegression
17
+ from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
18
+ from sklearn.preprocessing import StandardScaler, PolynomialFeatures
19
+ from sklearn.decomposition import PCA
20
+ from sklearn.cluster import KMeans
21
+ from sklearn.metrics import mean_squared_error, r2_score
22
+
23
+ # SHAP
24
+ import shap
25
+
26
+ # -------------------------
27
+ # Config & paths
28
+ # -------------------------
29
# Page setup must be the first Streamlit call of the script.
st.set_page_config(
    page_title="AI Feature Universe Explorer — Advanced + SHAP",
    layout="wide",
)

# Every generated artifact lives directly under DATA_DIR.
DATA_DIR = "/mnt/data"
CSV_PATH, META_PATH, PDF_PATH, ENSEMBLE_ARTIFACT = (
    os.path.join(DATA_DIR, artifact_name)
    for artifact_name in (
        "flatfile_universe_advanced.csv",
        "feature_metadata_advanced.json",
        "annotated_bibliography.pdf",
        "ensemble_models.joblib",
    )
)
35
+
36
+ # -------------------------
37
+ # Utility: generate advanced dataset if missing
38
+ # -------------------------
39
def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=60):
    """Generate the synthetic feature dataset plus its metadata and bibliography.

    Simulates plant sensor readings, derives engineered features (ratios,
    rolling statistics, lags, degree-2 polynomial terms, PCA components, a
    KMeans "operating mode" label, random-forest surrogate predictions and
    rule-based flags) and writes three artifacts:

    * the dataset as CSV to ``CSV_PATH``;
    * one metadata record per column as JSON to ``META_PATH``;
    * a short annotated bibliography to ``PDF_PATH`` when ``fpdf`` is usable,
      otherwise a plain-text note next to it (same name, ``.txt`` suffix).

    Args:
        n_rows: Number of rows to simulate (one row per minute of timestamp).
        random_seed: Seed passed to ``np.random.seed`` before sampling.
        max_polynomial_new: Maximum number of polynomial columns appended.

    Returns:
        ``(csv_path, meta_path, bibliography_path)``. The last element is the
        file that was actually written, i.e. the ``.txt`` fallback path when
        the PDF could not be produced.
    """
    np.random.seed(random_seed)
    os.makedirs(DATA_DIR, exist_ok=True)

    # --- base natural features across 8 use cases (expanded)
    natural_feats = [
        "vibration_x", "vibration_y", "motor_current", "rpm", "bearing_temp", "ambient_temp", "lube_pressure", "power_factor",
        "furnace_temp", "tap_temp", "slag_temp", "offgas_co", "offgas_co2", "o2_probe_pct", "c_feed_rate", "arc_power", "furnace_pressure", "feed_time",
        "mold_temp", "casting_speed", "nozzle_pressure", "cooling_water_temp", "billet_length", "chemical_C", "chemical_Mn", "chemical_Si", "chemical_S",
        "roll_speed", "motor_load", "coolant_flow", "exit_temp", "strip_thickness", "line_tension", "roller_vibration",
        "lighting_intensity", "surface_temp", "image_entropy_proxy",
        "spectro_Fe", "spectro_C", "spectro_Mn", "spectro_Si", "time_since_last_sample",
        "batch_id_numeric", "weight_input", "weight_output", "time_in_queue", "conveyor_speed",
        "shell_temp", "lining_thickness", "water_flow", "cooling_out_temp", "heat_flux",
    ]
    # dict.fromkeys drops accidental duplicates while preserving order.
    natural_feats = list(dict.fromkeys(natural_feats))

    def sample_col(name, n):
        """Draw ``n`` synthetic readings for the sensor called ``name``."""
        name_l = name.lower()
        # Specific temperature sensors are matched BEFORE the generic "*_temp"
        # rule; in the reverse order they were unreachable and every
        # temperature was drawn at furnace scale.
        if "bearing_temp" in name_l:
            return np.random.normal(65, 5, n)
        if name_l == "mold_temp":
            return np.random.normal(1500, 30, n)
        if name_l in ("shell_temp", "cooling_out_temp", "exit_temp"):
            return np.random.normal(200, 30, n)
        # NOTE(review): ambient_temp, cooling_water_temp and surface_temp have
        # no dedicated rule and still use this furnace-scale default; confirm
        # the intended ranges.
        if "furnace_temp" in name_l or "tap_temp" in name_l or name_l.endswith("_temp"):
            return np.random.normal(1550, 50, n)
        # "offgas_co2" must be tested before "offgas_co" (substring match).
        if "offgas_co2" in name_l:
            return np.abs(np.random.normal(15, 4, n))
        if "offgas_co" in name_l:
            return np.abs(np.random.normal(20, 5, n))
        if "o2" in name_l:
            return np.clip(np.random.normal(5, 1, n), 0.01, 60)
        if "arc_power" in name_l or "motor_load" in name_l:
            return np.abs(np.random.normal(600, 120, n))
        if "rpm" in name_l:
            return np.abs(np.random.normal(120, 30, n))
        if "vibration" in name_l:
            return np.abs(np.random.normal(0.4, 0.15, n))
        if "chemical" in name_l or "spectro" in name_l:
            return np.random.normal(0.7, 0.15, n)
        if "weight" in name_l:
            return np.random.normal(1000, 100, n)
        if "conveyor_speed" in name_l or "casting_speed" in name_l:
            return np.random.normal(2.5, 0.6, n)
        if "power_factor" in name_l:
            return np.clip(np.random.normal(0.92, 0.03, n), 0.6, 1.0)
        if "image_entropy_proxy" in name_l:
            return np.abs(np.random.normal(0.5, 0.25, n))
        if "batch_id" in name_l:
            return np.random.randint(1000, 9999, n)
        if "time_since" in name_l or "time_in_queue" in name_l:
            return np.abs(np.random.normal(30, 20, n))
        if "heat_flux" in name_l:
            return np.abs(np.random.normal(1000, 300, n))
        # Anything without a rule gets a standard normal.
        return np.random.normal(0, 1, n)

    # build DF
    df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})

    # timestamps & metadata ("min" is the non-deprecated alias of "T")
    start = pd.Timestamp("2025-01-01T00:00:00")
    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")
    df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
    df["meta_plant_name"] = np.random.choice(
        ["Rourkela", "Jamshedpur", "VSP", "Bokaro", "Kalinganagar", "Salem"], n_rows
    )
    df["meta_country"] = "India"

    # --- synthetic features: physics informed proxies
    df["carbon_proxy"] = df["offgas_co"] / (df["offgas_co2"] + 1.0)
    df["oxygen_utilization"] = df["offgas_co2"] / (df["offgas_co"] + 1.0)
    df["power_density"] = df["arc_power"] / (df["weight_input"] + 1.0)
    df["energy_efficiency"] = df["furnace_temp"] / (df["arc_power"] + 1.0)
    df["slag_foaming_index"] = (df["slag_temp"] * df["offgas_co"]) / (df["o2_probe_pct"] + 1.0)
    df["yield_ratio"] = df["weight_output"] / (df["weight_input"] + 1e-9)

    # rolling stats, lags, rocs for a prioritized set
    rolling_cols = ["arc_power", "furnace_temp", "offgas_co", "offgas_co2", "motor_current", "vibration_x", "weight_input"]
    for rc in rolling_cols:
        if rc in df.columns:
            df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
            df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
            # .bfill() replaces the deprecated fillna(method="bfill")
            df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
            df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)

    # interaction & polynomial-lite
    df["arc_o2_interaction"] = df["arc_power"] * df["o2_probe_pct"]
    df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
    df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)

    # polynomial features limited to first 12 numeric columns to avoid explosion
    numeric = df.select_dtypes(include=[np.number]).fillna(0)
    poly_source_cols = numeric.columns[:12].tolist()
    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    poly_mat = poly.fit_transform(numeric[poly_source_cols])
    poly_names = poly.get_feature_names_out(poly_source_cols)
    poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
    # drop the degree-1 terms (copies of the originals) and cap the new columns
    keep_poly = [c for c in poly_df.columns if c.replace("poly__", "") not in poly_source_cols]
    poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new]
    df = pd.concat([df, poly_df], axis=1)

    # PCA embeddings across numeric sensors (uses the pre-polynomial columns)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numeric)
    # Clamp so a very small dataset never requests more components/clusters
    # than it has rows or columns; the default sizes still yield 6.
    pca = PCA(n_components=min(6, *scaled.shape), random_state=42)
    pca_cols = pca.fit_transform(scaled)
    for i in range(pca_cols.shape[1]):
        df[f"pca_{i+1}"] = pca_cols[:, i]

    # KMeans cluster label for operating mode
    kmeans = KMeans(n_clusters=min(6, scaled.shape[0]), random_state=42, n_init=10)
    df["operating_mode"] = kmeans.fit_predict(scaled)

    # surrogate models to create short-horizon predicted states (fast regressors)
    # furnace_temp_next surrogate: the target is the next row's furnace_temp
    features_for_surrogate = [
        c for c in ["furnace_temp", "arc_power", "o2_probe_pct", "offgas_co", "offgas_co2"] if c in df.columns
    ]
    if len(features_for_surrogate) >= 2:
        X = df[features_for_surrogate].fillna(0)
        y = df["furnace_temp"].shift(-1).ffill()
        rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
        rf.fit(X, y)
        df["pred_temp_30s"] = rf.predict(X)
    else:
        df["pred_temp_30s"] = df["furnace_temp"]

    # surrogate for carbon proxy
    if all(c in df.columns for c in ["offgas_co", "offgas_co2", "o2_probe_pct"]):
        X2 = df[["offgas_co", "offgas_co2", "o2_probe_pct"]].fillna(0)
        rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
        rf2.fit(X2, df["carbon_proxy"])
        df["pred_carbon_5min"] = rf2.predict(X2)
    else:
        df["pred_carbon_5min"] = df["carbon_proxy"]

    # safety indices & flags
    # NOTE(review): lining_thickness has no sampling rule (standard normal), so
    # this flag is effectively always 1; confirm the intended scale.
    df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
    df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)

    # simple rule-based target action for demo
    df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
    df["prediction_confidence"] = np.clip(np.random.beta(2, 5, n_rows), 0.05, 0.99)

    # clean NaN and infinite: back-fill first, then zero whatever remains
    df = df.replace([np.inf, -np.inf], np.nan).bfill().fillna(0)

    # save CSV & metadata
    df.to_csv(CSV_PATH, index=False)

    natural_lookup = set(natural_feats)
    meta = []
    for col in df.columns:
        if col in natural_lookup:
            source = "natural"
        elif col.startswith("poly__") or col.startswith("pca_") or col in ["operating_mode"]:
            source = "advanced_synthetic"
        else:
            source = "synthetic"
        meta.append({
            "feature_name": col,
            "source_type": source,
            "linked_use_cases": ["All" if source != "natural" else "Mapped"],
            "units": "-",
            "formula": "see generator logic",
            "remarks": "auto-generated or simulated",
        })
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    # annotated bibliography: PDF when fpdf is usable, plain text otherwise
    bibliography_path = PDF_PATH
    try:
        from datetime import timezone

        from fpdf import FPDF

        pdf = FPDF('P', 'mm', 'A4')
        pdf.add_page()
        pdf.set_font("Helvetica", "B", 14)
        pdf.cell(0, 8, "Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
        pdf.ln(2)
        pdf.set_font("Helvetica", "", 10)
        pdf.cell(0, 6, "Generated: " + datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), ln=True)
        pdf.ln(4)
        bib_items = [
            ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Review of soft-sensors; supports gas proxies, lags, PCA."),
            ("Optimisation of Oxygen Blowing Process using RL", "Ojeda Roldan et al. (2022)", "RL for oxygen control; motivates surrogate predicted states & safety indices."),
            ("Analyzing the Energy Efficiency of Electric Arc Furnace", "Zhuo et al. (2024)", "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
            ("BOF/Endpoint prediction techniques", "Springer (2024)", "Endpoint prediction; supports temporal lags and cycle encoding."),
            ("Dynamic EAF modeling & slag foaming", "MacRosty et al.", "Physics priors for slag_foaming_index and refractory health modeling."),
        ]
        for title, auth, note in bib_items:
            pdf.set_font("Helvetica", "B", 11)
            # Return to the left margin before each full-width multi_cell so
            # the call never starts from wherever the previous cell ended.
            pdf.set_x(pdf.l_margin)
            # ASCII hyphen on purpose: the built-in Helvetica font cannot
            # encode an em dash, which made PDF output fail every time.
            pdf.multi_cell(0, 6, f"{title} - {auth}")
            pdf.set_font("Helvetica", "", 10)
            pdf.set_x(pdf.l_margin)
            pdf.multi_cell(0, 5, f"Notes: {note}")
            pdf.ln(2)
        pdf.output(PDF_PATH)
    except Exception as exc:
        # Best effort: the bibliography is optional, so record why the PDF is
        # missing instead of aborting dataset generation.
        bibliography_path = PDF_PATH.replace(".pdf", ".txt")
        with open(bibliography_path, "w", encoding="utf-8") as tf:
            tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
            tf.write(f"PDF generation failed: {exc!r}\n")
    return CSV_PATH, META_PATH, bibliography_path
247
+
248
+ # -------------------------
249
+ # Ensure dataset exists
250
+ # -------------------------
251
# Regenerate everything as soon as either the CSV or its metadata is absent.
if not (os.path.exists(CSV_PATH) and os.path.exists(META_PATH)):
    with st.spinner("Generating advanced feature universe (this may take ~20-60s)..."):
        generated_paths = generate_advanced_flatfile(
            n_rows=3000,
            random_seed=42,
            max_polynomial_new=80,
        )
        CSV_PATH, META_PATH, PDF_PATH = generated_paths
        st.success(f"Generated dataset and metadata: {CSV_PATH}")
255
+
256
+ # -------------------------
257
+ # Load data & metadata (cached)
258
+ # -------------------------
259
@st.cache_data
def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
    """Read the dataset CSV and its metadata JSON.

    Decorated with ``st.cache_data``. Returns ``(dataset, metadata)`` where
    both elements are DataFrames; the metadata frame is built from the JSON
    records loaded from ``meta_path``.
    """
    with open(meta_path, "r") as meta_file:
        meta_records = json.load(meta_file)
    dataset = pd.read_csv(csv_path)
    return dataset, pd.DataFrame(meta_records)
265
+
266
df, meta_df = load_data()

# -------------------------
# Sidebar filters & UI
# -------------------------
st.sidebar.title("🔎 Feature Explorer - Advanced + SHAP")
# Distinct source types, alphabetically; all of them are selected by default.
feat_types = sorted(meta_df["source_type"].unique().tolist())
selected_types = st.sidebar.multiselect(
    "Feature type",
    feat_types,
    default=feat_types,
)
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()

# -------------------------
# Main tabs
# -------------------------
st.title("Steel Authority of India Limited (SHAP-enabled)")
tab_labels = [
    "Features",
    "Visualize",
    "Correlations",
    "Stats",
    "Ensemble + SHAP",
    "Target & Business Impact",
    "Bibliography",
]
tabs = st.tabs(tab_labels)
289
+
290
+ # ----- Features tab
291
with tabs[0]:
    st.subheader("Feature metadata")
    # Keep only the rows whose source type is ticked in the sidebar.
    type_mask = meta_df["source_type"].isin(selected_types)
    filtered_meta = meta_df[type_mask]
    shown_columns = ["feature_name", "source_type", "formula", "remarks"]
    meta_view = filtered_meta[shown_columns].rename(columns={"feature_name": "Feature"})
    st.dataframe(meta_view, height=400)
    row_count, column_count = df.shape
    st.markdown(f"Total features loaded: **{column_count}** | Rows: **{row_count}**")
296
+
297
+ # ----- Visualize tab
298
with tabs[1]:
    st.subheader("Feature visualization")
    if not numeric_cols:
        # Nothing to plot; avoids indexing into an empty option list.
        st.info("No numeric features available to visualize.")
    else:
        col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
        bins = st.slider("Histogram bins", 10, 200, 50)
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[col], bins=bins, kde=True, ax=ax)
        ax.set_title(col)
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so
        # matplotlib does not keep one more open figure per rerun.
        plt.close(fig)
        st.write(df[col].describe().to_frame().T)
307
+
308
+ # ----- Correlations tab
309
with tabs[2]:
    st.subheader("Correlation explorer")
    # Slicing already copes with fewer than 20 columns; no length check needed.
    default_corr = numeric_cols[:20]
    corr_sel = st.multiselect("Select features (min 2)", numeric_cols, default=default_corr)
    if len(corr_sel) >= 2:
        corr = df[corr_sel].corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
        st.pyplot(fig)
        # Release the figure; otherwise one accumulates per script rerun.
        plt.close(fig)
    else:
        st.info("Choose at least 2 numeric features to compute correlation.")
320
+
321
+ # ----- Stats tab
322
with tabs[3]:
    st.subheader("Summary statistics (numeric features)")
    # One row per described column, every value rendered with three decimals.
    summary_by_feature = df.describe().T
    styled_summary = summary_by_feature.style.format("{:.3f}")
    st.dataframe(styled_summary, height=500)
325
+
326
+ # ----- Ensemble + SHAP tab
327
with tabs[4]:
    import warnings

    st.subheader("Ensemble modeling sandbox (fast) + SHAP explainability")

    # Feature & target selector
    target = st.selectbox(
        "Target variable",
        numeric_cols,
        index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0,
    )
    # The target is not offered as an input: selecting it would duplicate the
    # column in the training frame and leak the answer into the model.
    candidate_features = [c for c in numeric_cols if c != target]
    features = st.multiselect(
        "Model input features (select many; start with defaults)",
        candidate_features,
        default=candidate_features[:50],  # preselect up to 50 features
    )
    max_rows = min(4000, df.shape[0])
    if max_rows > 200:
        sample_size = st.slider(
            "Sample rows to use for training (speed vs fidelity)",
            min_value=200,
            max_value=max_rows,
            value=min(1000, max_rows),  # default must not exceed the maximum
            step=100,
        )
    else:
        # Too few rows for a meaningful slider range: use everything.
        sample_size = max_rows
    train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")

    if train_button:
        if not features:
            st.warning("Select at least one input feature before training.")
        else:
            with st.spinner("Preparing data and training ensemble..."):
                sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
                X = sub_df[features].fillna(0)
                y = sub_df[target].fillna(0)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                models = {
                    "Linear": LinearRegression(),
                    "RandomForest": RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1),
                    "GradientBoosting": GradientBoostingRegressor(n_estimators=150, random_state=42),
                    "ExtraTrees": ExtraTreesRegressor(n_estimators=150, random_state=42, n_jobs=-1),
                }
                preds = {}
                results = []
                for name, m in models.items():
                    m.fit(X_train, y_train)
                    p = m.predict(X_test)
                    preds[name] = p
                    results.append({"Model": name, "R2": r2_score(y_test, p), "RMSE": float(np.sqrt(mean_squared_error(y_test, p)))})
                # ensemble average
                ensemble_pred = np.column_stack(list(preds.values())).mean(axis=1)
                results.append({"Model": "EnsembleAvg", "R2": r2_score(y_test, ensemble_pred), "RMSE": float(np.sqrt(mean_squared_error(y_test, ensemble_pred)))})

            # A button is True only for the single rerun that follows the
            # click. Keeping the run in session state lets the widgets below
            # stay on screen (and usable) on later reruns.
            st.session_state["ensemble_shap_run"] = {
                "target": target,
                "models": models,
                "results": results,
                "X_train": X_train,
                "X_test": X_test,
                "y_test": y_test,
                "ensemble_pred": ensemble_pred,
                "shap_cache": {},
            }

            # save the models (lightweight)
            try:
                joblib.dump(models, ENSEMBLE_ARTIFACT)
                st.success(f"Saved ensemble models to {ENSEMBLE_ARTIFACT}")
            except OSError as save_err:
                # The in-memory models remain usable for this session.
                st.warning(f"Could not save ensemble models to {ENSEMBLE_ARTIFACT}: {save_err}")

    run = st.session_state.get("ensemble_shap_run")
    if run is None:
        st.info("Train the ensemble to see metrics and SHAP explanations.")
    else:
        models = run["models"]
        X_train = run["X_train"]
        X_test = run["X_test"]
        y_test = run["y_test"]
        ensemble_pred = run["ensemble_pred"]

        st.caption(f"Showing the most recent training run (target: {run['target']}).")
        st.dataframe(pd.DataFrame(run["results"]).set_index("Model").round(4))

        # scatter
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.scatter(y_test, ensemble_pred, alpha=0.5)
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted (Ensemble)")
        st.pyplot(fig)
        plt.close(fig)

        # ---------- SHAP explainability ----------
        st.markdown("### SHAP Explainability — pick a model to explain (Tree models recommended)")
        model_names = list(models)
        explain_model_name = st.selectbox(
            "Model to explain",
            model_names,
            index=model_names.index("RandomForest") if "RandomForest" in models else 0,
        )
        # SHAP rows are drawn from the test split, so that is the real bound.
        n_test = X_test.shape[0]
        if n_test > 50:
            explainer_sample = st.slider(
                "Number of rows to use for SHAP explanation (memory heavy)",
                min_value=50,
                max_value=n_test,
                value=min(300, n_test),
                step=10,
            )
        else:
            explainer_sample = n_test

        model_to_explain = models[explain_model_name]
        is_tree_model = explain_model_name in ("RandomForest", "ExtraTrees", "GradientBoosting")
        if explainer_sample < n_test:
            X_shap_for = X_test.sample(n=explainer_sample, random_state=42)
        else:
            X_shap_for = X_test

        # SHAP values are cached per (model, sample size) so unrelated widget
        # interactions do not recompute them on every rerun.
        shap_cache = run.setdefault("shap_cache", {})
        cache_key = (explain_model_name, explainer_sample)
        if cache_key not in shap_cache:
            with st.spinner("Computing SHAP values (this may take a while for large SHAP sample)..."):
                try:
                    if is_tree_model:
                        explainer = shap.TreeExplainer(model_to_explain)
                        computed = explainer.shap_values(X_shap_for)
                    else:
                        # fallback: use KernelExplainer on small sample (very slow)
                        explainer = shap.KernelExplainer(model_to_explain.predict, shap.sample(X_train, 100))
                        computed = explainer.shap_values(X_shap_for, nsamples=100)
                    shap_cache[cache_key] = (computed, explainer.expected_value)
                except Exception as e:
                    st.error(f"SHAP failed: {e}")

        cached = shap_cache.get(cache_key)
        if cached is not None:
            shap_values, expected_value = cached

            # summary plot
            try:
                warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
                fig_shap = plt.figure(figsize=(8, 6))
                try:
                    shap.summary_plot(shap_values, X_shap_for, show=False)
                    st.pyplot(fig_shap)
                finally:
                    plt.close(fig_shap)
                st.success("SHAP summary plotted.")
            except Exception as e:
                st.error(f"SHAP failed: {e}")

            # per-instance explanation waterfall
            st.markdown("#### Explain a single prediction (waterfall):")
            # Bound by the SHAP sample actually indexed below, not by the full
            # test split (which may be larger).
            last_row = X_shap_for.shape[0] - 1
            idx_choice = st.number_input(
                f"Row index in SHAP sample (0..{last_row})",
                min_value=0,
                max_value=last_row,
                value=0,
            )
            if is_tree_model:
                try:
                    row_pos = int(idx_choice)
                    row = X_shap_for.iloc[row_pos]

                    # Some SHAP versions wrap single-output results in a list
                    # and return the expected value as a one-element array.
                    shap_vals = shap_values[0] if isinstance(shap_values, list) else shap_values
                    exp_val = expected_value
                    if np.ndim(exp_val) > 0:
                        exp_val = np.ravel(exp_val)[0]

                    # Plot safely across SHAP versions
                    try:
                        explanation = shap.Explanation(
                            values=np.asarray(shap_vals)[row_pos],
                            base_values=exp_val,
                            data=row.to_numpy(),
                            feature_names=X_shap_for.columns.tolist(),
                        )
                        plot_obj = shap.plots.waterfall(explanation, show=False)
                        # Depending on the SHAP version this is a Figure, an
                        # Axes or None; fall back to the current figure.
                        fig2 = getattr(plot_obj, "figure", None) or plt.gcf()
                        st.pyplot(fig2)
                        plt.close(fig2)
                    except Exception as e:
                        st.warning(f"Waterfall plotting failed gracefully: {e}")
                except Exception as e:
                    st.warning(f"Could not plot waterfall: {e}")
            else:
                st.info("Per-instance waterfall not available for this model type in fallback.")
458
+
459
+
460
+ # ----- 📌 Target & Business Impact tab
461
with tabs[5]:
    st.subheader("🎯 Recommended Target Variables by Use Case")
    st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")

    # One row per use case: (use case, target, rationale, business leverage).
    target_columns = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    target_rows = [
        ["Predictive Maintenance (Mills, Motors, Compressors)", "bearing_temp / time_to_failure", "Rises before mechanical failure; early warning", "₹10–30 L per asset/year"],
        ["Blast Furnace / EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable, linked to energy and quality", "₹20–60 L/year"],
        ["Casting Quality Optimization", "defect_probability / solidification_rate", "Determines billet quality; control nozzle & cooling", "₹50 L/year yield gain"],
        ["Rolling Mill Energy Optimization", "energy_per_ton / exit_temp", "Directly tied to energy efficiency", "₹5–10 L/year per kWh/t"],
        ["Surface Defect Detection (Vision AI)", "defect_probability", "Quality metric from CNN", "1–2 % yield gain"],
        ["Material Composition & Alloy Mix AI", "deviation_from_target_grade", "Predict deviation, suggest corrections", "₹20 L/year raw material savings"],
        ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
        ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"],
    ]
    target_table = pd.DataFrame(target_rows, columns=target_columns)
    st.dataframe(target_table, use_container_width=True)

    st.markdown("---")
    st.subheader(" Business Framing for Clients")
    st.markdown("These metrics show approximate annual benefits from small process improvements.")

    # One row per KPI: (metric, typical value, value of a 5 % improvement).
    business_columns = ["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"]
    business_rows = [
        ["Energy consumption", "400 kWh/ton", "₹35–60 L"],
        ["Electrode wear", "1.8 kg/ton", "₹10 L"],
        ["Refractory wear", "3 mm/heat", "₹15 L"],
        ["Oxygen usage", "40 Nm³/ton", "₹20 L"],
        ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
    ]
    business_table = pd.DataFrame(business_rows, columns=business_columns)
    st.dataframe(business_table, use_container_width=True)
    st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
491
+
492
+ # ----- 📚 Bibliography tab
493
with tabs[6]:
    st.subheader("📚 Annotated Bibliography & Feature Justification")
    st.markdown("""
    This section summarizes published research supporting the feature design and modeling choices.
    """)

    # (title, authors / year, relevance) for each cited work.
    bib_columns = ["Paper Title", "Authors / Year", "Relevance to Feature Engineering"]
    bib_data = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Supports gas proxies, lags, PCA for off-gas and temperature correlation."),
        ("Optimisation of Oxygen Blowing Process using RL", "Ojeda Roldan et al. (2022)", "Reinforcement learning for oxygen control; motivates surrogate predicted states & safety indices."),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace", "Zhuo et al. (2024)", "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
        ("BOF/Endpoint Prediction Techniques", "Springer (2024)", "Endpoint prediction; supports temporal lags and cycle encoding."),
        ("Dynamic EAF Modeling & Slag Foaming", "MacRosty et al.", "Physics priors for slag_foaming_index and refractory health modeling."),
    ]
    bib_df = pd.DataFrame(bib_data, columns=bib_columns)
    st.dataframe(bib_df, use_container_width=True)

    st.markdown("""
    **Feature-to-Research Mapping Summary:**
    - Gas probes & soft-sensing → `carbon_proxy`, `oxygen_utilization`
    - Power & energy proxies → `power_density`, `energy_efficiency`
    - Temporal features → rolling means, lags, cycle progress indicators
    - Surrogate features → `pred_temp_30s`, `pred_carbon_5min`
    - PCA / clustering → operating mode compression
    """)

# -------------------------
# Footer / Notes
# -------------------------
st.markdown("---")
st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")