# NOTE: removed a non-code extraction artifact (file-size line, git-blame hashes
# and a dump of line numbers) that was not valid Python.
import json
import os
import time
from datetime import datetime
from datetime import timezone

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# ML imports
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# SHAP
import shap
# -------------------------
# Config & paths
# -------------------------
# Page title: repaired a mojibake character that was originally an em dash.
st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
# All generated artifacts (flat file, metadata, bibliography, saved models)
# live under one directory so they are easy to locate and clean up.
DATA_DIR = "/mnt/data"
CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
# -------------------------
# Utility: generate advanced dataset if missing
# -------------------------
def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=60):
    """
    Generate a large synthetic, physics-aligned dataset with many engineered features.

    Writes three artifacts under ``DATA_DIR``:
      * ``CSV_PATH``  -- the flat file of natural + engineered features,
      * ``META_PATH`` -- per-column metadata (JSON),
      * ``PDF_PATH``  -- a short annotated bibliography (needs fpdf; falls back to .txt).

    Parameters
    ----------
    n_rows : int
        Number of rows to generate (one row per simulated minute).
    random_seed : int
        Seed for numpy's global RNG so output is reproducible.
    max_polynomial_new : int
        Cap on the number of new degree-2 polynomial columns kept.

    Returns
    -------
    tuple[str, str, str]
        (CSV_PATH, META_PATH, PDF_PATH) as configured at module level.
    """
    np.random.seed(random_seed)
    os.makedirs(DATA_DIR, exist_ok=True)
    # --- base natural features across 8 use cases (expanded)
    natural_feats = [
        "vibration_x","vibration_y","motor_current","rpm","bearing_temp","ambient_temp","lube_pressure","power_factor",
        "furnace_temp","tap_temp","slag_temp","offgas_co","offgas_co2","o2_probe_pct","c_feed_rate","arc_power","furnace_pressure","feed_time",
        "mold_temp","casting_speed","nozzle_pressure","cooling_water_temp","billet_length","chemical_C","chemical_Mn","chemical_Si","chemical_S",
        "roll_speed","motor_load","coolant_flow","exit_temp","strip_thickness","line_tension","roller_vibration",
        "lighting_intensity","surface_temp","image_entropy_proxy",
        "spectro_Fe","spectro_C","spectro_Mn","spectro_Si","time_since_last_sample",
        "batch_id_numeric","weight_input","weight_output","time_in_queue","conveyor_speed",
        "shell_temp","lining_thickness","water_flow","cooling_out_temp","heat_flux"
    ]
    # dedupe while preserving order, in case a name was listed twice
    natural_feats = list(dict.fromkeys(natural_feats))

    def sample_col(name, n):
        """Pick a plausible sampling distribution from the column name (heuristic)."""
        name_l = name.lower()
        # BUG FIX: specific temperature channels must be tested before the
        # generic "_temp" suffix rule -- previously the generic rule ran first,
        # so these branches (and the bearing_temp one) were unreachable and
        # every *_temp column sampled around 1550.
        if name_l in ("mold_temp", "shell_temp", "cooling_out_temp", "exit_temp"):
            # mold runs near melt temperature; the others are cooling-side temps
            return np.random.normal(1500 if "mold" in name_l else 200, 30, n)
        if "bearing_temp" in name_l:
            return np.random.normal(65, 5, n)
        if "furnace_temp" in name_l or "tap_temp" in name_l or name_l.endswith("_temp"):
            # melt-zone temperatures; NOTE(review): this also catches
            # ambient_temp / surface_temp at ~1550, which looks unintended --
            # kept to match the original generic rule, confirm with the author.
            return np.random.normal(1550, 50, n)
        if "offgas_co2" in name_l:  # must precede "offgas_co" (substring match)
            return np.abs(np.random.normal(15, 4, n))
        if "offgas_co" in name_l:
            return np.abs(np.random.normal(20, 5, n))
        if "o2" in name_l:
            return np.clip(np.random.normal(5, 1, n), 0.01, 60)
        if "arc_power" in name_l or "motor_load" in name_l:
            return np.abs(np.random.normal(600, 120, n))
        if "rpm" in name_l:
            return np.abs(np.random.normal(120, 30, n))
        if "vibration" in name_l:
            return np.abs(np.random.normal(0.4, 0.15, n))
        if "chemical" in name_l or "spectro" in name_l:
            return np.random.normal(0.7, 0.15, n)
        if "weight" in name_l:
            return np.random.normal(1000, 100, n)
        if "conveyor_speed" in name_l or "casting_speed" in name_l:
            return np.random.normal(2.5, 0.6, n)
        if "power_factor" in name_l:
            return np.clip(np.random.normal(0.92, 0.03, n), 0.6, 1.0)
        if "image_entropy_proxy" in name_l:
            return np.abs(np.random.normal(0.5, 0.25, n))
        if "batch_id" in name_l:
            return np.random.randint(1000, 9999, n)
        if "time_since" in name_l or "time_in_queue" in name_l:
            return np.abs(np.random.normal(30, 20, n))
        if "heat_flux" in name_l:
            return np.abs(np.random.normal(1000, 300, n))
        # default: standard normal for anything not matched above
        return np.random.normal(0, 1, n)

    # build the frame: one sampled column per natural feature
    df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})
    # timestamps & per-row metadata (one row per simulated minute)
    start = pd.Timestamp("2025-01-01T00:00:00")
    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")  # "T" alias is deprecated
    df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
    df["meta_plant_name"] = np.random.choice(["Rourkela","Jamshedpur","VSP","Bokaro","Kalinganagar","Salem"], n_rows)
    df["meta_country"] = "India"
    # --- synthetic features: physics-informed proxies
    df["carbon_proxy"] = df["offgas_co"] / (df["offgas_co2"] + 1.0)
    df["oxygen_utilization"] = df["offgas_co2"] / (df["offgas_co"] + 1.0)
    df["power_density"] = df["arc_power"] / (df["weight_input"] + 1.0)
    df["energy_efficiency"] = df["furnace_temp"] / (df["arc_power"] + 1.0)
    df["slag_foaming_index"] = (df["slag_temp"] * df["offgas_co"]) / (df["o2_probe_pct"] + 1.0)
    df["yield_ratio"] = df["weight_output"] / (df["weight_input"] + 1e-9)
    # rolling stats, lags and rates-of-change for a prioritized set of signals
    rolling_cols = ["arc_power","furnace_temp","offgas_co","offgas_co2","motor_current","vibration_x","weight_input"]
    for rc in rolling_cols:
        if rc in df.columns:
            df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
            df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
            # .bfill()/.ffill(): fillna(method=...) is deprecated and removed in pandas 3.x
            df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
            df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)
    # interaction & polynomial-lite features
    df["arc_o2_interaction"] = df["arc_power"] * df["o2_probe_pct"]
    df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
    df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)
    # degree-2 polynomial features limited to the first 12 numeric columns to avoid explosion
    numeric = df.select_dtypes(include=[np.number]).fillna(0)
    poly_source_cols = numeric.columns[:12].tolist()
    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    poly_mat = poly.fit_transform(numeric[poly_source_cols])
    poly_names = poly.get_feature_names_out(poly_source_cols)
    poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
    # drop columns that merely duplicate the originals and cap the number of new ones
    keep_poly = [c for c in poly_df.columns if c.replace("poly__", "") not in poly_source_cols]
    if len(keep_poly) > 0:
        poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new]
    else:
        poly_df = poly_df.iloc[:, :0]
    df = pd.concat([df, poly_df], axis=1)
    # PCA embeddings across the numeric sensors
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numeric)
    pca = PCA(n_components=6, random_state=42)
    pca_cols = pca.fit_transform(scaled)
    for i in range(pca_cols.shape[1]):
        df[f"pca_{i+1}"] = pca_cols[:, i]
    # KMeans cluster label as an "operating mode" indicator
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    df["operating_mode"] = kmeans.fit_predict(scaled)
    # surrogate models create short-horizon predicted states (fast regressors)
    # furnace_temp_next surrogate
    surrogate_df = df.copy()
    surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).ffill()
    features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
    if len(features_for_surrogate) >= 2:
        X = surrogate_df[features_for_surrogate].fillna(0)
        y = surrogate_df["furnace_temp_next"]
        # RandomForestRegressor is already imported at module level; the
        # shadowing local import was removed.
        rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
        rf.fit(X, y)
        df["pred_temp_30s"] = rf.predict(X)
    else:
        df["pred_temp_30s"] = df["furnace_temp"]
    # surrogate for the carbon proxy
    if all(c in df.columns for c in ["offgas_co","offgas_co2","o2_probe_pct"]):
        X2 = df[["offgas_co","offgas_co2","o2_probe_pct"]].fillna(0)
        rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
        rf2.fit(X2, df["carbon_proxy"])
        df["pred_carbon_5min"] = rf2.predict(X2)
    else:
        df["pred_carbon_5min"] = df["carbon_proxy"]
    # safety indices & flags
    df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
    df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)
    # simple rule-based target action for the demo
    df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
    df["prediction_confidence"] = np.clip(np.random.beta(2, 5, n_rows), 0.05, 0.99)
    # clean NaN and infinities: back-fill first, then zero anything still missing
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.bfill()
    df.fillna(0, inplace=True)
    # save CSV & per-column metadata
    df.to_csv(CSV_PATH, index=False)
    meta = []
    for col in df.columns:
        if col in natural_feats:
            source = "natural"
        elif col.startswith("poly__") or col.startswith("pca_") or col in ["operating_mode"]:
            source = "advanced_synthetic"
        else:
            source = "synthetic"
        meta.append({
            "feature_name": col,
            "source_type": source,
            "linked_use_cases": ["All" if source != "natural" else "Mapped"],
            "units": "-",
            "formula": "see generator logic",
            "remarks": "auto-generated or simulated"
        })
    with open(META_PATH, "w") as f:
        json.dump(meta, f, indent=2)
    # annotated bibliography saved as a small PDF (falls back to .txt without fpdf)
    try:
        from fpdf import FPDF
        pdf = FPDF('P', 'mm', 'A4')
        pdf.add_page()
        pdf.set_font("Helvetica", "B", 14)
        pdf.cell(0, 8, "Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
        pdf.ln(2)
        pdf.set_font("Helvetica", "", 10)
        # timezone-aware replacement for the deprecated datetime.utcnow()
        pdf.cell(0, 6, "Generated: " + datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), ln=True)
        pdf.ln(4)
        bib_items = [
            ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
            ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
            ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
            ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
            ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
        ]
        for title, auth, note in bib_items:
            pdf.set_font("Helvetica", "B", 11)
            # plain ASCII hyphen: the core PDF fonts are latin-1, fancy dashes
            # can fail to encode (the original char here was mojibake)
            pdf.multi_cell(0, 6, f"{title} - {auth}")
            pdf.set_font("Helvetica", "", 10)
            pdf.multi_cell(0, 5, f"Notes: {note}")
            pdf.ln(2)
        pdf.output(PDF_PATH)
    except Exception:
        # fallback: simple text file when fpdf is unavailable or encoding fails
        with open(PDF_PATH.replace(".pdf", ".txt"), "w") as tf:
            tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
    return CSV_PATH, META_PATH, PDF_PATH
# -------------------------
# Ensure dataset exists
# -------------------------
# Regenerate the flat file and metadata on first run (or if either artifact
# has been deleted); the generator returns the same module-level paths.
if not (os.path.exists(CSV_PATH) and os.path.exists(META_PATH)):
    with st.spinner("Generating advanced feature universe (this may take ~20-60s)..."):
        CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
            n_rows=3000, random_seed=42, max_polynomial_new=80
        )
        st.success(f"Generated dataset and metadata: {CSV_PATH}")
# -------------------------
# Load data & metadata (cached)
# -------------------------
@st.cache_data
def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
    """Load the flat file and its JSON metadata; cached by Streamlit across reruns."""
    frame = pd.read_csv(csv_path)
    with open(meta_path, "r") as fh:
        records = json.load(fh)
    return frame, pd.DataFrame(records)

df, meta_df = load_data()
# -------------------------
# Sidebar filters & UI
# -------------------------
# Sidebar: provenance filter used by the Features tab, plus the list of
# numeric columns shared by the plotting/modeling tabs.
# (Removed a mojibake emoji from the title string.)
st.sidebar.title("Feature Explorer - Advanced + SHAP")
feat_types = sorted(meta_df["source_type"].unique().tolist())
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# -------------------------
# Main tabs
# -------------------------
# Page title and the seven top-level tabs the rest of the script fills in.
st.title("Steel Authority of India Limited (SHAP-enabled)")
tab_labels = [
    "Features",
    "Visualize",
    "Correlations",
    "Stats",
    "Ensemble + SHAP",
    "Target & Business Impact",
    "Bibliography",
]
tabs = st.tabs(tab_labels)
# ----- Features tab
# ----- Features tab: metadata table filtered by the sidebar's provenance choice
with tabs[0]:
    st.subheader("Feature metadata")
    visible_meta = meta_df[meta_df["source_type"].isin(selected_types)]
    display_cols = ["feature_name", "source_type", "formula", "remarks"]
    st.dataframe(
        visible_meta[display_cols].rename(columns={"feature_name": "Feature"}),
        height=400,
    )
    st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
# ----- Visualize tab
# ----- Visualize tab: histogram + KDE and summary stats for one feature
with tabs[1]:
    st.subheader("Feature visualization")
    feature_name = st.selectbox("Choose numeric feature", numeric_cols, index=0)
    n_bins = st.slider("Histogram bins", 10, 200, 50)
    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature_name], bins=n_bins, kde=True, ax=hist_ax)
    hist_ax.set_title(feature_name)
    st.pyplot(hist_fig)
    st.write(df[feature_name].describe().to_frame().T)
# ----- Correlations tab
# ----- Correlations tab: heatmap over a user-chosen subset of numeric columns
with tabs[2]:
    st.subheader("Correlation explorer")
    # slicing past the end of a short list just returns the whole list,
    # so no length check is needed for the default selection
    preset = numeric_cols[:20]
    chosen = st.multiselect("Select features (min 2)", numeric_cols, default=preset)
    if len(chosen) < 2:
        st.info("Choose at least 2 numeric features to compute correlation.")
    else:
        corr_matrix = df[chosen].corr()
        heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, cmap="coolwarm", center=0, ax=heat_ax)
        st.pyplot(heat_fig)
# ----- Stats tab
# ----- Stats tab: describe() over every numeric column, 3-decimal formatting
with tabs[3]:
    st.subheader("Summary statistics (numeric features)")
    summary = df.describe().transpose()
    st.dataframe(summary.style.format("{:.3f}"), height=500)
# ----- Ensemble + SHAP tab
# ----- Ensemble + SHAP tab
with tabs[4]:
    st.subheader("Ensemble modeling sandbox (fast) + SHAP explainability")
    # Feature & target selector
    target = st.selectbox(
        "Target variable", numeric_cols,
        index=numeric_cols.index("furnace_temp") if "furnace_temp" in numeric_cols else 0,
    )
    default_features = [c for c in numeric_cols if c != target][:50]  # preselect up to 50 features
    features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
    sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
    train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
    # NOTE(review): the widgets further below are created inside this if-block;
    # interacting with any of them re-runs the script with train_button False,
    # so the trained section disappears until the button is clicked again.
    # Kept as-is to preserve behavior -- a st.session_state refactor would fix it.
    if train_button:
        with st.spinner("Preparing data and training ensemble..."):
            sub_df = df[features + [target]].sample(n=sample_size, random_state=42)
            X = sub_df[features].fillna(0)
            y = sub_df[target].fillna(0)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # four fast baseline regressors
            models = {
                "Linear": LinearRegression(),
                "RandomForest": RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1),
                "GradientBoosting": GradientBoostingRegressor(n_estimators=150, random_state=42),
                "ExtraTrees": ExtraTreesRegressor(n_estimators=150, random_state=42, n_jobs=-1),
            }
            preds = {}
            results = []
            for name, m in models.items():
                m.fit(X_train, y_train)
                p = m.predict(X_test)
                preds[name] = p
                results.append({"Model": name, "R2": r2_score(y_test, p),
                                "RMSE": float(np.sqrt(mean_squared_error(y_test, p)))})
            # unweighted average of the four model predictions
            ensemble_pred = np.column_stack(list(preds.values())).mean(axis=1)
            results.append({"Model": "EnsembleAvg", "R2": r2_score(y_test, ensemble_pred),
                            "RMSE": float(np.sqrt(mean_squared_error(y_test, ensemble_pred)))})
            st.dataframe(pd.DataFrame(results).set_index("Model").round(4))
            # actual-vs-predicted scatter for the ensemble
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.scatter(y_test, ensemble_pred, alpha=0.5)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
            ax.set_xlabel("Actual"); ax.set_ylabel("Predicted (Ensemble)")
            st.pyplot(fig)
            # save the models (lightweight)
            joblib.dump(models, ENSEMBLE_ARTIFACT)
            st.success(f"Saved ensemble models to {ENSEMBLE_ARTIFACT}")
        # ---------- SHAP explainability ----------
        # (repaired a mojibake dash in the heading below)
        st.markdown("### SHAP Explainability — pick a model to explain (Tree models recommended)")
        explain_model_name = st.selectbox(
            "Model to explain", list(models.keys()),
            index=list(models.keys()).index("RandomForest") if "RandomForest" in models else 0,
        )
        explainer_sample = st.slider("Number of rows to use for SHAP explanation (memory heavy)", 50, min(1500, sample_size), value=300, step=50)
        model_to_explain = models[explain_model_name]
        tree_model_names = ("RandomForest", "ExtraTrees", "GradientBoosting")
        # down-sample the test set for SHAP if the user asked for fewer rows
        X_shap = X_test.copy()
        if explainer_sample < X_shap.shape[0]:
            X_shap_for = X_shap.sample(n=explainer_sample, random_state=42)
        else:
            X_shap_for = X_shap
        with st.spinner("Computing SHAP values (this may take a while for large SHAP sample)..."):
            try:
                # TreeExplainer for the tree ensembles (the previous hasattr
                # check was always true and has been dropped)
                if explain_model_name in tree_model_names:
                    explainer = shap.TreeExplainer(model_to_explain)
                    shap_values = explainer.shap_values(X_shap_for)
                    import warnings
                    warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
                    fig_shap = plt.figure(figsize=(8, 6))
                    shap.summary_plot(shap_values, X_shap_for, show=False)
                    st.pyplot(fig_shap)
                else:
                    # fallback: KernelExplainer on a small background sample (very slow)
                    explainer = shap.KernelExplainer(model_to_explain.predict, shap.sample(X_train, 100))
                    shap_values = explainer.shap_values(X_shap_for, nsamples=100)
                    fig_shap = plt.figure(figsize=(8, 6))
                    shap.summary_plot(shap_values, X_shap_for, show=False)
                    st.pyplot(fig_shap)
                st.success("SHAP summary plotted.")
            except Exception as e:
                st.error(f"SHAP failed: {e}")
        # per-instance explanation waterfall
        st.markdown("#### Explain a single prediction (waterfall):")
        # BUG FIX: the row is taken from X_shap_for (the SHAP sample), so the
        # index must range over that sample -- the original used the full test
        # set size and could raise IndexError for indices past the sample.
        idx_choice = st.number_input("Row index (0..n_shap-1)", min_value=0, max_value=X_shap_for.shape[0] - 1, value=0)
        try:
            row = X_shap_for.iloc[[idx_choice]]
            if explain_model_name in tree_model_names:
                expl = shap.TreeExplainer(model_to_explain)
                shap_vals = expl.shap_values(row)
                exp_val = expl.expected_value
                # multi-output tree models return a list/array; keep the first
                # output (the duplicated handling block was removed)
                if isinstance(exp_val, (list, np.ndarray)) and not np.isscalar(exp_val):
                    exp_val = exp_val[0]
                if isinstance(shap_vals, list):
                    shap_vals = shap_vals[0]
                # Plot safely across SHAP versions
                try:
                    explanation = shap.Explanation(
                        values=shap_vals[0],
                        base_values=exp_val,
                        data=row.iloc[0],
                        feature_names=row.columns.tolist(),
                    )
                    plot_obj = shap.plots.waterfall(explanation, show=False)
                    # some SHAP versions return an Axes instead of a Figure
                    fig2 = plot_obj.figure if hasattr(plot_obj, "figure") else plt.gcf()
                    st.pyplot(fig2)
                except Exception as e:
                    st.warning(f"Waterfall plotting failed gracefully: {e}")
            else:
                st.info("Per-instance waterfall not available for this model type in fallback.")
        except Exception as e:
            st.warning(f"Could not plot waterfall: {e}")
# ----- ๐ Target & Business Impact tab
# ----- Target & Business Impact tab
# The literal strings below repair mojibake from the original file:
# "โน" was a garbled rupee sign (₹), "โ" a garbled en dash/arrow/apostrophe,
# and "ยณ" a garbled superscript three (³).
with tabs[5]:
    st.subheader("Recommended Target Variables by Use Case")
    st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
    target_table = pd.DataFrame([
        ["Predictive Maintenance (Mills, Motors, Compressors)", "bearing_temp / time_to_failure", "Rises before mechanical failure; early warning", "₹10–30 L per asset/year"],
        ["Blast Furnace / EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable, linked to energy and quality", "₹20–60 L/year"],
        ["Casting Quality Optimization", "defect_probability / solidification_rate", "Determines billet quality; control nozzle & cooling", "₹50 L/year yield gain"],
        ["Rolling Mill Energy Optimization", "energy_per_ton / exit_temp", "Directly tied to energy efficiency", "₹5–10 L/year per kWh/t"],
        ["Surface Defect Detection (Vision AI)", "defect_probability", "Quality metric from CNN", "1–2 % yield gain"],
        ["Material Composition & Alloy Mix AI", "deviation_from_target_grade", "Predict deviation, suggest corrections", "₹20 L/year raw material savings"],
        ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
        ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"],
    ], columns=["Use Case", "Target Variable", "Why It's Ideal", "Business Leverage"])
    st.dataframe(target_table, use_container_width=True)
    st.markdown("---")
    st.subheader(" Business Framing for Clients")
    st.markdown("These metrics show approximate annual benefits from small process improvements.")
    business_table = pd.DataFrame([
        ["Energy consumption", "400 kWh/ton", "₹35–60 L"],
        ["Electrode wear", "1.8 kg/ton", "₹10 L"],
        ["Refractory wear", "3 mm/heat", "₹15 L"],
        ["Oxygen usage", "40 Nm³/ton", "₹20 L"],
        ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
    ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
    st.dataframe(business_table, use_container_width=True)
    st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
# ----- ๐ Bibliography tab
# ----- Bibliography tab
# (garbled arrow characters in the mapping summary repaired to "→";
# mojibake emoji removed from the subheader)
with tabs[6]:
    st.subheader("Annotated Bibliography & Feature Justification")
    st.markdown("""
This section summarizes published research supporting the feature design and modeling choices.
""")
    bib_data = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Supports gas proxies, lags, PCA for off-gas and temperature correlation."),
        ("Optimisation of Oxygen Blowing Process using RL", "Ojeda Roldan et al. (2022)", "Reinforcement learning for oxygen control; motivates surrogate predicted states & safety indices."),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace", "Zhuo et al. (2024)", "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
        ("BOF/Endpoint Prediction Techniques", "Springer (2024)", "Endpoint prediction; supports temporal lags and cycle encoding."),
        ("Dynamic EAF Modeling & Slag Foaming", "MacRosty et al.", "Physics priors for slag_foaming_index and refractory health modeling."),
    ]
    bib_df = pd.DataFrame(bib_data, columns=["Paper Title", "Authors / Year", "Relevance to Feature Engineering"])
    st.dataframe(bib_df, use_container_width=True)
    st.markdown("""
**Feature-to-Research Mapping Summary:**
- Gas probes & soft-sensing → `carbon_proxy`, `oxygen_utilization`
- Power & energy proxies → `power_density`, `energy_efficiency`
- Temporal features → rolling means, lags, cycle progress indicators
- Surrogate features → `pred_temp_30s`, `pred_carbon_5min`
- PCA / clustering → operating mode compression
""")
# -------------------------
# Footer / Notes
# -------------------------
# Persistent footer rendered below every tab: divider plus a standing
# disclaimer that the data is synthetic demo material.
st.markdown("---")
st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")