singhn9 commited on
Commit
18057c4
·
verified ·
1 Parent(s): 54e00e0

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +216 -737
src/streamlit_app.py CHANGED
@@ -1,5 +1,4 @@
1
-
2
-
3
  import os
4
  import json
5
  import time
@@ -12,10 +11,11 @@ import seaborn as sns
12
  import joblib
13
  import zipfile
14
  import io
 
15
 
16
  # ML imports
17
  from sklearn.model_selection import train_test_split
18
- from sklearn.linear_model import LinearRegression
19
  from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
20
  from sklearn.preprocessing import StandardScaler, PolynomialFeatures
21
  from sklearn.decomposition import PCA
@@ -25,6 +25,10 @@ from sklearn.metrics import mean_squared_error, r2_score
25
  # SHAP
26
  import shap
27
 
 
 
 
 
28
 
29
  # --- Safe defaults for Streamlit session state ---
30
  defaults = {
@@ -38,7 +42,6 @@ defaults = {
38
  for k, v in defaults.items():
39
  st.session_state.setdefault(k, v)
40
 
41
-
42
  if "llm_result" not in st.session_state:
43
  st.session_state["llm_result"] = None
44
  if "automl_summary" not in st.session_state:
@@ -51,7 +54,6 @@ if "hf_clicked" not in st.session_state:
51
  # -------------------------
52
  # Config & paths
53
  # -------------------------
54
-
55
  st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
56
  plt.style.use("seaborn-v0_8-muted")
57
  sns.set_palette("muted")
@@ -79,17 +81,13 @@ def log(msg: str):
79
  f.write(f"[{stamp}] {msg}\n")
80
  print(msg)
81
 
82
-
83
  log("=== Streamlit session started ===")
84
 
85
-
86
-
87
  if os.path.exists("/data"):
88
  st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
89
  else:
90
  st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
91
 
92
-
93
  # -------------------------
94
  # Utility: generate advanced dataset if missing
95
  # -------------------------
@@ -104,13 +102,6 @@ def generate_advanced_flatfile(
104
  Generates a large synthetic, physics-aligned dataset with many engineered features.
105
  Allows control of variability per feature (through variance_overrides) or globally
106
  (via global_variance_multiplier).
107
-
108
- Args:
109
- n_rows: number of samples
110
- random_seed: RNG seed
111
- max_polynomial_new: limit on number of polynomial expansion features
112
- global_variance_multiplier: multiplier applied to all default stddevs
113
- variance_overrides: dict mapping feature name or substring → stddev multiplier
114
  """
115
  np.random.seed(random_seed)
116
  os.makedirs(LOG_DIR, exist_ok=True)
@@ -307,37 +298,7 @@ def generate_advanced_flatfile(
307
  existing = [meta_entry]
308
  json.dump(existing, open(META_PATH, "w"), indent=2)
309
 
310
-
311
  PDF_PATH = None
312
- # annotated bibliography
313
- # try:
314
- # from fpdf import FPDF
315
- # pdf = FPDF('P','mm','A4')
316
- # pdf.add_page()
317
- # pdf.set_font("Helvetica","B",14)
318
- # pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
319
- # pdf.ln(2)
320
- # pdf.set_font("Helvetica","",10)
321
- # pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
322
- # pdf.ln(4)
323
- # bib_items = [
324
- # ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
325
- # ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
326
- # ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
327
- # ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
328
- # ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
329
- # ]
330
- # for title, auth, note in bib_items:
331
- # pdf.set_font("Helvetica","B",11)
332
- # pdf.multi_cell(0,6, f"{title} — {auth}")
333
- # pdf.set_font("Helvetica","",10)
334
- # pdf.multi_cell(0,5, f"Notes: {note}")
335
- # pdf.ln(2)
336
- # pdf.output(PDF_PATH)
337
- # except Exception as e:
338
- # with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
339
- # tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
340
-
341
  return CSV_PATH, META_PATH, PDF_PATH
342
 
343
  # -------------------------
@@ -359,10 +320,8 @@ def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
359
  return df_local, pd.DataFrame(meta_local)
360
 
361
  df, meta_df = load_data()
362
-
363
-
364
  # -------------------------
365
- # Sidebar filters & UI (FINAL ROBUST VERSION)
366
  # -------------------------
367
  st.sidebar.title("Feature Explorer - Advanced + SHAP")
368
 
@@ -370,7 +329,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
370
  """Ensure metadata dataframe matches feature count & has required columns."""
371
  required_cols = ["feature_name", "source_type", "formula", "remarks"]
372
 
373
- # If metadata missing or too short, rebuild it entirely
374
  if meta_df is None or len(meta_df) < len(df.columns):
375
  meta_df = pd.DataFrame({
376
  "feature_name": df.columns,
@@ -383,14 +341,11 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
383
  })
384
  st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
385
  else:
386
- # Ensure required columns exist
387
  for col in required_cols:
388
  if col not in meta_df.columns:
389
  meta_df[col] = None
390
- # Fill feature_name if blank or NaN
391
  if meta_df["feature_name"].isna().all():
392
  meta_df["feature_name"] = df.columns
393
- # Clip to same number of features (safety)
394
  if len(meta_df) > len(df.columns):
395
  meta_df = meta_df.iloc[: len(df.columns)]
396
 
@@ -398,7 +353,6 @@ def ensure_feature_metadata(df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataF
398
 
399
  meta_df = ensure_feature_metadata(df, meta_df)
400
 
401
- # Build sidebar safely
402
  feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
403
  selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
404
 
@@ -409,11 +363,9 @@ else:
409
 
410
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
411
 
412
-
413
  # -------------------------
414
- # Features tab (robust)
415
  # -------------------------
416
-
417
  tabs = st.tabs([
418
  "Features",
419
  "Visualization",
@@ -426,7 +378,7 @@ tabs = st.tabs([
426
  "View Logs"
427
  ])
428
 
429
-
430
  with tabs[0]:
431
  st.subheader("Feature metadata")
432
  st.dataframe(
@@ -436,24 +388,18 @@ with tabs[0]:
436
  )
437
  st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
438
 
439
-
440
- # ----- Visualize tab
441
  with tabs[1]:
442
  st.subheader("Feature Visualization")
443
  col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
444
  bins = st.slider("Histogram bins", 10, 200, 50)
445
 
446
- # --- Improved Histogram with style ---
447
  fig, ax = plt.subplots(figsize=(8, 4))
448
  sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
449
- ax.set_title(f"Distribution of {col.replace('_', ' ').title()}", fontsize=12)
450
- ax.set_xlabel(col.replace("_", " ").title(), fontsize=10)
451
- ax.set_ylabel("Frequency", fontsize=10)
452
- sns.despine()
453
  st.pyplot(fig, clear_figure=True)
454
  st.write(df[col].describe().to_frame().T)
455
 
456
- # --- Add PCA scatter visualization ---
457
  if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
458
  st.markdown("### PCA Feature Space — Colored by Operating Mode")
459
  fig2, ax2 = plt.subplots(figsize=(6, 5))
@@ -462,14 +408,9 @@ with tabs[1]:
462
  x="pca_1", y="pca_2", hue="operating_mode",
463
  palette="tab10", alpha=0.7, s=40, ax=ax2
464
  )
465
- ax2.set_title("Operating Mode Clusters (PCA Projection)", fontsize=12)
466
- ax2.set_xlabel("PCA 1")
467
- ax2.set_ylabel("PCA 2")
468
- ax2.legend(title="Operating Mode", bbox_to_anchor=(1.05, 1), loc="upper left")
469
- sns.despine()
470
  st.pyplot(fig2, clear_figure=True)
471
 
472
-
473
  # ----- Correlations tab
474
  with tabs[2]:
475
  st.subheader("Correlation explorer")
@@ -478,14 +419,9 @@ with tabs[2]:
478
  if len(corr_sel) >= 2:
479
  corr = df[corr_sel].corr()
480
  fig, ax = plt.subplots(figsize=(10,8))
481
- sns.heatmap(
482
- corr, cmap="RdBu_r", center=0, annot=True, fmt=".2f",
483
- linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax
484
- )
485
- ax.set_title("Feature Correlation Matrix", fontsize=12)
486
- sns.despine()
487
  st.pyplot(fig, clear_figure=True)
488
-
489
  else:
490
  st.info("Choose at least 2 numeric features to compute correlation.")
491
 
@@ -494,13 +430,10 @@ with tabs[3]:
494
  st.subheader("Summary statistics (numeric features)")
495
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
496
 
497
-
498
- # ----- Ensemble + SHAP tab (Expanded AutoML + Stacking + Multi-Family) -----
499
  with tabs[4]:
500
- st.subheader(" AutoML Ensemble — Expanded Families + Stacking + SHAP")
501
 
502
- # --- Step 0: High-level Use Case (keeps previous defaults) ---
503
- st.markdown("### Choose Industrial Use Case ")
504
  use_case = st.selectbox(
505
  "Select Use Case",
506
  [
@@ -516,11 +449,10 @@ with tabs[4]:
516
  index=1
517
  )
518
 
519
- # Map use-case -> defaults (same as before)
520
  use_case_config = {
521
  "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
522
  "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
523
- "Casting Quality Optimization": {"target": "surface_temp" if "surface_temp" in numeric_cols else "furnace_temp", "model_hint": "GradientBoosting"},
524
  "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
525
  "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
526
  "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
@@ -531,81 +463,44 @@ with tabs[4]:
531
  target = cfg["target"]
532
  model_hint = cfg["model_hint"]
533
 
534
- # --- Feature auto-suggestion (keeps your earlier heuristic) ---
535
  suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
536
  if len(suggested) < 6:
537
- suggested = [c for c in numeric_cols if any(k in c for k in ["temp", "power", "energy", "pressure", "yield"])]
538
  if len(suggested) < 6:
539
  suggested = numeric_cols[:50]
540
 
541
  features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
542
  st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
543
 
544
- # --- Data sampling controls ---
545
  max_rows = min(df.shape[0], 20000)
546
- sample_size = st.slider("Sample rows (train speed vs fidelity)", 500, max_rows, min(1500, max_rows), step=100)
547
-
548
  sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
549
  X = sub_df[features].fillna(0)
550
  y = sub_df[target].fillna(0)
551
 
552
- # --- Ensemble control UI ---
553
  st.markdown("### Ensemble & AutoML Settings")
554
- max_trials = st.slider("Optuna trials per family (total trials grow with families)", 5, 80, 20, step=5)
555
- top_k = st.slider("Max base models to keep in final ensemble", 2, 8, 5)
556
- allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost, TabPFN if installed)", value=True)
557
 
558
- # --- Conditional imports (graceful fallbacks) ---
559
- available_models = ["RandomForest", "ExtraTrees"] # always available (sklearn)
560
  optional_families = {}
561
  if allow_advanced:
562
  try:
563
- import xgboost as xgb
564
- optional_families["XGBoost"] = True
565
- available_models.append("XGBoost")
566
- except Exception:
567
- optional_families["XGBoost"] = False
568
  try:
569
- import lightgbm as lgb
570
- optional_families["LightGBM"] = True
571
- available_models.append("LightGBM")
572
- except Exception:
573
- optional_families["LightGBM"] = False
574
- try:
575
- import catboost as cb
576
- optional_families["CatBoost"] = True
577
- available_models.append("CatBoost")
578
- except Exception:
579
- optional_families["CatBoost"] = False
580
- try:
581
- # TabPFN is often packaged differently; attempt import but it's optional
582
- import tabpfn
583
- optional_families["TabPFN"] = True
584
- available_models.append("TabPFN")
585
- except Exception:
586
- optional_families["TabPFN"] = False
587
  try:
588
- # FT-Transformer optional
589
- from pytorch_tabular.models import transformers # may not be installed
590
- optional_families["FTTransformer"] = True
591
- available_models.append("FTTransformer")
592
- except Exception:
593
- optional_families["FTTransformer"] = False
594
 
595
  st.markdown(f"Available model families: {', '.join(available_models)}")
596
 
597
- # --- Optuna tuning routine per family ---
598
- import optuna
599
- from sklearn.model_selection import cross_val_score, KFold
600
- from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
601
- from sklearn.linear_model import Ridge
602
- from sklearn.neural_network import MLPRegressor
603
- from sklearn.metrics import r2_score, mean_squared_error
604
-
605
  def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
606
- """Tune one model family using Optuna; returns best (model_obj, cv_score, best_params)."""
607
  def obj(trial):
608
- # sample hyperparams per family
609
  if family_name == "RandomForest":
610
  n_estimators = trial.suggest_int("n_estimators", 100, 800)
611
  max_depth = trial.suggest_int("max_depth", 4, 30)
@@ -618,33 +513,21 @@ with tabs[4]:
618
  n_estimators = trial.suggest_int("n_estimators", 100, 1000)
619
  max_depth = trial.suggest_int("max_depth", 3, 12)
620
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
621
- m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0, random_state=random_state, n_jobs=1)
622
  elif family_name == "LightGBM" and optional_families.get("LightGBM"):
623
  n_estimators = trial.suggest_int("n_estimators", 100, 1000)
624
  max_depth = trial.suggest_int("max_depth", 3, 16)
625
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
626
- m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1, random_state=random_state)
627
  elif family_name == "CatBoost" and optional_families.get("CatBoost"):
628
  iterations = trial.suggest_int("iterations", 200, 1000)
629
  depth = trial.suggest_int("depth", 4, 10)
630
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
631
- m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0, random_state=random_state)
632
- elif family_name == "MLP":
633
- hidden = trial.suggest_int("hidden_layer_sizes", 32, 512, log=True)
634
- lr = trial.suggest_float("learning_rate_init", 1e-4, 1e-1, log=True)
635
- m = MLPRegressor(hidden_layer_sizes=(hidden,), learning_rate_init=lr, max_iter=500, random_state=random_state)
636
- elif family_name == "TabPFN" and optional_families.get("TabPFN"):
637
- # TabPFN often works without hyperparams exposure; return a surrogate score using quick fit
638
- # We'll call its predict_proba style API if available; as fallback use a mean score to let stacking consider it.
639
- # For tuning, just return a placeholder; we'll build model object later.
640
- return 0.0
641
  else:
642
- # fallback to a small RandomForest to avoid crashing
643
- m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state, n_jobs=-1)
644
-
645
- # use negative RMSE if better for our domain? keep R2 for generality
646
  try:
647
- scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3, n_jobs=1)
648
  return float(np.mean(scores))
649
  except Exception:
650
  return -999.0
@@ -652,636 +535,232 @@ with tabs[4]:
652
  study = optuna.create_study(direction="maximize")
653
  study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
654
  best = study.best_trial.params if study.trials else {}
655
- # instantiate best model
656
  try:
657
  if family_name == "RandomForest":
658
- model = RandomForestRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), n_jobs=-1, random_state=42)
659
  elif family_name == "ExtraTrees":
660
- model = ExtraTreesRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), n_jobs=-1, random_state=42)
661
  elif family_name == "XGBoost" and optional_families.get("XGBoost"):
662
- model = xgb.XGBRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",6), learning_rate=best.get("learning_rate",0.1), tree_method="hist", verbosity=0, random_state=42, n_jobs=1)
663
  elif family_name == "LightGBM" and optional_families.get("LightGBM"):
664
- model = lgb.LGBMRegressor(n_estimators=best.get("n_estimators",200), max_depth=best.get("max_depth",8), learning_rate=best.get("learning_rate",0.1), n_jobs=1, random_state=42)
665
  elif family_name == "CatBoost" and optional_families.get("CatBoost"):
666
- model = cb.CatBoostRegressor(iterations=best.get("iterations",200), depth=best.get("depth",6), learning_rate=best.get("learning_rate",0.1), verbose=0, random_state=42)
667
- elif family_name == "MLP":
668
- model = MLPRegressor(hidden_layer_sizes=(best.get("hidden_layer_sizes",128),), learning_rate_init=best.get("learning_rate_init",0.001), max_iter=500, random_state=42)
669
- elif family_name == "TabPFN" and optional_families.get("TabPFN"):
670
- # We'll create a small wrapper for TabPFN later on train time
671
- model = "TabPFN_placeholder"
672
  else:
673
- model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
674
  except Exception:
675
- model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
676
 
677
- # compute cross-validated score for the best model
678
  try:
679
- score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3, n_jobs=1)))
680
  except Exception:
681
  score = -999.0
 
682
 
683
- return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name, "study": study}
684
-
685
- # --- Run tuning across available families (user triggered) ---
686
- if "run_automl_clicked" not in st.session_state:
687
- st.session_state["run_automl_clicked"] = False
688
-
689
  if st.button("Run expanded AutoML + Stacking"):
690
  st.session_state["run_automl_clicked"] = True
691
-
692
  if st.session_state["run_automl_clicked"]:
693
  log("AutoML + Stacking initiated.")
694
- with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
695
  families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
696
  if allow_advanced:
697
  if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
698
  if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
699
  if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
700
- if optional_families.get("TabPFN"): families_to_try.append("TabPFN")
701
- if optional_families.get("FTTransformer"): families_to_try.append("FTTransformer")
702
 
703
  tuned_results = []
704
  for fam in families_to_try:
705
  log(f"Tuning family: {fam}")
706
  st.caption(f"Tuning family: {fam}")
707
- res = tune_family(fam, X, y, n_trials=max_trials)
708
- # res can be dict or single-run result; ensure consistent format
709
- if isinstance(res, dict) and "model_obj" in res:
710
- tuned_results.append(res)
711
- else:
712
- st.warning(f"Family {fam} returned unexpected tune result: {res}")
713
- log("All families tuned successfully.")
714
-
715
- # build leaderboard DataFrame
716
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
717
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
718
  st.markdown("### Tuning Leaderboard (by CV R²)")
719
  st.dataframe(lb[["family","cv_r2"]].round(4))
720
- # --- Bonus Visualization: Model Performance Summary ---
721
- if not lb.empty:
722
- st.markdown("#### Model Performance Summary (CV R²)")
723
- fig_perf, ax_perf = plt.subplots(figsize=(7, 4))
724
- colors = ["#2C6E91" if fam != lb.iloc[0]["family"] else "#C65F00" for fam in lb["family"]]
725
- ax_perf.barh(lb["family"], lb["cv_r2"], color=colors, alpha=0.85)
726
- ax_perf.set_xlabel("Cross-Validated R² Score", fontsize=10)
727
- ax_perf.set_ylabel("Model Family", fontsize=10)
728
- ax_perf.set_title("Performance Comparison Across Model Families", fontsize=12)
729
- ax_perf.invert_yaxis()
730
- for i, v in enumerate(lb["cv_r2"]):
731
- ax_perf.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=9)
732
- sns.despine()
733
- st.pyplot(fig_perf, clear_figure=True)
734
-
735
-
736
- # --- Build base-models and collect out-of-fold preds for stacking ---
737
  st.markdown("### Building base models & out-of-fold predictions for stacking")
738
- kf = KFold(n_splits=5, shuffle=True, random_state=42)
739
- base_models = []
740
- oof_preds = pd.DataFrame(index=X.index)
741
-
742
- for idx, row in lb.iterrows():
743
- fam = row["family"]
744
- model_entry = next((r for r in tuned_results if r["family"] == fam), None)
745
- if model_entry is None:
746
- continue
747
- model_obj = model_entry["model_obj"]
748
- # train out-of-fold predictions
749
- oof = np.zeros(X.shape[0])
750
- for tr_idx, val_idx in kf.split(X):
751
- X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
752
- y_tr = y.iloc[tr_idx]
753
- # fit family-specific wrapper (TabPFN/FTTransformer special-case)
754
- if model_obj == "TabPFN_placeholder":
755
- try:
756
- # TabPFN expects specific API; create a simple fallback: use RandomForest to approximate
757
- tmp = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
758
- tmp.fit(X_tr, y_tr)
759
- oof[val_idx] = tmp.predict(X_val)
760
- except Exception:
761
- oof[val_idx] = np.mean(y_tr)
762
- else:
763
- try:
764
- model_obj.fit(X_tr, y_tr)
765
- oof[val_idx] = model_obj.predict(X_val)
766
- except Exception:
767
- # fallback to mean
768
- oof[val_idx] = np.mean(y_tr)
769
- oof_preds[f"{fam}_oof"] = oof
770
 
771
- # finally fit model on full data
772
- try:
773
- if model_entry["model_obj"] == "TabPFN_placeholder":
774
- # fallback full-model: RandomForest
775
- fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
776
- fitted.fit(X, y)
777
- else:
778
- model_entry["model_obj"].fit(X, y)
779
- fitted = model_entry["model_obj"]
780
- except Exception:
781
- fitted = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
782
- fitted.fit(X, y)
783
 
784
- base_models.append({"family": fam, "model": fitted, "cv_r2": model_entry["cv_score"]})
 
785
 
786
- # --- prune highly correlated OOF preds and keep top_k diverse models ---
787
- if oof_preds.shape[1] == 0:
788
- st.error("No base models created — aborting stacking.")
789
- else:
790
- corr_matrix = oof_preds.corr().abs()
791
- # compute diversity score = (1 - mean correlation with others)
792
- diversity = {col: 1 - corr_matrix[col].drop(col).mean() for col in corr_matrix.columns}
793
- summary = []
794
- for bm in base_models:
795
- col = f"{bm['family']}_oof"
796
- summary.append({"family": bm["family"], "cv_r2": bm["cv_r2"], "diversity": diversity.get(col, 0.0)})
797
- summary_df = pd.DataFrame(summary).sort_values(["cv_r2", "diversity"], ascending=[False, False]).reset_index(drop=True)
798
- st.markdown("### Base Model Summary (cv_r2, diversity)")
799
- st.dataframe(summary_df.round(4))
800
-
801
- # select top_k by cv_r2 and diversity combined
802
- selected = summary_df.sort_values(["cv_r2","diversity"], ascending=[False, False]).head(top_k)["family"].tolist()
803
- st.markdown(f"Selected for stacking (top {top_k}): {selected}")
804
-
805
- # build stacking training data (OOF preds for selected)
806
- selected_cols = [f"{s}_oof" for s in selected]
807
- X_stack = oof_preds[selected_cols].fillna(0)
808
- meta = Ridge(alpha=1.0)
809
- meta.fit(X_stack, y)
810
-
811
- # --- Robust holdout evaluation & SHAP (safe for deployment) ---
812
- # Split for holdout
813
- X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
814
-
815
- # Helper to always produce scalar-safe mean
816
- def scalar_mean(arr):
817
  try:
818
- return float(np.mean(arr))
 
 
819
  except Exception:
820
- return float(np.mean(np.ravel(arr)))
821
-
822
- # Build family → model map
823
- base_model_map = {bm["family"]: bm["model"] for bm in base_models}
824
-
825
- meta_inputs = []
826
- missing_families = []
827
- n_meta_features_trained = X_stack.shape[1]
828
-
829
- # Collect predictions from each selected model
830
- for fam in selected:
831
- bm = base_model_map.get(fam)
832
- if bm is None:
833
- missing_families.append(fam)
834
- safe_mean = scalar_mean(y_tr)
835
- meta_inputs.append(np.full(len(X_val), safe_mean))
836
- continue
837
-
838
- try:
839
- preds = bm.predict(X_val)
840
- preds = np.asarray(preds)
841
- # Collapse multi-output predictions to 1D
842
- if preds.ndim == 2:
843
- preds = preds.mean(axis=1)
844
- preds = preds.reshape(-1)
845
- if preds.shape[0] != len(X_val):
846
- preds = np.full(len(X_val), scalar_mean(y_tr))
847
- meta_inputs.append(preds)
848
- except Exception as e:
849
- safe_mean = scalar_mean(y_tr)
850
- meta_inputs.append(np.full(len(X_val), safe_mean))
851
-
852
- if missing_families:
853
- st.warning(f"Missing base models: {missing_families}. Using mean predictions.")
854
-
855
- # Stack meta features
856
- if not meta_inputs:
857
- st.error("No meta features to predict — aborting.")
858
- st.stop()
859
-
860
- X_meta_val = np.column_stack(meta_inputs)
861
- n_meta_features_val = X_meta_val.shape[1]
862
-
863
- # Align meta features between training and validation
864
- if n_meta_features_val < n_meta_features_trained:
865
- pad_cols = n_meta_features_trained - n_meta_features_val
866
- safe_mean = scalar_mean(y_tr)
867
- pad = np.tile(np.full((len(X_val), 1), safe_mean), (1, pad_cols))
868
- X_meta_val = np.hstack([X_meta_val, pad])
869
- elif n_meta_features_val > n_meta_features_trained:
870
- X_meta_val = X_meta_val[:, :n_meta_features_trained]
871
-
872
- if X_meta_val.shape[1] != n_meta_features_trained:
873
- st.error(f"Stack alignment failed: {X_meta_val.shape[1]} != {n_meta_features_trained}")
874
- st.stop()
875
-
876
- # Meta prediction
877
- y_meta_pred = meta.predict(pd.DataFrame(X_meta_val, columns=X_stack.columns))
878
-
879
- # Final evaluation
880
- final_r2 = r2_score(y_val, y_meta_pred)
881
- final_rmse = float(np.sqrt(mean_squared_error(y_val, y_meta_pred)))
882
- st.success("AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
883
- log(f"Completed stacking. Final R2={final_r2:.4f}, RMSE={final_rmse:.4f}")
884
- # ===============================
885
- # OPERATOR ADVISORY SYSTEM
886
- # ===============================
887
- st.markdown("---")
888
- st.subheader("Operator Advisory System — Real-Time Shift Recommendations")
889
-
890
- try:
891
- # Use top base model already identified
892
- top_base = next((b for b in base_models if b["family"] == selected[0]), None)
893
- if top_base and hasattr(top_base["model"], "predict"):
894
- sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
895
- model = top_base["model"]
896
-
897
- # SHAP direction analysis
898
- expl = shap.TreeExplainer(model)
899
- shap_vals = expl.shap_values(sample_X)
900
-
901
- # --- Normalize SHAP output structure (handles list, ndarray, or multi-dim cases) ---
902
- if isinstance(shap_vals, list): # e.g., for multiclass models
903
- shap_vals = shap_vals[0]
904
-
905
- shap_vals = np.array(shap_vals)
906
-
907
- # If SHAP output has >2 dims, reduce to (n_samples, n_features)
908
- if shap_vals.ndim > 2:
909
- shap_vals = shap_vals.reshape(shap_vals.shape[0], -1)
910
-
911
- # Align SHAP features to DataFrame
912
- if shap_vals.shape[1] != sample_X.shape[1]:
913
- min_feats = min(shap_vals.shape[1], sample_X.shape[1])
914
- shap_vals = shap_vals[:, :min_feats]
915
- sample_X = sample_X.iloc[:, :min_feats]
916
-
917
- # Compute robust means
918
- mean_abs = np.abs(shap_vals).mean(axis=0)
919
- mean_sign = np.sign(shap_vals).mean(axis=0)
920
-
921
- importance = pd.DataFrame({
922
- "Feature": sample_X.columns,
923
- "Mean |SHAP|": mean_abs,
924
- "Mean SHAP Sign": mean_sign
925
- }).sort_values("Mean |SHAP|", ascending=False)
926
-
927
-
928
- # Display Top 5 Drivers
929
- st.markdown("### Top 5 Operational Drivers Influencing Target")
930
- st.dataframe(importance.head(5).style.format({"Mean |SHAP|": "{:.3f}", "Mean SHAP Sign": "{:.3f}"}))
931
-
932
- # Direction-based recommendations
933
- recommendations = []
934
- for _, row in importance.head(5).iterrows():
935
- f = row["Feature"]
936
- s = row["Mean SHAP Sign"]
937
- if s > 0.05:
938
- recommendations.append(f"Increase `{f}` likely increases `{target}`")
939
- elif s < -0.05:
940
- recommendations.append(f"Decrease `{f}` likely increases `{target}`")
941
- else:
942
- recommendations.append(f" `{f}` is neutral or nonlinear for `{target}`")
943
-
944
- st.markdown("### Suggested Operator Adjustments (Model-Inferred)")
945
- st.write("\n".join(recommendations))
946
-
947
- # Delta recommendations vs previous shift
948
- prev_shift = df.tail(200).mean(numeric_only=True)
949
- recommended_shift = prev_shift.copy()
950
- for rec in recommendations:
951
- if "Increase" in rec:
952
- name = rec.split('`')[1]
953
- if name in recommended_shift:
954
- recommended_shift[name] *= 1.03 # +3%
955
- elif "Decrease" in rec:
956
- name = rec.split('`')[1]
957
- if name in recommended_shift:
958
- recommended_shift[name] *= 0.97 # -3%
959
-
960
- # Delta table
961
- st.markdown("### Shift Adjustment Summary (vs Previous 200 Samples)")
962
- deltas = pd.DataFrame({
963
- "Current Avg": prev_shift,
964
- "Suggested": recommended_shift,
965
- "Δ (%)": ((recommended_shift - prev_shift) / prev_shift * 100)
966
- }).loc[[r.split('`')[1] for r in recommendations if '`' in r]].round(2)
967
-
968
- st.dataframe(deltas.fillna(0).style.format("{:.2f}"))
969
- log("Operator advisory system executed successfully.")
970
-
971
- # Optional: LLM-generated human-friendly summary
972
- st.markdown("### Natural Language Operator Note")
973
- try:
974
- import importlib.util
975
- if importlib.util.find_spec("transformers"):
976
- from transformers import pipeline
977
- tiny_llm_path = os.path.join(LOG_DIR, "cached_tiny_llm")
978
- if os.path.exists(os.path.join(tiny_llm_path, "config.json")):
979
- from transformers import AutoModelForCausalLM, AutoTokenizer
980
- model = AutoModelForCausalLM.from_pretrained(tiny_llm_path)
981
- tokenizer = AutoTokenizer.from_pretrained(tiny_llm_path)
982
- assistant = pipeline("text-generation", model=model, tokenizer=tokenizer)
983
- else:
984
- assistant = pipeline("text2text-generation", model="google/flan-t5-small")
985
-
986
-
987
-
988
- llm_prompt = f"""
989
- You are a metallurgical process advisor working in a steel manufacturing unit.
990
- Based on these recommendations:
991
- {recommendations}
992
- and these shift averages:
993
- {deltas.to_dict(orient='index')}
994
- Write a concise 3-line message to the operator suggesting what to adjust this shift.
995
- """
996
- resp = assistant(llm_prompt, max_new_tokens=120)[0]["generated_text"]
997
- st.info(resp)
998
- log("Operator LLM advisory note generated successfully.")
999
- else:
1000
- st.warning("Transformers not available — install it for text generation.")
1001
- except Exception as e:
1002
- st.warning(f"LLM advisory generation skipped: {e}")
1003
-
1004
- else:
1005
- st.info("No suitable model found for operator advisory system.")
1006
- except Exception as e:
1007
- st.error(f"Operator advisory system failed: {e}")
1008
- log(f"Operator advisory error: {e}")
1009
-
1010
-
1011
-
1012
- c1, c2 = st.columns(2)
1013
- c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
1014
- c2.metric("Stacked Ensemble RMSE (holdout)", f"{final_rmse:.4f}")
1015
-
1016
- # Scatter comparison
1017
- fig, ax = plt.subplots(figsize=(7, 4))
1018
- ax.scatter(y_val, y_meta_pred, alpha=0.6)
1019
- ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
1020
- ax.set_xlabel("Actual")
1021
- ax.set_ylabel("Stacked Predicted")
1022
- st.pyplot(fig)
1023
-
1024
- # Save trained stack artifacts
1025
- joblib.dump(meta, ENSEMBLE_PATH)
1026
- st.caption(f"Stacked ensemble snapshot updated → {ENSEMBLE_PATH}")
1027
- log(f"Ensemble model updated for use case: {use_case}")
1028
-
1029
-
1030
- # Explainability
1031
- st.markdown("### Explainability (approximate)")
1032
- try:
1033
- top_base = next((b for b in base_models if b["family"] == selected[0]), None)
1034
- if top_base and hasattr(top_base["model"], "predict"):
1035
- sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
1036
- if any(k in top_base["family"] for k in ["XGBoost", "LightGBM", "RandomForest", "ExtraTrees", "CatBoost"]):
1037
- expl = shap.TreeExplainer(top_base["model"])
1038
- shap_vals = expl.shap_values(sample_X)
1039
- fig_sh = plt.figure(figsize=(8, 6))
1040
- shap.summary_plot(shap_vals, sample_X, show=False)
1041
- st.pyplot(fig_sh)
1042
- else:
1043
- st.info("Top model not tree-based; skipping SHAP summary.")
1044
  else:
1045
- st.info("No suitable base model for SHAP explanation.")
1046
- except Exception as e:
1047
- st.warning(f"SHAP computation skipped: {e}")
1048
-
1049
- st.success(" AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
1050
-
1051
- # --- Store AutoML summary for optional LLM advisory ---
1052
- if "automl_summary" not in st.session_state:
1053
- st.session_state["automl_summary"] = {
1054
- "leaderboard": lb[["family", "cv_r2"]].round(4).to_dict(orient="records"),
1055
- "final_r2": float(final_r2),
1056
- "final_rmse": float(final_rmse),
1057
- "target": target,
1058
- "use_case": use_case
1059
- }
1060
  else:
1061
- # Always refresh with latest metrics after each run
1062
- st.session_state["automl_summary"].update({
1063
- "leaderboard": lb[["family", "cv_r2"]].round(4).to_dict(orient="records"),
1064
- "final_r2": float(final_r2),
1065
- "final_rmse": float(final_rmse),
1066
- "target": target,
1067
- "use_case": use_case
1068
- })
1069
-
1070
- # Persist SHAP-based recommendations for reuse across reruns
1071
- if "shap_recommendations" not in st.session_state:
1072
- st.session_state["shap_recommendations"] = recommendations
1073
- else:
1074
- st.session_state["shap_recommendations"] = recommendations
1075
-
1076
- # --- AI Recommendation Assistant ---
1077
- st.markdown("---")
1078
- st.subheader("AI Recommendation Assistant ")
1079
- st.caption("Generates quick local AI suggestions — no file writes required.")
1080
-
1081
- # Create or reset button states safely
1082
- if "hf_clicked" not in st.session_state:
1083
- st.session_state["hf_clicked"] = False
1084
- if "llm_result" not in st.session_state:
1085
- st.session_state["llm_result"] = None
1086
-
1087
- # --- Buttons ---
1088
- col1, col2 = st.columns(2)
1089
- # Click handlers with isolated session flags
1090
- if col1.button("Get AI Recommendation (via HF API)", key="ai_reco"):
1091
- st.session_state["hf_clicked"] = True
1092
- st.session_state["hf_ran_once"] = False # reset internal control
1093
-
1094
- if col2.button("Reset Recommendation Output"):
1095
- st.session_state["hf_clicked"] = False
1096
- st.session_state["llm_result"] = None
1097
- st.session_state["hf_ran_once"] = False
1098
- st.info("Recommendation output cleared.")
1099
-
1100
- # Execute API call only once
1101
- if st.session_state["hf_clicked"] and not st.session_state.get("hf_ran_once", False):
1102
- summary = st.session_state.get("automl_summary", {})
1103
- if not summary:
1104
- st.warning("Please run AutoML first to generate context.")
1105
- else:
1106
- try:
1107
- import requests, json
1108
- st.info("Contacting Hugging Face Inference API (Mixtral-8x7B-Instruct)…")
1109
-
1110
- API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
1111
- headers = {"Authorization": f"Bearer {st.secrets['HF_TOKEN']}"}
1112
- prompt = f"""
1113
- You are an ML model tuning advisor.
1114
- Based on this AutoML summary, suggest 3 concise, actionable steps
1115
- to improve model performance if overfitting, underfitting, or data-quality issues are observed.
1116
-
1117
- Use case: {summary.get('use_case')}
1118
- Target: {summary.get('target')}
1119
- Final R²: {summary.get('final_r2')}
1120
- Final RMSE: {summary.get('final_rmse')}
1121
- Leaderboard: {summary.get('leaderboard')}
1122
- """
1123
-
1124
- payload = {"inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.7}}
1125
- response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
1126
- response.raise_for_status()
1127
- result = response.json()
1128
-
1129
- if isinstance(result, list) and "generated_text" in result[0]:
1130
- text = result[0]["generated_text"]
1131
- elif isinstance(result, dict) and "generated_text" in result:
1132
- text = result["generated_text"]
1133
- else:
1134
- text = json.dumps(result, indent=2)
1135
-
1136
- st.session_state["llm_result"] = text.strip()
1137
- st.session_state["hf_ran_once"] = True
1138
- st.success("✅ AI Recommendation (Mixtral-8x7B-Instruct):")
1139
- st.markdown(st.session_state["llm_result"])
1140
-
1141
- except Exception as e:
1142
- st.error(f"HF Inference API call failed: {e}")
1143
-
1144
-
1145
-
1146
- # --- Always display cached result, even on rerun ---
1147
- if st.session_state["llm_result"]:
1148
- st.markdown("### Cached AI Recommendation:")
1149
- st.markdown(st.session_state["llm_result"])
1150
 
1151
-
1152
- # ----- Target & Business Impact tab
1153
  with tabs[5]:
1154
- st.subheader("Recommended Target Variables by Use Case")
1155
- st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
1156
-
1157
  target_table = pd.DataFrame([
1158
- ["Predictive Maintenance (Mills, Motors, Compressors)", "bearing_temp / time_to_failure", "Rises before mechanical failure; early warning", "₹1030 L per asset/year"],
1159
- ["Blast Furnace / EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable, linked to energy and quality", "₹20–60 L/year"],
1160
- ["Casting Quality Optimization", "defect_probability / solidification_rate", "Determines billet quality; control nozzle & cooling", "₹50 L/year yield gain"],
1161
- ["Rolling Mill Energy Optimization", "energy_per_ton / exit_temp", "Directly tied to energy efficiency", "₹5–10 L/year per kWh/t"],
1162
- ["Surface Defect Detection (Vision AI)", "defect_probability", "Quality metric from CNN", "1–2 % yield gain"],
1163
- ["Material Composition & Alloy Mix AI", "deviation_from_target_grade", "Predict deviation, suggest corrections", "₹20 L/year raw material savings"],
1164
- ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
1165
- ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"]], columns=["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"])
1166
-
1167
- st.dataframe(target_table, width="stretch")
1168
-
1169
- st.markdown("---")
1170
- st.subheader("Business Framing for Clients")
1171
- st.markdown("These metrics show approximate annual benefits from small process improvements.")
1172
-
1173
- business_table = pd.DataFrame([
1174
- ["Energy consumption", "400 kWh/ton", "₹35–60 L"],
1175
- ["Electrode wear", "1.8 kg/ton", "₹10 L"],
1176
- ["Refractory wear", "3 mm/heat", "₹15 L"],
1177
- ["Oxygen usage", "40 Nm³/ton", "₹20 L"],
1178
- ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
1179
- ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
1180
-
1181
- st.dataframe(business_table, width="stretch")
1182
- st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
1183
-
1184
- # ----- Bibliography tab
1185
  with tabs[6]:
1186
- st.subheader("Annotated Bibliography — Justification for Target Variables")
1187
- st.markdown("""
1188
- These papers justify the chosen target variables (temperature, yield, efficiency, refractory wear)
1189
- in metallurgical AI modeling. Click any title to open the official paper.
1190
- """)
1191
-
1192
- bib_data = [
1193
- {
1194
- "title": "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
1195
- "authors": "Yan et al. (2024)",
1196
- "notes": "Soft sensors for furnace and tap temperature; validates `furnace_temp` and `tap_temp` targets.",
1197
- "url": "https://doi.org/10.1021/acsomega.4c01254"
1198
- },
1199
- {
1200
- "title": "Optimisation of Operator Support Systems through Artificial Intelligence for the Cast Steel Industry",
1201
- "authors": "Ojeda Roldán et al. (2022)",
1202
- "notes": "Reinforcement learning for oxygen blowing and endpoint control; supports temperature and carbon targets.",
1203
- "url": "https://doi.org/10.3390/jmmp6020034"
1204
- },
1205
- {
1206
- "title": "Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking",
1207
- "authors": "Zhuo et al. (2024)",
1208
- "notes": "Links arc power, temperature, and energy KPIs — validates `energy_efficiency` and `power_density`.",
1209
- "url": "https://doi.org/10.3390/met15010113"
1210
- },
1211
- {
1212
- "title": "Dynamic EAF Modeling and Slag Foaming Index Prediction",
1213
- "authors": "MacRosty et al.",
1214
- "notes": "Supports refractory and heat-flux-based wear prediction — validates `lining_thickness` target.",
1215
- "url": "https://www.sciencedirect.com/science/article/pii/S0921883123004019"
1216
- },
1217
- {
1218
- "title": "Machine Learning for Yield Optimization in Continuous Casting",
1219
- "authors": "Springer (2023)",
1220
- "notes": "ML for yield ratio and defect minimization; supports `yield_ratio` target.",
1221
- "url": "https://link.springer.com/article/10.1007/s40964-023-00592-7"
1222
- }
1223
  ]
1224
-
1225
- bib_df = pd.DataFrame(bib_data)
1226
- bib_df["Paper Title"] = bib_df.apply(lambda x: f"[{x['title']}]({x['url']})", axis=1)
1227
-
1228
- st.markdown("### Annotated Bibliography — Justification for Target Variables")
1229
-
1230
- for _, row in bib_df.iterrows():
1231
- st.markdown(
1232
- f"**[{row['title']}]({row['url']})** \n"
1233
- f"*{row['authors']}* \n"
1234
- f" _{row['notes']}_ \n",
1235
- unsafe_allow_html=True
1236
- )
1237
- st.info("Click any paper title above to open it in a new tab.")
1238
-
1239
-
1240
- st.markdown("""
1241
- **Feature ↔ Target Justification**
1242
- - `furnace_temp`, `tap_temp` → Process temperature (Yan 2024, Ojeda 2022)
1243
- - `yield_ratio` → Production yield (Springer 2023)
1244
- - `energy_efficiency`, `power_density` → Energy KPIs (Zhuo 2024)
1245
- - `lining_thickness`, `slag_foaming_index` → Refractory & process health (MacRosty et al.)
1246
- """)
1247
-
1248
- st.info("Click any paper title above to open it in a new tab.")
1249
- log("Bibliography tab rendered successfully.")
1250
-
1251
- # -------------------------
1252
- # Footer / Notes
1253
- # -------------------------
1254
- st.markdown("---")
1255
- st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
1256
-
1257
-
1258
- # ----- Download tab
1259
- with tabs[-2]:
1260
- st.subheader(" Download Saved Files (Flat Log Mode)")
1261
-
1262
- available_files = [f for f in os.listdir(LOG_DIR) if os.path.isfile(os.path.join(LOG_DIR, f))]
1263
- if not available_files:
1264
- st.info("No files found yet — run AutoML once to generate outputs.")
1265
  else:
1266
- for f in sorted(available_files):
1267
  path = os.path.join(LOG_DIR, f)
1268
- with open(path, "rb") as fp:
1269
- st.download_button(
1270
- label=f" Download {f}",
1271
- data=fp,
1272
- file_name=f,
1273
- mime="application/octet-stream"
1274
- )
1275
 
1276
-
1277
-
1278
- # ----- Logs tab
1279
- with tabs[-1]:
1280
- st.subheader(" Master Log (append-in-place)")
1281
  if os.path.exists(LOG_PATH):
1282
- with open(LOG_PATH, "r", encoding="utf-8") as f:
1283
- content = f.read()
1284
- st.text_area("Master Log Output", content, height=400)
1285
- st.download_button("Download Log", content, file_name="run_master.log")
1286
  else:
1287
- st.info("No log file yet — run AutoML once to start logging.")
 
 
 
 
1
+ # sail_modex_stable.py
 
2
  import os
3
  import json
4
  import time
 
11
  import joblib
12
  import zipfile
13
  import io
14
+ import gc
15
 
16
  # ML imports
17
  from sklearn.model_selection import train_test_split
18
+ from sklearn.linear_model import LinearRegression, Ridge
19
  from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
20
  from sklearn.preprocessing import StandardScaler, PolynomialFeatures
21
  from sklearn.decomposition import PCA
 
25
  # SHAP
26
  import shap
27
 
28
+ # Optuna (used later)
29
+ import optuna
30
+ from sklearn.model_selection import cross_val_score, KFold
31
+ from sklearn.neural_network import MLPRegressor
32
 
33
  # --- Safe defaults for Streamlit session state ---
34
  defaults = {
 
42
  for k, v in defaults.items():
43
  st.session_state.setdefault(k, v)
44
 
 
45
  if "llm_result" not in st.session_state:
46
  st.session_state["llm_result"] = None
47
  if "automl_summary" not in st.session_state:
 
54
  # -------------------------
55
  # Config & paths
56
  # -------------------------
 
57
  st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
58
  plt.style.use("seaborn-v0_8-muted")
59
  sns.set_palette("muted")
 
81
  f.write(f"[{stamp}] {msg}\n")
82
  print(msg)
83
 
 
84
  log("=== Streamlit session started ===")
85
 
 
 
86
  if os.path.exists("/data"):
87
  st.sidebar.success(f" Using persistent storage | Logs directory: {LOG_DIR}")
88
  else:
89
  st.sidebar.warning(f" Using ephemeral storage | Logs directory: {LOG_DIR}. Data will be lost on rebuild.")
90
 
 
91
  # -------------------------
92
  # Utility: generate advanced dataset if missing
93
  # -------------------------
 
102
  Generates a large synthetic, physics-aligned dataset with many engineered features.
103
  Allows control of variability per feature (through variance_overrides) or globally
104
  (via global_variance_multiplier).
 
 
 
 
 
 
 
105
  """
106
  np.random.seed(random_seed)
107
  os.makedirs(LOG_DIR, exist_ok=True)
 
298
  existing = [meta_entry]
299
  json.dump(existing, open(META_PATH, "w"), indent=2)
300
 
 
301
  PDF_PATH = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  return CSV_PATH, META_PATH, PDF_PATH
303
 
304
  # -------------------------
 
320
  return df_local, pd.DataFrame(meta_local)
321
 
322
  df, meta_df = load_data()
 
 
323
  # -------------------------
324
+ # Sidebar filters & UI
325
  # -------------------------
326
  st.sidebar.title("Feature Explorer - Advanced + SHAP")
327
 
 
329
  """Ensure metadata dataframe matches feature count & has required columns."""
330
  required_cols = ["feature_name", "source_type", "formula", "remarks"]
331
 
 
332
  if meta_df is None or len(meta_df) < len(df.columns):
333
  meta_df = pd.DataFrame({
334
  "feature_name": df.columns,
 
341
  })
342
  st.sidebar.warning("Metadata was summary-only — rebuilt feature-level metadata.")
343
  else:
 
344
  for col in required_cols:
345
  if col not in meta_df.columns:
346
  meta_df[col] = None
 
347
  if meta_df["feature_name"].isna().all():
348
  meta_df["feature_name"] = df.columns
 
349
  if len(meta_df) > len(df.columns):
350
  meta_df = meta_df.iloc[: len(df.columns)]
351
 
 
353
 
354
  meta_df = ensure_feature_metadata(df, meta_df)
355
 
 
356
  feat_types = sorted(meta_df["source_type"].dropna().unique().tolist())
357
  selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
358
 
 
363
 
364
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
365
 
 
366
  # -------------------------
367
+ # Tabs layout
368
  # -------------------------
 
369
  tabs = st.tabs([
370
  "Features",
371
  "Visualization",
 
378
  "View Logs"
379
  ])
380
 
381
+ # ----- Feature metadata
382
  with tabs[0]:
383
  st.subheader("Feature metadata")
384
  st.dataframe(
 
388
  )
389
  st.markdown(f"Total features loaded: **{df.shape[1]}** | Rows: **{df.shape[0]}**")
390
 
391
+ # ----- Visualization tab
 
392
  with tabs[1]:
393
  st.subheader("Feature Visualization")
394
  col = st.selectbox("Choose numeric feature", numeric_cols, index=0)
395
  bins = st.slider("Histogram bins", 10, 200, 50)
396
 
 
397
  fig, ax = plt.subplots(figsize=(8, 4))
398
  sns.histplot(df[col], bins=bins, kde=True, ax=ax, color="#2C6E91", alpha=0.8)
399
+ ax.set_title(f"Distribution of {col}", fontsize=12)
 
 
 
400
  st.pyplot(fig, clear_figure=True)
401
  st.write(df[col].describe().to_frame().T)
402
 
 
403
  if all(x in df.columns for x in ["pca_1", "pca_2", "operating_mode"]):
404
  st.markdown("### PCA Feature Space — Colored by Operating Mode")
405
  fig2, ax2 = plt.subplots(figsize=(6, 5))
 
408
  x="pca_1", y="pca_2", hue="operating_mode",
409
  palette="tab10", alpha=0.7, s=40, ax=ax2
410
  )
411
+ ax2.set_title("Operating Mode Clusters (PCA Projection)")
 
 
 
 
412
  st.pyplot(fig2, clear_figure=True)
413
 
 
414
  # ----- Correlations tab
415
  with tabs[2]:
416
  st.subheader("Correlation explorer")
 
419
  if len(corr_sel) >= 2:
420
  corr = df[corr_sel].corr()
421
  fig, ax = plt.subplots(figsize=(10,8))
422
+ sns.heatmap(corr, cmap="RdBu_r", center=0, annot=True, fmt=".2f",
423
+ linewidths=0.5, cbar_kws={"shrink": 0.7}, ax=ax)
 
 
 
 
424
  st.pyplot(fig, clear_figure=True)
 
425
  else:
426
  st.info("Choose at least 2 numeric features to compute correlation.")
427
 
 
430
  st.subheader("Summary statistics (numeric features)")
431
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
432
 
433
+ # ----- AutoML + SHAP tab (Expanded)
 
434
  with tabs[4]:
435
+ st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
436
 
 
 
437
  use_case = st.selectbox(
438
  "Select Use Case",
439
  [
 
449
  index=1
450
  )
451
 
 
452
  use_case_config = {
453
  "Predictive Maintenance": {"target": "bearing_temp", "model_hint": "RandomForest"},
454
  "EAF Data Intelligence": {"target": "furnace_temp", "model_hint": "GradientBoosting"},
455
+ "Casting Quality Optimization": {"target": "surface_temp", "model_hint": "GradientBoosting"},
456
  "Rolling Mill Energy Optimization": {"target": "energy_efficiency", "model_hint": "ExtraTrees"},
457
  "Surface Defect Detection (Vision AI)": {"target": "image_entropy_proxy", "model_hint": "GradientBoosting"},
458
  "Material Composition & Alloy Mix AI": {"target": "chemical_C", "model_hint": "RandomForest"},
 
463
  target = cfg["target"]
464
  model_hint = cfg["model_hint"]
465
 
 
466
  suggested = [c for c in numeric_cols if any(k in c for k in target.split('_'))]
467
  if len(suggested) < 6:
468
+ suggested = [c for c in numeric_cols if any(k in c for k in ["temp","power","energy","pressure","yield"])]
469
  if len(suggested) < 6:
470
  suggested = numeric_cols[:50]
471
 
472
  features = st.multiselect("Model input features (auto-suggested)", numeric_cols, default=suggested)
473
  st.markdown(f"Auto target: `{target}` · Suggested family hint: `{model_hint}`")
474
 
 
475
  max_rows = min(df.shape[0], 20000)
476
+ sample_size = st.slider("Sample rows", 500, max_rows, min(1500, max_rows), step=100)
 
477
  sub_df = df[features + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
478
  X = sub_df[features].fillna(0)
479
  y = sub_df[target].fillna(0)
480
 
 
481
  st.markdown("### Ensemble & AutoML Settings")
482
+ max_trials = st.slider("Optuna trials per family", 5, 80, 20, step=5)
483
+ top_k = st.slider("Max base models in ensemble", 2, 8, 5)
484
+ allow_advanced = st.checkbox("Include advanced families (XGBoost, LightGBM, CatBoost)", value=True)
485
 
486
+ available_models = ["RandomForest", "ExtraTrees"]
 
487
  optional_families = {}
488
  if allow_advanced:
489
  try:
490
+ import xgboost as xgb; optional_families["XGBoost"] = True; available_models.append("XGBoost")
491
+ except Exception: optional_families["XGBoost"] = False
 
 
 
492
  try:
493
+ import lightgbm as lgb; optional_families["LightGBM"] = True; available_models.append("LightGBM")
494
+ except Exception: optional_families["LightGBM"] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  try:
496
+ import catboost as cb; optional_families["CatBoost"] = True; available_models.append("CatBoost")
497
+ except Exception: optional_families["CatBoost"] = False
 
 
 
 
498
 
499
  st.markdown(f"Available model families: {', '.join(available_models)}")
500
 
 
 
 
 
 
 
 
 
501
  def tune_family(family_name, X_local, y_local, n_trials=20, random_state=42):
502
+ """Tune one model family using Optuna."""
503
  def obj(trial):
 
504
  if family_name == "RandomForest":
505
  n_estimators = trial.suggest_int("n_estimators", 100, 800)
506
  max_depth = trial.suggest_int("max_depth", 4, 30)
 
513
  n_estimators = trial.suggest_int("n_estimators", 100, 1000)
514
  max_depth = trial.suggest_int("max_depth", 3, 12)
515
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
516
+ m = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, tree_method="hist", verbosity=0)
517
  elif family_name == "LightGBM" and optional_families.get("LightGBM"):
518
  n_estimators = trial.suggest_int("n_estimators", 100, 1000)
519
  max_depth = trial.suggest_int("max_depth", 3, 16)
520
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
521
+ m = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, n_jobs=1)
522
  elif family_name == "CatBoost" and optional_families.get("CatBoost"):
523
  iterations = trial.suggest_int("iterations", 200, 1000)
524
  depth = trial.suggest_int("depth", 4, 10)
525
  lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
526
+ m = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0)
 
 
 
 
 
 
 
 
 
527
  else:
528
+ m = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=random_state)
 
 
 
529
  try:
530
+ scores = cross_val_score(m, X_local, y_local, scoring="r2", cv=3)
531
  return float(np.mean(scores))
532
  except Exception:
533
  return -999.0
 
535
  study = optuna.create_study(direction="maximize")
536
  study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
537
  best = study.best_trial.params if study.trials else {}
 
538
  try:
539
  if family_name == "RandomForest":
540
+ model = RandomForestRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
541
  elif family_name == "ExtraTrees":
542
+ model = ExtraTreesRegressor(**{**{"random_state":42,"n_jobs":-1}, **best})
543
  elif family_name == "XGBoost" and optional_families.get("XGBoost"):
544
+ model = xgb.XGBRegressor(**{**{"verbosity":0,"tree_method":"hist"}, **best})
545
  elif family_name == "LightGBM" and optional_families.get("LightGBM"):
546
+ model = lgb.LGBMRegressor(**{**{"n_jobs":1}, **best})
547
  elif family_name == "CatBoost" and optional_families.get("CatBoost"):
548
+ model = cb.CatBoostRegressor(**{**{"verbose":0}, **best})
 
 
 
 
 
549
  else:
550
+ model = RandomForestRegressor(random_state=42)
551
  except Exception:
552
+ model = RandomForestRegressor(random_state=42)
553
 
 
554
  try:
555
+ score = float(np.mean(cross_val_score(model, X_local, y_local, scoring="r2", cv=3)))
556
  except Exception:
557
  score = -999.0
558
+ return {"model_obj": model, "cv_score": score, "best_params": best, "family": family_name}
559
 
 
 
 
 
 
 
560
  if st.button("Run expanded AutoML + Stacking"):
561
  st.session_state["run_automl_clicked"] = True
562
+
563
  if st.session_state["run_automl_clicked"]:
564
  log("AutoML + Stacking initiated.")
565
+ with st.spinner("Tuning multiple families..."):
566
  families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
567
  if allow_advanced:
568
  if optional_families.get("XGBoost"): families_to_try.append("XGBoost")
569
  if optional_families.get("LightGBM"): families_to_try.append("LightGBM")
570
  if optional_families.get("CatBoost"): families_to_try.append("CatBoost")
 
 
571
 
572
  tuned_results = []
573
  for fam in families_to_try:
574
  log(f"Tuning family: {fam}")
575
  st.caption(f"Tuning family: {fam}")
576
+ tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
577
+ # --- Leaderboard
 
 
 
 
 
 
 
578
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
579
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
580
  st.markdown("### Tuning Leaderboard (by CV R²)")
581
  st.dataframe(lb[["family","cv_r2"]].round(4))
582
+
583
+ # --- Enhanced Ensemble Stacking ---
584
+ from sklearn.feature_selection import SelectKBest, f_regression
585
+ from sklearn.linear_model import LinearRegression
586
+ from sklearn.model_selection import KFold
587
+
 
 
 
 
 
 
 
 
 
 
 
588
  st.markdown("### Building base models & out-of-fold predictions for stacking")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
+ scaler = StandardScaler()
591
+ X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
592
+ selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
593
+ X_sel = selector.fit_transform(X_scaled, y)
594
+ selected_feature_names = [X.columns[i] for i in selector.get_support(indices=True)]
595
+ X_sel = pd.DataFrame(X_sel, columns=selected_feature_names)
 
 
 
 
 
 
596
 
597
+ kf = KFold(n_splits=5, shuffle=True, random_state=42)
598
+ base_models, oof_preds = [], pd.DataFrame(index=X_sel.index)
599
 
600
+ for fam, entry in [(r["family"], r) for r in tuned_results if r.get("model_obj")]:
601
+ model_obj = entry["model_obj"]
602
+ oof = np.zeros(X_sel.shape[0])
603
+ for tr_idx, val_idx in kf.split(X_sel):
604
+ X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
605
+ y_tr = y.iloc[tr_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  try:
607
+ model_obj.fit(X_tr, y_tr)
608
+ preds = model_obj.predict(X_val)
609
+ oof[val_idx] = preds
610
  except Exception:
611
+ oof[val_idx] = np.mean(y_tr)
612
+ oof_preds[f"{fam}_oof"] = oof
613
+ model_obj.fit(X_sel, y)
614
+ base_models.append({"family": fam, "model": model_obj})
615
+
616
+ if oof_preds.empty:
617
+ st.error("No base models built.")
618
+ st.stop()
619
+
620
+ corr = oof_preds.corr().abs()
621
+ div = {c: 1 - corr[c].drop(c).mean() for c in corr.columns}
622
+ cv_r2_est = {c: r2_score(y, oof_preds[c]) for c in oof_preds.columns}
623
+
624
+ summary_df = pd.DataFrame({
625
+ "family": [c.replace("_oof","") for c in oof_preds.columns],
626
+ "cv_r2": [cv_r2_est[c] for c in oof_preds.columns],
627
+ "diversity": [div[c] for c in oof_preds.columns]
628
+ }).sort_values(["cv_r2","diversity"], ascending=[False,False])
629
+
630
+ st.dataframe(summary_df.round(4))
631
+ selected = summary_df.head(top_k)["family"].tolist()
632
+ st.markdown(f"Selected for stacking (top {top_k}): {selected}")
633
+
634
+ meta = LinearRegression(positive=True)
635
+ X_stack = oof_preds[[f"{s}_oof" for s in selected]].fillna(0)
636
+ meta.fit(X_stack, y)
637
+
638
+ X_tr, X_val, y_tr, y_val = train_test_split(X_sel, y, test_size=0.2, random_state=42)
639
+ meta_inputs = []
640
+ for fam in selected:
641
+ mdl = next((b["model"] for b in base_models if b["family"] == fam), None)
642
+ preds = mdl.predict(X_val) if mdl else np.full(len(X_val), np.mean(y_tr))
643
+ meta_inputs.append(np.ravel(preds))
644
+ X_meta_val = pd.DataFrame(np.column_stack(meta_inputs), columns=X_stack.columns)
645
+ y_meta_pred = meta.predict(X_meta_val)
646
+
647
+ final_r2 = r2_score(y_val, y_meta_pred)
648
+ final_rmse = np.sqrt(mean_squared_error(y_val, y_meta_pred))
649
+ st.success(f"Stacked Ensemble — R² = {final_r2:.4f}, RMSE = {final_rmse:.3f}")
650
+
651
+ fig, ax = plt.subplots(figsize=(7,4))
652
+ ax.scatter(y_val, y_meta_pred, alpha=0.7)
653
+ ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
654
+ st.pyplot(fig, clear_figure=True)
655
+
656
+ st.session_state["automl_summary"] = {
657
+ "leaderboard": summary_df[["family","cv_r2"]].to_dict(orient="records"),
658
+ "final_r2": float(final_r2),
659
+ "final_rmse": float(final_rmse),
660
+ "target": target,
661
+ "use_case": use_case
662
+ }
663
+
664
+ # --- Operator Advisory System + Llama-3-70B-Instruct ---
665
+ st.markdown("---")
666
+ st.subheader("Operator Advisory System — Real-Time Shift Recommendations")
667
+
668
+ try:
669
+ top_base = next((b for b in base_models if b["family"] == selected[0]), None)
670
+ if top_base and hasattr(top_base["model"], "predict"):
671
+ sample_X = X_val.sample(min(300, len(X_val)), random_state=42)
672
+ model = top_base["model"]
673
+ expl = shap.TreeExplainer(model)
674
+ shap_vals = expl.shap_values(sample_X)
675
+ if isinstance(shap_vals, list): shap_vals = shap_vals[0]
676
+ shap_vals = np.array(shap_vals)
677
+ mean_abs = np.abs(shap_vals).mean(axis=0)
678
+ mean_sign = np.sign(shap_vals).mean(axis=0)
679
+ importance = pd.DataFrame({
680
+ "Feature": sample_X.columns,
681
+ "Mean |SHAP|": mean_abs,
682
+ "Mean SHAP Sign": mean_sign
683
+ }).sort_values("Mean |SHAP|", ascending=False)
684
+ st.markdown("### Top 5 Operational Drivers")
685
+ st.dataframe(importance.head(5))
686
+ recommendations = []
687
+ for _, row in importance.head(5).iterrows():
688
+ f, s = row["Feature"], row["Mean SHAP Sign"]
689
+ if s > 0.05: recommendations.append(f"Increase `{f}` likely increases `{target}`")
690
+ elif s < -0.05: recommendations.append(f"Decrease `{f}` likely increases `{target}`")
691
+ else: recommendations.append(f"`{f}` neutral for `{target}`")
692
+ st.markdown("### Suggested Operator Adjustments")
693
+ st.write("\n".join(recommendations))
694
+
695
+ # --- Call HF Llama-3-70B-Instruct API for summary ---
696
+ import requests
697
+ HF_TOKEN = st.secrets.get("HF_TOKEN", os.getenv("HF_TOKEN"))
698
+ if not HF_TOKEN:
699
+ st.error("HF_TOKEN not found in secrets or environment.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  else:
701
+ API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3-70B-Instruct"
702
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
703
+ prompt = f"""
704
+ You are an expert metallurgical process advisor.
705
+ Based on these recommendations:
706
+ {recommendations}
707
+ Target: {target}
708
+ Use case: {use_case}
709
+ Summarize in three concise, professional lines what the operator should do this shift.
710
+ """
711
+ payload = {"inputs": prompt, "parameters": {"max_new_tokens": 150, "temperature": 0.6}}
712
+ with st.spinner("Generating operator note (Llama-3-70B)…"):
713
+ resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
714
+ text = resp.json()[0].get("generated_text","").strip()
715
+ st.info(text)
716
  else:
717
+ st.info("No suitable base model found.")
718
+ except Exception as e:
719
+ st.warning(f"Operator advisory skipped: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ----- Business Impact tab
with tabs[5]:
    st.subheader("Business Impact Metrics")

    # Static reference table: one row per use case with its target variable,
    # rationale, and estimated annual business leverage.
    impact_rows = [
        ["EAF Data Intelligence", "furnace_temp / tap_temp", "Central control variable", "₹2060 L/year"],
        ["Casting Optimization", "surface_temp / cooling_water_temp", "Controls billet quality", "₹50 L/year"],
        ["Rolling Mill", "energy_efficiency", "Energy optimization", "₹5–10 L/year"],
        ["Refractory Loss Prediction", "lining_thickness / heat_loss_rate", "Wear and downtime", "₹40 L/year"],
    ]
    impact_columns = ["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"]
    target_table = pd.DataFrame(impact_rows, columns=impact_columns)
    st.dataframe(target_table, width="stretch")
# ----- Bibliography tab
with tabs[6]:
    st.subheader("Annotated Bibliography")

    # Each entry: (title, authors, one-line annotation, URL).
    bibliography = [
        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Soft sensors validate `furnace_temp` and `tap_temp`.", "https://doi.org/10.1021/acsomega.4c01254"),
        ("Optimisation of Operator Support Systems", "Ojeda Roldán et al. (2022)", "Reinforcement learning for endpoint control.", "https://doi.org/10.3390/jmmp6020034"),
        ("Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking", "Zhuo et al. (2024)", "Links arc power and energy KPIs.", "https://doi.org/10.3390/met15010113"),
        ("Dynamic EAF Modeling and Slag Foaming Index Prediction", "MacRosty et al.", "Supports refractory wear modeling.", "https://www.sciencedirect.com/science/article/pii/S0921883123004019"),
    ]
    # Render each reference as a linked, annotated markdown entry.
    for title, authors, note, url in bibliography:
        st.markdown(f"**[{title}]({url})** — *{authors}* \n_{note}_")
# ----- Download tab
with tabs[7]:
    st.subheader("Download Saved Files")

    # Collect regular files (skip subdirectories) from the log directory.
    saved_files = [
        name for name in os.listdir(LOG_DIR)
        if os.path.isfile(os.path.join(LOG_DIR, name))
    ]

    if not saved_files:
        st.info("No files yet — run AutoML first.")
    else:
        # One download button per saved artifact, in sorted order.
        for name in sorted(saved_files):
            full_path = os.path.join(LOG_DIR, name)
            with open(full_path, "rb") as handle:
                st.download_button(f"Download {name}", handle, file_name=name)
# ----- Logs tab
with tabs[8]:
    st.subheader("Master Log")
    if os.path.exists(LOG_PATH):
        # FIX: read the log through a context manager — the original
        # `open(LOG_PATH).read()` never closed the file handle.
        with open(LOG_PATH) as log_file:
            txt = log_file.read()
        st.text_area("Log Output", txt, height=400)
        st.download_button("Download Log", txt, file_name="run_master.log")
    else:
        st.info("No logs yet — run AutoML once.")

st.markdown("---")
st.markdown("**Note:** Synthetic demo dataset for educational use only. Real deployment requires plant data, NDA, and safety validation.")