singhn9 committed on
Commit
222bddd
·
verified ·
1 Parent(s): 9c9ae9b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +138 -59
src/streamlit_app.py CHANGED
@@ -23,27 +23,43 @@ from sklearn.metrics import mean_squared_error, r2_score
23
  # SHAP
24
  import shap
25
 
 
26
  # -------------------------
27
  # Config & paths
28
  # -------------------------
29
- st.set_page_config(page_title="AI Feature Universe Explorer Advanced + SHAP", layout="wide")
30
 
31
- # Use Hugging Face persistent path if available
32
- PERSISTENT_DIR = "/data" if os.path.exists("/data") else "./data"
33
- DATA_DIR = os.getenv("DATA_DIR", PERSISTENT_DIR)
34
 
 
 
35
  os.makedirs(DATA_DIR, exist_ok=True)
 
 
 
 
 
36
 
 
 
 
 
 
 
 
 
 
37
  CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
38
  META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
39
- PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
40
  ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
41
 
 
42
  # Confirm storage mount
43
  if os.path.exists("/data"):
44
- st.sidebar.success(f" Using persistent storage: {DATA_DIR}")
45
  else:
46
- st.sidebar.warning(f"⚠️ Using ephemeral storage: {DATA_DIR}. Data will be lost on rebuild.")
47
 
48
 
49
  # -------------------------
@@ -158,9 +174,9 @@ def generate_advanced_flatfile(
158
 
159
  # timestamps & metadata
160
  start = pd.Timestamp("2025-01-01T00:00:00")
161
- df["timestamp"] = pd.date_range(start, periods=n_rows, freq="T")
162
  df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
163
- df["meta_plant_name"] = np.random.choice(["Rourkela","Jamshedpur","VSP","Bokaro","Kalinganagar","Salem"], n_rows)
164
  df["meta_country"] = "India"
165
 
166
  # --- synthetic features: physics informed proxies
@@ -177,7 +193,7 @@ def generate_advanced_flatfile(
177
  if rc in df.columns:
178
  df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
179
  df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
180
- df[f"{rc}_lag1"] = df[rc].shift(1).fillna(method="bfill")
181
  df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)
182
 
183
  # interaction & polynomial-lite
@@ -210,7 +226,7 @@ def generate_advanced_flatfile(
210
 
211
  # surrogate models
212
  surrogate_df = df.copy()
213
- surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).fillna(method="ffill")
214
  features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
215
  if len(features_for_surrogate) >= 2:
216
  X = surrogate_df[features_for_surrogate].fillna(0)
@@ -263,34 +279,35 @@ def generate_advanced_flatfile(
263
  with open(META_PATH, "w") as f:
264
  json.dump(meta, f, indent=2)
265
 
 
266
  # annotated bibliography
267
- try:
268
- from fpdf import FPDF
269
- pdf = FPDF('P','mm','A4')
270
- pdf.add_page()
271
- pdf.set_font("Helvetica","B",14)
272
- pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
273
- pdf.ln(2)
274
- pdf.set_font("Helvetica","",10)
275
- pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
276
- pdf.ln(4)
277
- bib_items = [
278
- ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
279
- ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
280
- ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
281
- ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
282
- ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
283
- ]
284
- for title, auth, note in bib_items:
285
- pdf.set_font("Helvetica","B",11)
286
- pdf.multi_cell(0,6, f"{title} — {auth}")
287
- pdf.set_font("Helvetica","",10)
288
- pdf.multi_cell(0,5, f"Notes: {note}")
289
- pdf.ln(2)
290
- pdf.output(PDF_PATH)
291
- except Exception as e:
292
- with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
293
- tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
294
 
295
  return CSV_PATH, META_PATH, PDF_PATH
296
 
@@ -563,6 +580,7 @@ with tabs[4]:
563
  # --- Run tuning across available families (user triggered) ---
564
  run_btn = st.button(" Run expanded AutoML + Stacking")
565
  if run_btn:
 
566
  with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
567
  families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
568
  if allow_advanced:
@@ -574,6 +592,7 @@ with tabs[4]:
574
 
575
  tuned_results = []
576
  for fam in families_to_try:
 
577
  st.caption(f"Tuning family: {fam}")
578
  res = tune_family(fam, X, y, n_trials=max_trials)
579
  # res can be dict or single-run result; ensure consistent format
@@ -581,7 +600,8 @@ with tabs[4]:
581
  tuned_results.append(res)
582
  else:
583
  st.warning(f"Family {fam} returned unexpected tune result: {res}")
584
-
 
585
  # build leaderboard DataFrame
586
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
587
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
@@ -734,6 +754,9 @@ with tabs[4]:
734
  # Final evaluation
735
  final_r2 = r2_score(y_val, y_meta_pred)
736
  final_rmse = mean_squared_error(y_val, y_meta_pred, squared=False)
 
 
 
737
 
738
  c1, c2 = st.columns(2)
739
  c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
@@ -757,7 +780,7 @@ with tabs[4]:
757
  "target": target,
758
  }
759
  joblib.dump(to_save, stack_artifact)
760
- st.caption(f" Stacked ensemble saved: {stack_artifact}")
761
 
762
  # Explainability
763
  st.markdown("### Explainability (approximate)")
@@ -778,7 +801,7 @@ with tabs[4]:
778
  except Exception as e:
779
  st.warning(f"SHAP computation skipped: {e}")
780
 
781
- st.success(" AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
782
 
783
 
784
 
@@ -797,7 +820,7 @@ with tabs[5]:
797
  ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
798
  ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"]], columns=["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"])
799
 
800
- st.dataframe(target_table, use_container_width=True)
801
 
802
  st.markdown("---")
803
  st.subheader("Business Framing for Clients")
@@ -811,37 +834,93 @@ with tabs[5]:
811
  ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
812
  ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
813
 
814
- st.dataframe(business_table, use_container_width=True)
815
  st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
816
 
817
  # ----- Bibliography tab
818
  with tabs[6]:
819
- st.subheader("Annotated Bibliography & Feature Justification")
820
  st.markdown("""
821
- This section summarizes published research supporting the feature design and modeling choices.
 
822
  """)
823
 
824
  bib_data = [
825
- ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Supports gas proxies, lags, PCA for off-gas and temperature correlation."),
826
- ("Optimisation of Oxygen Blowing Process using RL", "Ojeda Roldan et al. (2022)", "Reinforcement learning for oxygen control; motivates surrogate predicted states & safety indices."),
827
- ("Analyzing the Energy Efficiency of Electric Arc Furnace", "Zhuo et al. (2024)", "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
828
- ("BOF/Endpoint Prediction Techniques", "Springer (2024)", "Endpoint prediction; supports temporal lags and cycle encoding."),
829
- ("Dynamic EAF Modeling & Slag Foaming", "MacRosty et al.", "Physics priors for slag_foaming_index and refractory health modeling."),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
830
  ]
831
 
832
- bib_df = pd.DataFrame(bib_data, columns=["Paper Title", "Authors / Year", "Relevance to Feature Engineering"])
833
- st.dataframe(bib_df, use_container_width=True)
 
 
 
 
 
 
 
834
 
835
  st.markdown("""
836
- **Feature-to-Research Mapping Summary:**
837
- - Gas probes & soft-sensing `carbon_proxy`, `oxygen_utilization`
838
- - Power & energy proxies `power_density`, `energy_efficiency`
839
- - Temporal featuresrolling means, lags, cycle progress indicators
840
- - Surrogate features → `pred_temp_30s`, `pred_carbon_5min`
841
- - PCA / clustering → operating mode compression
842
- """)
 
 
 
843
  # -------------------------
844
  # Footer / Notes
845
  # -------------------------
846
  st.markdown("---")
847
  st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # SHAP
24
  import shap
25
 
26
+
27
  # -------------------------
28
  # Config & paths
29
  # -------------------------
30
+ st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
31
 
32
+ # Base ephemeral paths (no Hugging Face data mount)
33
+ BASE_DIR = "./"
 
34
 
35
+ LOG_DIR = os.path.join(BASE_DIR, "logs")
36
+ DATA_DIR = os.path.join(LOG_DIR, "data_ephemeral")
37
  os.makedirs(DATA_DIR, exist_ok=True)
38
+ os.makedirs(LOG_DIR, exist_ok=True)
39
+
40
# Session-scoped, timestamped log file.
# Streamlit re-executes this script from the top on every widget interaction,
# so the log path is created once per browser session and kept in
# st.session_state; recomputing it here would start a new run_<timestamp>.log
# file (and re-log the start-up banner) on every rerun.
_first_run = "log_path" not in st.session_state
if _first_run:
    st.session_state["log_timestamp"] = datetime.now().strftime("%Y%m%d_%H%M%S")
    st.session_state["log_path"] = os.path.join(
        LOG_DIR, f"run_{st.session_state['log_timestamp']}.log"
    )
# Module-level names kept for any code that reads them directly.
timestamp = st.session_state["log_timestamp"]
LOG_PATH = st.session_state["log_path"]


def log(msg: str) -> None:
    """Append ``msg`` to the session log file and echo it to stdout.

    Each call opens ``LOG_PATH`` in append mode, writes one line prefixed
    with the current local time as ``[YYYY-MM-DD HH:MM:SS]``, closes the
    file, and then prints the bare message.

    Args:
        msg: Text to record; written verbatim after the timestamp prefix.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(f"[{stamp}] {msg}\n")
    print(msg)


# Start-up banner: only on the first script run of a session.
if _first_run:
    log(" Streamlit session started.")
    log(f"Python PID={os.getpid()} | Time={datetime.now().isoformat()}")
    log(f"Data Dir = {DATA_DIR} | Log Dir = {LOG_DIR}")
53
  CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
54
  META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
 
55
  ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
56
 
57
+
58
  # Confirm storage mount
59
  if os.path.exists("/data"):
60
+ st.sidebar.success(f" Using persistent storage: {DATA_DIR}")
61
  else:
62
+ st.sidebar.warning(f" Using ephemeral storage: {DATA_DIR}. Data will be lost on rebuild.")
63
 
64
 
65
  # -------------------------
 
174
 
175
  # timestamps & metadata
176
  start = pd.Timestamp("2025-01-01T00:00:00")
177
+ df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")
178
  df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
179
+ df["meta_plant_name"] = np.random.choice(["Rourkela","Bhilai","Durgapur","Bokaro","Burnpur","Salem"], n_rows)
180
  df["meta_country"] = "India"
181
 
182
  # --- synthetic features: physics informed proxies
 
193
  if rc in df.columns:
194
  df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
195
  df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
196
+ df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
197
  df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)
198
 
199
  # interaction & polynomial-lite
 
226
 
227
  # surrogate models
228
  surrogate_df = df.copy()
229
+ surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).ffill()
230
  features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
231
  if len(features_for_surrogate) >= 2:
232
  X = surrogate_df[features_for_surrogate].fillna(0)
 
279
  with open(META_PATH, "w") as f:
280
  json.dump(meta, f, indent=2)
281
 
282
+ PDF_PATH = None
283
  # annotated bibliography
284
+ # try:
285
+ # from fpdf import FPDF
286
+ # pdf = FPDF('P','mm','A4')
287
+ # pdf.add_page()
288
+ # pdf.set_font("Helvetica","B",14)
289
+ # pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
290
+ # pdf.ln(2)
291
+ # pdf.set_font("Helvetica","",10)
292
+ # pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
293
+ # pdf.ln(4)
294
+ # bib_items = [
295
+ # ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
296
+ # ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
297
+ # ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
298
+ # ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
299
+ # ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
300
+ # ]
301
+ # for title, auth, note in bib_items:
302
+ # pdf.set_font("Helvetica","B",11)
303
+ # pdf.multi_cell(0,6, f"{title} — {auth}")
304
+ # pdf.set_font("Helvetica","",10)
305
+ # pdf.multi_cell(0,5, f"Notes: {note}")
306
+ # pdf.ln(2)
307
+ # pdf.output(PDF_PATH)
308
+ # except Exception as e:
309
+ # with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
310
+ # tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
311
 
312
  return CSV_PATH, META_PATH, PDF_PATH
313
 
 
580
  # --- Run tuning across available families (user triggered) ---
581
  run_btn = st.button(" Run expanded AutoML + Stacking")
582
  if run_btn:
583
+ log("AutoML + Stacking initiated.")
584
  with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
585
  families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
586
  if allow_advanced:
 
592
 
593
  tuned_results = []
594
  for fam in families_to_try:
595
+ log(f"Tuning family: {fam}")
596
  st.caption(f"Tuning family: {fam}")
597
  res = tune_family(fam, X, y, n_trials=max_trials)
598
  # res can be dict or single-run result; ensure consistent format
 
600
  tuned_results.append(res)
601
  else:
602
  st.warning(f"Family {fam} returned unexpected tune result: {res}")
603
+ log("All families tuned successfully.")
604
+
605
  # build leaderboard DataFrame
606
  lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
607
  lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
 
754
  # Final evaluation
755
  final_r2 = r2_score(y_val, y_meta_pred)
756
  final_rmse = mean_squared_error(y_val, y_meta_pred, squared=False)
757
+ st.success("AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
758
+ log(f"Completed stacking. Final R2={final_r2:.4f}, RMSE={final_rmse:.4f}")
759
+
760
 
761
  c1, c2 = st.columns(2)
762
  c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
 
780
  "target": target,
781
  }
782
  joblib.dump(to_save, stack_artifact)
783
+ st.caption(f" Stacked ensemble saved: {stack_artifact}")
784
 
785
  # Explainability
786
  st.markdown("### Explainability (approximate)")
 
801
  except Exception as e:
802
  st.warning(f"SHAP computation skipped: {e}")
803
 
804
+ st.success(" AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
805
 
806
 
807
 
 
820
  ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
821
  ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"]], columns=["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"])
822
 
823
+ st.dataframe(target_table, width="stretch")
824
 
825
  st.markdown("---")
826
  st.subheader("Business Framing for Clients")
 
834
  ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
835
  ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
836
 
837
+ st.dataframe(business_table, width="stretch")
838
  st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
839
 
840
  # ----- Bibliography tab
841
  with tabs[6]:
842
+ st.subheader("Annotated Bibliography Justification for Target Variables")
843
  st.markdown("""
844
+ These papers justify the chosen target variables (temperature, yield, efficiency, refractory wear)
845
+ in metallurgical AI modeling. Click any title to open the official paper.
846
  """)
847
 
848
  bib_data = [
849
+ {
850
+ "title": "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
851
+ "authors": "Yan et al. (2024)",
852
+ "notes": "Soft sensors for furnace and tap temperature; validates `furnace_temp` and `tap_temp` targets.",
853
+ "url": "https://doi.org/10.1021/acsomega.4c01254"
854
+ },
855
+ {
856
+ "title": "Optimisation of Operator Support Systems through Artificial Intelligence for the Cast Steel Industry",
857
+ "authors": "Ojeda Roldán et al. (2022)",
858
+ "notes": "Reinforcement learning for oxygen blowing and endpoint control; supports temperature and carbon targets.",
859
+ "url": "https://doi.org/10.3390/jmmp6020034"
860
+ },
861
+ {
862
+ "title": "Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking",
863
+ "authors": "Zhuo et al. (2024)",
864
+ "notes": "Links arc power, temperature, and energy KPIs — validates `energy_efficiency` and `power_density`.",
865
+ "url": "https://doi.org/10.3390/met15010113"
866
+ },
867
+ {
868
+ "title": "Dynamic EAF Modeling and Slag Foaming Index Prediction",
869
+ "authors": "MacRosty et al.",
870
+ "notes": "Supports refractory and heat-flux-based wear prediction — validates `lining_thickness` target.",
871
+ "url": "https://www.sciencedirect.com/science/article/pii/S0921883123004019"
872
+ },
873
+ {
874
+ "title": "Machine Learning for Yield Optimization in Continuous Casting",
875
+ "authors": "Springer (2023)",
876
+ "notes": "ML for yield ratio and defect minimization; supports `yield_ratio` target.",
877
+ "url": "https://link.springer.com/article/10.1007/s40964-023-00592-7"
878
+ }
879
  ]
880
 
881
+ bib_df = pd.DataFrame(bib_data)
882
+ bib_df["Paper Title"] = bib_df.apply(lambda x: f"[{x['title']}]({x['url']})", axis=1)
883
+
884
+ st.dataframe(
885
+ bib_df[["Paper Title", "authors", "notes"]]
886
+ .rename(columns={"authors": "Authors / Year", "notes": "Relevance"}),
887
+ width="stretch",
888
+ hide_index=True
889
+ )
890
 
891
  st.markdown("""
892
+ **Feature Target Justification**
893
+ - `furnace_temp`, `tap_temp` Process temperature (Yan 2024, Ojeda 2022)
894
+ - `yield_ratio` Production yield (Springer 2023)
895
+ - `energy_efficiency`, `power_density`Energy KPIs (Zhuo 2024)
896
+ - `lining_thickness`, `slag_foaming_index` → Refractory & process health (MacRosty et al.)
897
+ """)
898
+
899
+ st.info("Click any paper title above to open it in a new tab.")
900
+ log("Bibliography tab rendered successfully.")
901
+
902
  # -------------------------
903
  # Footer / Notes
904
  # -------------------------
905
  st.markdown("---")
906
  st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
907
# ----- Logs panel
# The previous version did `tabs.append("View Logs")` and then
# `with tabs[-1]:`, which puts a plain string in a `with` statement; a str is
# not a context manager, so that fails at runtime. The viewer is rendered in
# its own expander instead.
# NOTE(review): assumes `tabs` is the container sequence returned by
# st.tabs(); if a real "View Logs" tab is wanted, add its label to that
# st.tabs([...]) call and use the matching index here — TODO confirm.
with st.expander("View Logs"):
    st.subheader("📜 Session & Model Logs")
    st.markdown("Each run creates a timestamped log file in `/logs/` inside this Space. Use this panel to review run progress and debug output.")

    # Newest first: file names embed a sortable YYYYMMDD_HHMMSS timestamp.
    log_files = sorted(
        (f for f in os.listdir(LOG_DIR) if f.endswith(".log")),
        reverse=True,
    )

    if not log_files:
        st.info("No logs yet. Run an AutoML job first.")
    else:
        latest = st.selectbox("Select log file", log_files, index=0)
        path = os.path.join(LOG_DIR, latest)
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        st.text_area("Log Output", content, height=400)
        st.download_button("⬇️ Download Log", content, file_name=latest)