SteelAI

Sleeping

App Files Files Community

singhn9 committed on Nov 8, 2025

Commit

222bddd

verified ·

1 Parent(s): 9c9ae9b

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +138 -59

src/streamlit_app.py CHANGED Viewed

@@ -23,27 +23,43 @@ from sklearn.metrics import mean_squared_error, r2_score
 # SHAP
 import shap
 # -------------------------
 # Config & paths
 # -------------------------
-st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
-# Use Hugging Face persistent path if available
-PERSISTENT_DIR = "/data" if os.path.exists("/data") else "./data"
-DATA_DIR = os.getenv("DATA_DIR", PERSISTENT_DIR)
 os.makedirs(DATA_DIR, exist_ok=True)
 CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
 META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
-PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
 ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
 # Confirm storage mount
 if os.path.exists("/data"):
-    st.sidebar.success(f"✅ Using persistent storage: {DATA_DIR}")
 else:
-    st.sidebar.warning(f"⚠️ Using ephemeral storage: {DATA_DIR}. Data will be lost on rebuild.")
 # -------------------------
@@ -158,9 +174,9 @@ def generate_advanced_flatfile(
     # timestamps & metadata
     start = pd.Timestamp("2025-01-01T00:00:00")
-    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="T")
     df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
-    df["meta_plant_name"] = np.random.choice(["Rourkela","Jamshedpur","VSP","Bokaro","Kalinganagar","Salem"], n_rows)
     df["meta_country"] = "India"
     # --- synthetic features: physics informed proxies
@@ -177,7 +193,7 @@ def generate_advanced_flatfile(
         if rc in df.columns:
             df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
             df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
-            df[f"{rc}_lag1"] = df[rc].shift(1).fillna(method="bfill")
             df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)
     # interaction & polynomial-lite
@@ -210,7 +226,7 @@ def generate_advanced_flatfile(
     # surrogate models
     surrogate_df = df.copy()
-    surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).fillna(method="ffill")
     features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
     if len(features_for_surrogate) >= 2:
         X = surrogate_df[features_for_surrogate].fillna(0)
@@ -263,34 +279,35 @@ def generate_advanced_flatfile(
     with open(META_PATH, "w") as f:
         json.dump(meta, f, indent=2)
     # annotated bibliography
-    try:
-        from fpdf import FPDF
-        pdf = FPDF('P','mm','A4')
-        pdf.add_page()
-        pdf.set_font("Helvetica","B",14)
-        pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
-        pdf.ln(2)
-        pdf.set_font("Helvetica","",10)
-        pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
-        pdf.ln(4)
-        bib_items = [
-            ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
-            ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
-            ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
-            ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
-            ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
-        ]
-        for title, auth, note in bib_items:
-            pdf.set_font("Helvetica","B",11)
-            pdf.multi_cell(0,6, f"{title} — {auth}")
-            pdf.set_font("Helvetica","",10)
-            pdf.multi_cell(0,5, f"Notes: {note}")
-            pdf.ln(2)
-        pdf.output(PDF_PATH)
-    except Exception as e:
-        with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
-            tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
     return CSV_PATH, META_PATH, PDF_PATH
@@ -563,6 +580,7 @@ with tabs[4]:
     # --- Run tuning across available families (user triggered) ---
     run_btn = st.button(" Run expanded AutoML + Stacking")
     if run_btn:
         with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
             families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
             if allow_advanced:
@@ -574,6 +592,7 @@ with tabs[4]:
             tuned_results = []
             for fam in families_to_try:
                 st.caption(f"Tuning family: {fam}")
                 res = tune_family(fam, X, y, n_trials=max_trials)
                 # res can be dict or single-run result; ensure consistent format
@@ -581,7 +600,8 @@ with tabs[4]:
                     tuned_results.append(res)
                 else:
                     st.warning(f"Family {fam} returned unexpected tune result: {res}")
             # build leaderboard DataFrame
             lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
             lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
@@ -734,6 +754,9 @@ with tabs[4]:
                 # Final evaluation
                 final_r2 = r2_score(y_val, y_meta_pred)
                 final_rmse = mean_squared_error(y_val, y_meta_pred, squared=False)
                 c1, c2 = st.columns(2)
                 c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
@@ -757,7 +780,7 @@ with tabs[4]:
                     "target": target,
                 }
                 joblib.dump(to_save, stack_artifact)
-                st.caption(f"✅ Stacked ensemble saved: {stack_artifact}")
                 # Explainability
                 st.markdown("### Explainability (approximate)")
@@ -778,7 +801,7 @@ with tabs[4]:
                 except Exception as e:
                     st.warning(f"SHAP computation skipped: {e}")
-                st.success("✅ AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
@@ -797,7 +820,7 @@ with tabs[5]:
         ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
         ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"]], columns=["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"])
-    st.dataframe(target_table,  use_container_width=True)
     st.markdown("---")
     st.subheader("Business Framing for Clients")
@@ -811,37 +834,93 @@ with tabs[5]:
         ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
     ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
-    st.dataframe(business_table, use_container_width=True)
     st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
 # -----  Bibliography tab
 with tabs[6]:
-    st.subheader("Annotated Bibliography & Feature Justification")
     st.markdown("""
-This section summarizes published research supporting the feature design and modeling choices.
     """)
     bib_data = [
-        ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems", "Yan et al. (2024)", "Supports gas proxies, lags, PCA for off-gas and temperature correlation."),
-        ("Optimisation of Oxygen Blowing Process using RL", "Ojeda Roldan et al. (2022)", "Reinforcement learning for oxygen control; motivates surrogate predicted states & safety indices."),
-        ("Analyzing the Energy Efficiency of Electric Arc Furnace", "Zhuo et al. (2024)", "Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
-        ("BOF/Endpoint Prediction Techniques", "Springer (2024)", "Endpoint prediction; supports temporal lags and cycle encoding."),
-        ("Dynamic EAF Modeling & Slag Foaming", "MacRosty et al.", "Physics priors for slag_foaming_index and refractory health modeling."),
     ]
-    bib_df = pd.DataFrame(bib_data, columns=["Paper Title", "Authors / Year", "Relevance to Feature Engineering"])
-    st.dataframe(bib_df, use_container_width=True)
     st.markdown("""
-**Feature-to-Research Mapping Summary:**
-- Gas probes & soft-sensing → `carbon_proxy`, `oxygen_utilization`
-- Power & energy proxies → `power_density`, `energy_efficiency`
-- Temporal features → rolling means, lags, cycle progress indicators
-- Surrogate features → `pred_temp_30s`, `pred_carbon_5min`
-- PCA / clustering → operating mode compression
-""")
 # -------------------------
 # Footer / Notes
 # -------------------------
 st.markdown("---")
 st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")

 # SHAP
 import shap
 # -------------------------
 # Config & paths
 # -------------------------
+st.set_page_config(page_title="Steel Authority of India Limited (MODEX)", layout="wide")
+# Base ephemeral paths (no Hugging Face data mount)
+BASE_DIR = "./"
+LOG_DIR = os.path.join(BASE_DIR, "logs")
+DATA_DIR = os.path.join(LOG_DIR, "data_ephemeral")
 os.makedirs(DATA_DIR, exist_ok=True)
+os.makedirs(LOG_DIR, exist_ok=True)
+# Timestamped log file
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+LOG_PATH = os.path.join(LOG_DIR, f"run_{timestamp}.log")
+def log(msg: str):
+    """Log message with timestamp to /logs/ for ephemeral HF runs."""
+    with open(LOG_PATH, "a", encoding="utf-8") as f:
+        f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}\n")
+    print(msg)
+log(" Streamlit session started.")
+log(f"Python PID={os.getpid()} | Time={datetime.now().isoformat()}")
+log(f"Data Dir = {DATA_DIR} | Log Dir = {LOG_DIR}")
 CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
 META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
 ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
 # Confirm storage mount
 if os.path.exists("/data"):
+    st.sidebar.success(f" Using persistent storage: {DATA_DIR}")
 else:
+    st.sidebar.warning(f" Using ephemeral storage: {DATA_DIR}. Data will be lost on rebuild.")
 # -------------------------
     # timestamps & metadata
     start = pd.Timestamp("2025-01-01T00:00:00")
+    df["timestamp"] = pd.date_range(start, periods=n_rows, freq="min")
     df["cycle_minute"] = np.mod(np.arange(n_rows), 80)
+    df["meta_plant_name"] = np.random.choice(["Rourkela","Bhilai","Durgapur","Bokaro","Burnpur","Salem"], n_rows)
     df["meta_country"] = "India"
     # --- synthetic features: physics informed proxies
         if rc in df.columns:
             df[f"{rc}_roll_mean_3"] = df[rc].rolling(3, min_periods=1).mean()
             df[f"{rc}_roll_std_5"] = df[rc].rolling(5, min_periods=1).std().fillna(0)
+            df[f"{rc}_lag1"] = df[rc].shift(1).bfill()
             df[f"{rc}_roc_1"] = df[rc].diff().fillna(0)
     # interaction & polynomial-lite
     # surrogate models
     surrogate_df = df.copy()
+    surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).ffill()
     features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
     if len(features_for_surrogate) >= 2:
         X = surrogate_df[features_for_surrogate].fillna(0)
     with open(META_PATH, "w") as f:
         json.dump(meta, f, indent=2)
+    PDF_PATH = None
     # annotated bibliography
+    # try:
+    #     from fpdf import FPDF
+    #     pdf = FPDF('P','mm','A4')
+    #     pdf.add_page()
+    #     pdf.set_font("Helvetica","B",14)
+    #     pdf.cell(0,8,"Annotated Bibliography - Metallurgical AI (Selected Papers)", ln=True)
+    #     pdf.ln(2)
+    #     pdf.set_font("Helvetica","",10)
+    #     pdf.cell(0,6,"Generated: " + datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), ln=True)
+    #     pdf.ln(4)
+    #     bib_items = [
+    #         ("A Survey of Data-Driven Soft Sensing in Ironmaking Systems","Yan et al. (2024)","Review of soft-sensors; supports gas proxies, lags, PCA."),
+    #         ("Optimisation of Oxygen Blowing Process using RL","Ojeda Roldan et al. (2022)","RL for oxygen control; motivates surrogate predicted states & safety indices."),
+    #         ("Analyzing the Energy Efficiency of Electric Arc Furnace","Zhuo et al. (2024)","Energy KPIs (kWh/t) motivate power_density & energy_efficiency features."),
+    #         ("BOF/Endpoint prediction techniques","Springer (2024)","Endpoint prediction; supports temporal lags and cycle encoding."),
+    #         ("Dynamic EAF modeling & slag foaming","MacRosty et al.","Physics priors for slag_foaming_index and refractory health modeling.")
+    #     ]
+    #     for title, auth, note in bib_items:
+    #         pdf.set_font("Helvetica","B",11)
+    #         pdf.multi_cell(0,6, f"{title} — {auth}")
+    #         pdf.set_font("Helvetica","",10)
+    #         pdf.multi_cell(0,5, f"Notes: {note}")
+    #         pdf.ln(2)
+    #     pdf.output(PDF_PATH)
+    # except Exception as e:
+    #     with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
+    #         tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
     return CSV_PATH, META_PATH, PDF_PATH
     # --- Run tuning across available families (user triggered) ---
     run_btn = st.button(" Run expanded AutoML + Stacking")
     if run_btn:
+        log("AutoML + Stacking initiated.")
         with st.spinner("Tuning multiple families (this may take a while depending on choices)..."):
             families_to_try = ["RandomForest", "ExtraTrees", "MLP"]
             if allow_advanced:
             tuned_results = []
             for fam in families_to_try:
+                log(f"Tuning family: {fam}")
                 st.caption(f"Tuning family: {fam}")
                 res = tune_family(fam, X, y, n_trials=max_trials)
                 # res can be dict or single-run result; ensure consistent format
                     tuned_results.append(res)
                 else:
                     st.warning(f"Family {fam} returned unexpected tune result: {res}")
+            log("All families tuned successfully.")
             # build leaderboard DataFrame
             lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"], "params": r["best_params"]} for r in tuned_results])
             lb = lb.sort_values("cv_r2", ascending=False).reset_index(drop=True)
                 # Final evaluation
                 final_r2 = r2_score(y_val, y_meta_pred)
                 final_rmse = mean_squared_error(y_val, y_meta_pred, squared=False)
+                st.success("AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
+                log(f"Completed stacking. Final R2={final_r2:.4f}, RMSE={final_rmse:.4f}")
                 c1, c2 = st.columns(2)
                 c1.metric("Stacked Ensemble R² (holdout)", f"{final_r2:.4f}")
                     "target": target,
                 }
                 joblib.dump(to_save, stack_artifact)
+                st.caption(f" Stacked ensemble saved: {stack_artifact}")
                 # Explainability
                 st.markdown("### Explainability (approximate)")
                 except Exception as e:
                     st.warning(f"SHAP computation skipped: {e}")
+                st.success(" AutoML + Stacking complete — metrics, artifacts, and SHAP ready.")
         ["Inventory & Yield Optimization", "yield_ratio (output/input)", "Linked to WIP and process yield", "₹1 Cr+/year"],
         ["Refractory & Cooling Loss Prediction", "lining_thickness / heat_loss_rate", "Predict wear for planned maintenance", "₹40 L/year downtime savings"]], columns=["Use Case", "Target Variable", "Why It’s Ideal", "Business Leverage"])
+    st.dataframe(target_table,  width="stretch")
     st.markdown("---")
     st.subheader("Business Framing for Clients")
         ["Yield loss", "2 %", "₹50 L – ₹1 Cr"],
     ], columns=["Metric", "Typical Value (EAF India)", "5 % Improvement → Annual ₹ Value"])
+    st.dataframe(business_table, width="stretch")
     st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
 # -----  Bibliography tab
 with tabs[6]:
+    st.subheader("Annotated Bibliography — Justification for Target Variables")
     st.markdown("""
+These papers justify the chosen target variables (temperature, yield, efficiency, refractory wear)
+in metallurgical AI modeling. Click any title to open the official paper.
     """)
     bib_data = [
+        {
+            "title": "A Survey of Data-Driven Soft Sensing in Ironmaking Systems",
+            "authors": "Yan et al. (2024)",
+            "notes": "Soft sensors for furnace and tap temperature; validates `furnace_temp` and `tap_temp` targets.",
+            "url": "https://doi.org/10.1021/acsomega.4c01254"
+        },
+        {
+            "title": "Optimisation of Operator Support Systems through Artificial Intelligence for the Cast Steel Industry",
+            "authors": "Ojeda Roldán et al. (2022)",
+            "notes": "Reinforcement learning for oxygen blowing and endpoint control; supports temperature and carbon targets.",
+            "url": "https://doi.org/10.3390/jmmp6020034"
+        },
+        {
+            "title": "Analyzing the Energy Efficiency of Electric Arc Furnace Steelmaking",
+            "authors": "Zhuo et al. (2024)",
+            "notes": "Links arc power, temperature, and energy KPIs — validates `energy_efficiency` and `power_density`.",
+            "url": "https://doi.org/10.3390/met15010113"
+        },
+        {
+            "title": "Dynamic EAF Modeling and Slag Foaming Index Prediction",
+            "authors": "MacRosty et al.",
+            "notes": "Supports refractory and heat-flux-based wear prediction — validates `lining_thickness` target.",
+            "url": "https://www.sciencedirect.com/science/article/pii/S0921883123004019"
+        },
+        {
+            "title": "Machine Learning for Yield Optimization in Continuous Casting",
+            "authors": "Springer (2023)",
+            "notes": "ML for yield ratio and defect minimization; supports `yield_ratio` target.",
+            "url": "https://link.springer.com/article/10.1007/s40964-023-00592-7"
+        }
     ]
+    bib_df = pd.DataFrame(bib_data)
+    bib_df["Paper Title"] = bib_df.apply(lambda x: f"[{x['title']}]({x['url']})", axis=1)
+    st.dataframe(
+        bib_df[["Paper Title", "authors", "notes"]]
+        .rename(columns={"authors": "Authors / Year", "notes": "Relevance"}),
+        width="stretch",
+        hide_index=True
+    )
     st.markdown("""
+**Feature ↔ Target Justification**
+- `furnace_temp`, `tap_temp` → Process temperature (Yan 2024, Ojeda 2022)
+- `yield_ratio` → Production yield (Springer 2023)
+- `energy_efficiency`, `power_density` → Energy KPIs (Zhuo 2024)
+- `lining_thickness`, `slag_foaming_index` → Refractory & process health (MacRosty et al.)
+    """)
+    st.info("Click any paper title above to open it in a new tab.")
+    log("Bibliography tab rendered successfully.")
 # -------------------------
 # Footer / Notes
 # -------------------------
 st.markdown("---")
 st.markdown("**Notes:** This dataset is synthetic and for demo/prototyping. Real plant integration requires NDA, data on-boarding, sensor mapping, and plant safety checks before any control actions.")
+# -----  Logs tab
+tabs.append("View Logs")
+with tabs[-1]:
+    st.subheader("📜 Session & Model Logs")
+    st.markdown("Each run creates a timestamped log file in `/logs/` inside this Space. Use this panel to review run progress and debug output.")
+    log_files = sorted(
+        [f for f in os.listdir(LOG_DIR) if f.endswith(".log")],
+        reverse=True
+    )
+    if not log_files:
+        st.info("No logs yet. Run an AutoML job first.")
+    else:
+        latest = st.selectbox("Select log file", log_files, index=0)
+        path = os.path.join(LOG_DIR, latest)
+        with open(path, "r", encoding="utf-8") as f:
+            content = f.read()
+        st.text_area("Log Output", content, height=400)
+        st.download_button("⬇️ Download Log", content, file_name=latest)