# Multi_X_Y / app.py
# sagkho's picture
# Upload 11 files
# 8bb1280 verified
import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import plotly.express as px
import io
import os
import sqlite3
import datetime
# ---- SQLite Database Helper Functions ----
# SQLite file used by every dataset helper below. The path is relative, so it
# is resolved against the working directory of the running process.
DB_PATH = "dashboard.db"
def init_db():
    """Create the ``datasets`` table in the SQLite file at ``DB_PATH``.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this more
    than once is harmless. The connection is closed even when the statement
    or the commit raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS datasets (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE,
            upload_time TEXT,
            num_rows INTEGER,
            num_cols INTEGER,
            data BLOB
        )''')
        conn.commit()
    finally:
        conn.close()
def save_dataset_to_db(name, df):
    """Store ``df`` under ``name`` in the ``datasets`` table.

    The frame is serialised with ``df.to_parquet()`` and written together
    with the current local timestamp and the frame's row/column counts.
    ``INSERT OR REPLACE`` combined with the UNIQUE ``name`` column means an
    existing dataset with the same name is overwritten.

    Args:
        name: Dataset name used as the unique key.
        df: Object providing ``to_parquet()``, ``__len__`` and ``columns``
            (a pandas DataFrame in this app).
    """
    # Serialise first so a failing to_parquet() never leaves a connection open.
    blob = df.to_parquet()
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            '''INSERT OR REPLACE INTO datasets (name, upload_time, num_rows, num_cols, data)
            VALUES (?, ?, ?, ?, ?)''',
            (name, now, len(df), len(df.columns), blob),
        )
        conn.commit()
    finally:
        conn.close()
def list_datasets_from_db():
    """Return the stored datasets' metadata, most recently uploaded first.

    Returns:
        A list of ``(name, upload_time, num_rows, num_cols)`` tuples ordered
        by ``upload_time`` descending. The parquet blob itself is not read.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # Close the connection even if the query fails (e.g. missing table).
        return conn.execute(
            'SELECT name, upload_time, num_rows, num_cols FROM datasets ORDER BY upload_time DESC'
        ).fetchall()
    finally:
        conn.close()
def load_dataset_from_db(name):
    """Load the dataset stored under ``name``.

    Args:
        name: Dataset name to look up in the ``datasets`` table.

    Returns:
        The DataFrame decoded from the stored parquet blob, or ``None`` when
        no row with that name exists.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        row = conn.execute('SELECT data FROM datasets WHERE name = ?', (name,)).fetchone()
    finally:
        # Release the connection before the (potentially slow) parquet decode.
        conn.close()
    if row is None:
        return None
    return pd.read_parquet(io.BytesIO(row[0]))
def delete_dataset_from_db(name):
    """Delete the dataset stored under ``name``; a missing name is a no-op.

    Args:
        name: Dataset name to remove from the ``datasets`` table.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('DELETE FROM datasets WHERE name = ?', (name,))
        conn.commit()
    finally:
        conn.close()
# Runs at import time, on every execution of this script, so the datasets
# table exists before any page code below queries it.
init_db()
# ---- Model Persistence Helpers ----
import pickle
# Each saved model gets its own sub-directory under MODEL_DIR (relative to the
# working directory); the directory is created up front so saves cannot fail
# on a missing parent.
MODEL_DIR = "saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)
def save_model_to_disk(model, scaler_x, scaler_y, x_cols, y_cols, model_name):
    """Persist a trained model and everything needed to reuse it.

    Writes five files into ``MODEL_DIR/<model_name>/``:

    * ``model.pth``    - state dict plus input/latent/output dimensions
    * ``scaler_x.pkl`` - pickled X scaler
    * ``scaler_y.pkl`` - pickled Y scaler
    * ``columns.pkl``  - ``{'x_cols': ..., 'y_cols': ...}``
    * ``metadata.pkl`` - name, save timestamp, dimensions and column lists

    The latent and output sizes are read from the last layer of
    ``model.encoder`` and ``model.predictor`` respectively.
    """
    target_dir = os.path.join(MODEL_DIR, model_name)
    os.makedirs(target_dir, exist_ok=True)

    def _pickle_to(filename, payload):
        # Small local helper: pickle ``payload`` into the model directory.
        with open(os.path.join(target_dir, filename), 'wb') as handle:
            pickle.dump(payload, handle)

    # Weights and the dimensions required to rebuild the network.
    checkpoint = {
        'state_dict': model.state_dict(),
        'input_dim': len(x_cols),
        'latent_dim': model.encoder[-1].out_features,
        'output_dim': model.predictor[-1].out_features,
    }
    torch.save(checkpoint, os.path.join(target_dir, 'model.pth'))

    # Scalers and column configuration.
    _pickle_to('scaler_x.pkl', scaler_x)
    _pickle_to('scaler_y.pkl', scaler_y)
    _pickle_to('columns.pkl', {'x_cols': x_cols, 'y_cols': y_cols})

    # Summary record used when listing saved models.
    _pickle_to('metadata.pkl', {
        'name': model_name,
        'saved_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'input_dim': len(x_cols),
        'output_dim': len(y_cols),
        'x_cols': x_cols,
        'y_cols': y_cols,
    })
def list_saved_models():
    """Return the metadata dict of every readable model under ``MODEL_DIR``.

    Entries are visited in sorted name order so the listing is deterministic
    (``os.listdir`` order is arbitrary). Entries without a ``metadata.pkl``,
    or whose metadata cannot be read or unpickled (for example a save that
    was interrupted), are skipped so one bad directory cannot break the
    pages that call this.

    Returns:
        A list of the dicts written by ``save_model_to_disk``; empty when
        ``MODEL_DIR`` does not exist.
    """
    models = []
    if not os.path.exists(MODEL_DIR):
        return models
    for name in sorted(os.listdir(MODEL_DIR)):
        meta_path = os.path.join(MODEL_DIR, name, 'metadata.pkl')
        try:
            # NOTE: pickle executes code from the file; only metadata written
            # by this app should ever be placed under MODEL_DIR.
            with open(meta_path, 'rb') as f:
                meta = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError):
            # Missing file, stray non-directory entry, or truncated/corrupt pickle.
            continue
        models.append(meta)
    return models
def load_model_from_disk(model_name):
    """Load a saved model, its scalers and its column lists.

    Reads the files written by ``save_model_to_disk`` from
    ``MODEL_DIR/<model_name>/``, rebuilds an ``IndustrialDAE`` with the stored
    dimensions, restores its weights and switches it to eval mode.

    Args:
        model_name: Name of the sub-directory under ``MODEL_DIR``.

    Returns:
        ``(model, scaler_x, scaler_y, x_cols, y_cols)``.

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    load_path = os.path.join(MODEL_DIR, model_name)
    # The checkpoint holds only tensors and ints, so the restricted
    # weights-only unpickler is sufficient and avoids executing arbitrary
    # pickled code. map_location keeps loading working on CPU-only hosts.
    checkpoint = torch.load(
        os.path.join(load_path, 'model.pth'),
        map_location='cpu',
        weights_only=True,
    )
    model = IndustrialDAE(
        input_dim=checkpoint['input_dim'],
        latent_dim=checkpoint['latent_dim'],
        output_dim=checkpoint['output_dim']
    )
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # NOTE(security): the scalers and column config are plain pickles, which
    # execute code on load. Only load model directories produced by this app.
    with open(os.path.join(load_path, 'scaler_x.pkl'), 'rb') as f:
        scaler_x = pickle.load(f)
    with open(os.path.join(load_path, 'scaler_y.pkl'), 'rb') as f:
        scaler_y = pickle.load(f)
    with open(os.path.join(load_path, 'columns.pkl'), 'rb') as f:
        cols = pickle.load(f)
    return model, scaler_x, scaler_y, cols['x_cols'], cols['y_cols']
# Page-level configuration: browser tab title, wide layout, sidebar open.
st.set_page_config(page_title="Multi X-Y | Industrial DAE", layout="wide", initial_sidebar_state="expanded")
# ---- Premium CSS Styling ----
# Injects a global stylesheet (dark gradient background, button, table,
# metric, sidebar and card styling). unsafe_allow_html is required for the
# <style> tag to be emitted as markup rather than shown as text.
# NOTE(review): the `.st-emotion-cache-16idsys` selector looks like a
# generated class name and may stop matching after a Streamlit upgrade - confirm.
st.markdown("""
<style>
/* Modern Industrial Theme */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Outfit:wght@400;600;800&display=swap');
:root {
--primary: #4da6ff;
--secondary: #2b6cb0;
--bg-dark: #0f172a;
--card-bg: rgba(30, 41, 59, 0.7);
--accent: #10b981;
}
.main {
background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
color: #f8fafc;
font-family: 'Inter', sans-serif;
}
h1, h2, h3, h4 {
font-family: 'Outfit', sans-serif !important;
font-weight: 800 !important;
letter-spacing: -0.02em;
}
.stButton>button {
background: linear-gradient(90deg, #3b82f6 0%, #2563eb 100%) !important;
color: white !important;
border: none !important;
padding: 0.6rem 1.5rem !important;
border-radius: 12px !important;
font-weight: 600 !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06) !important;
width: 100% !important;
}
.stButton>button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05) !important;
background: linear-gradient(90deg, #2563eb 0%, #1d4ed8 100%) !important;
}
.stDataFrame, .stTable {
background-color: var(--card-bg) !important;
border-radius: 15px !important;
border: 1px solid rgba(255, 255, 255, 0.1) !important;
padding: 10px !important;
}
[data-testid="stMetricValue"] {
font-size: 1.8rem !important;
font-weight: 700 !important;
color: var(--primary) !important;
}
/* Sidebar Styling */
[data-testid="stSidebar"] {
background-color: #0f172a !important;
border-right: 1px solid rgba(255, 255, 255, 0.05) !important;
}
.st-emotion-cache-16idsys p {
color: #94a3b8 !important;
}
/* Custom Cards */
.status-card {
background: var(--card-bg);
padding: 1.5rem;
border-radius: 16px;
border: 1px solid rgba(255, 255, 255, 0.05);
margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)
# Session State Initialization
# Seed each key with an empty default so the pages below can read it without
# checking for existence; keys that are already present are left untouched.
_SESSION_DEFAULTS = {
    'df': None,
    'data_history': {},
    'x_cols': [],
    'y_cols': [],
    'X_train': None,
    'X_test': None,
    'y_train': None,
    'y_test': None,
    'scaler_x': None,
    'scaler_y': None,
    'model_trained': False,
    'history': [],
    'sim_history': [],
    'loaded_sim': None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Helper Classes for PyTorch
class IndustrialDAE(nn.Module):
    """Autoencoder with a shared encoder and two heads.

    The encoder compresses the input to ``latent_dim`` values. The decoder
    maps that latent vector back to ``input_dim`` values and the predictor
    maps it to ``output_dim`` values.

    Layer positions inside each ``nn.Sequential`` are part of the saved
    state-dict key names, so they must not be reordered.
    """

    def __init__(self, input_dim=41, latent_dim=15, output_dim=5, dropout_rate=0.2):
        super().__init__()

        # Encoder: input -> 128 (batch norm, ReLU, dropout) -> 64 -> latent.
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: latent -> 64 -> 128 -> input (reconstruction head).
        decoder_layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

        # Predictor: latent -> 32 -> 16 -> output (target head).
        predictor_layers = [
            nn.Linear(latent_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim),
        ]
        self.predictor = nn.Sequential(*predictor_layers)

    def forward(self, x):
        """Return ``(reconstructed_x, predicted_y)`` for the batch ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent), self.predictor(latent)
# Sidebar Navigation
# Page names and their icons are parallel lists: entry i of one belongs to
# entry i of the other. `selected` holds the chosen page name and drives the
# if/elif page dispatch that follows.
_NAV_PAGES = ["Overview", "Upload Data", "Preprocess", "Train Model", "Predict", "What-If", "History", "Comparison"]
_NAV_ICONS = ["graph-up", "upload", "gear", "diagram-3", "graph-up-arrow", "magic", "clock-history", "bar-chart"]
_NAV_STYLES = {
    "container": {"padding": "0!important", "background-color": "transparent"},
    "icon": {"color": "white", "font-size": "18px"},
    "nav-link": {"font-size": "16px", "text-align": "left", "margin":"0px", "--hover-color": "#2d3748"},
    "nav-link-selected": {"background-color": "#2b6cb0"},
}
with st.sidebar:
    st.markdown("<h2 style='text-align: left; margin-bottom: 0px;'>Multi X-Y</h2>", unsafe_allow_html=True)
    st.markdown("<h4 style='text-align: left; color: #4da6ff; margin-top: 0px;'>ML Dashboard</h4>", unsafe_allow_html=True)
    st.markdown("---")
    selected = option_menu(
        menu_title=None,
        options=_NAV_PAGES,
        icons=_NAV_ICONS,
        menu_icon="cast",
        default_index=0,
        styles=_NAV_STYLES,
    )
if selected == "Overview":
    # Landing page: pipeline status, current model metrics, stored datasets
    # and saved models, plus a short workflow guide.
    st.title("🏭 Industrial DAE — Multi X-Y Dashboard")
    st.caption("End-to-end Denoising Autoencoder for Sensor Reconstruction & KPI Prediction")

    # --- System Status Cards ---
    st.markdown("### 🔄 System Status")
    s1, s2, s3, s4 = st.columns(4)
    with s1:
        if st.session_state.df is not None:
            st.success(f"✅ Data Loaded\n\n**{st.session_state.df.shape[0]}** rows × **{st.session_state.df.shape[1]}** cols")
        else:
            st.warning("⚠️ No Data\n\nUpload data to begin")
    with s2:
        if len(st.session_state.x_cols) > 0:
            st.success(f"✅ Preprocessed\n\n**{len(st.session_state.x_cols)}** X | **{len(st.session_state.y_cols)}** Y")
        else:
            st.warning("⚠️ Not Preprocessed")
    with s3:
        if st.session_state.model_trained:
            st.success("✅ Model Trained\n\nReady for Prediction")
        else:
            st.warning("⚠️ No Model\n\nTrain a model first")
    with s4:
        saved_models_list = list_saved_models()
        st.info(f"💾 Saved Models\n\n**{len(saved_models_list)}** model(s) on disk")
    st.markdown("---")

    # --- Current Model Performance ---
    # A model loaded from disk, or data re-preprocessed after training, can
    # leave the model and the stored test split out of step. Evaluate only
    # when the shapes and column names line up, instead of crashing.
    model = st.session_state.get("model")
    X_test = st.session_state.X_test
    y_test = st.session_state.get("y_test_raw")
    y_names = st.session_state.y_cols
    can_evaluate = (
        st.session_state.model_trained
        and model is not None
        and X_test is not None
        and y_test is not None
        and X_test.shape[1] == model.encoder[0].in_features
        and len(y_names) == model.predictor[-1].out_features
        and all(name in y_test.columns for name in y_names)
    )
    if can_evaluate:
        st.markdown("### 📊 Current Model Performance")
        scaler_y = st.session_state.scaler_y
        X_test_t = torch.tensor(X_test, dtype=torch.float32)
        model.eval()
        with torch.no_grad():
            _, preds_test_scaled = model(X_test_t)
        # Metrics are reported in original units, so undo the Y scaling.
        preds_test = scaler_y.inverse_transform(preds_test_scaled.numpy())

        # Per-feature KPI cards
        kpi_cols = st.columns(len(y_names))
        r2_vals = []
        for i, col in enumerate(y_names):
            r2_val = r2_score(y_test[col], preds_test[:, i])
            mae_val = mean_absolute_error(y_test[col], preds_test[:, i])
            r2_vals.append(r2_val)
            with kpi_cols[i]:
                if r2_val >= 0.90:
                    emoji = "🟢"
                elif r2_val >= 0.75:
                    emoji = "🟡"
                else:
                    emoji = "🔴"
                st.metric(label=f"{emoji} {col}", value=f"R² = {r2_val:.4f}", delta=f"MAE = {mae_val:.4f}")

        avg_r2 = np.mean(r2_vals)
        if avg_r2 >= 0.90:
            grade = "Excellent 🟢"
        elif avg_r2 >= 0.75:
            grade = "Good 🟡"
        else:
            grade = "Needs Improvement 🔴"
        st.markdown(f"**Overall Average R²:** `{avg_r2:.4f}` — **{grade}**")
        st.markdown("---")

        # Feature lists
        col_info1, col_info2 = st.columns(2)
        with col_info1:
            st.markdown("**Input Features (X)**")
            for c in st.session_state.x_cols:
                st.markdown(f"- `{c}`")
        with col_info2:
            st.markdown("**Target Features (Y)**")
            for c in y_names:
                st.markdown(f"- `{c}`")
    elif st.session_state.model_trained:
        st.info("A model is available, but the preprocessed test data is missing or does not match it. "
                "Run Preprocess with the model's X/Y features to see performance metrics here.")
    else:
        st.info("Upload data, preprocess, and train a model to see performance metrics here.")
    st.markdown("---")

    # --- Database & Saved Models Summary ---
    db_col1, db_col2 = st.columns(2)
    with db_col1:
        st.markdown("### 📦 Datasets in Database")
        db_ds = list_datasets_from_db()
        if db_ds:
            inv_df = pd.DataFrame(db_ds, columns=['Name', 'Uploaded', 'Rows', 'Cols'])
            st.dataframe(inv_df, width='stretch')
        else:
            st.caption("No datasets stored yet.")
    with db_col2:
        st.markdown("### 💾 Saved Models")
        if saved_models_list:
            model_df = pd.DataFrame(saved_models_list)[['name', 'saved_at', 'input_dim', 'output_dim']]
            model_df.columns = ['Name', 'Saved At', 'X Features', 'Y Targets']
            st.dataframe(model_df, width='stretch')
        else:
            st.caption("No models saved yet.")
    st.markdown("---")

    # --- Workflow Guide ---
    # The table rows stay at column 0: indented lines would render as a
    # Markdown code block instead of a table.
    st.markdown("### 🗺️ Workflow Guide")
    st.markdown("""
| Step | Tab | Action |
|------|-----|--------|
| 1 | **Upload Data** | Upload Excel dataset or load from database |
| 2 | **Preprocess** | Select X/Y features, impute missing data, handle outliers |
| 3 | **Train Model** | Configure hyperparameters, train DAE, or load a saved model |
| 4 | **Predict** | Evaluate on test data — metrics, scatter plots, residual analysis |
| 5 | **What-If** | Sensitivity analysis with step changes & trend detection |
| 6 | **History** | Review all training runs |
| 7 | **Comparison** | Compare metrics across different model runs |
""")
elif selected == "Upload Data":
st.title("Upload Data")
col1, col2 = st.columns(2)
with col1:
st.subheader("Upload New File")
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx", "xls"])
if uploaded_file is not None:
@st.cache_data
def load_data_from_bytes(file_bytes):
return pd.read_excel(file_bytes)
df = load_data_from_bytes(uploaded_file)
save_dataset_to_db(uploaded_file.name, df)
st.session_state.df = df
st.session_state.data_history[uploaded_file.name] = df
st.success(f"βœ… Data saved to database as **{uploaded_file.name}**!")
with col2:
st.subheader("Load from Database")
db_datasets = list_datasets_from_db()
if len(db_datasets) > 0:
dataset_names = [r[0] for r in db_datasets]
history_file = st.selectbox("Select previously uploaded data", dataset_names)
if st.button("Load Selected Data"):
loaded_df = load_dataset_from_db(history_file)
if loaded_df is not None:
st.session_state.df = loaded_df
st.session_state.data_history[history_file] = loaded_df
st.success(f"Data switched to **{history_file}** successfully!")
else:
st.error("Failed to load dataset from database.")
else:
st.info("No datasets in database yet. Upload a file to get started.")
st.markdown("---")
# Show database inventory
db_datasets = list_datasets_from_db()
if len(db_datasets) > 0:
st.subheader("πŸ“¦ Database Inventory")
inv_df = pd.DataFrame(db_datasets, columns=['Dataset Name', 'Uploaded On', 'Rows', 'Columns'])
st.dataframe(inv_df, width='stretch')
del_name = st.selectbox("Select dataset to delete", [r[0] for r in db_datasets], key="del_ds")
if st.button("πŸ—‘οΈ Delete Selected Dataset"):
delete_dataset_from_db(del_name)
if del_name in st.session_state.data_history:
del st.session_state.data_history[del_name]
st.success(f"Deleted **{del_name}** from database.")
st.rerun()
st.markdown("---")
if st.session_state.df is not None:
st.subheader("Current Data Overview")
st.dataframe(st.session_state.df.head())
st.write(f"**Shape:** {st.session_state.df.shape}")
elif selected == "Preprocess":
st.title("Preprocess Data")
db_datasets = list_datasets_from_db()
if len(db_datasets) > 0:
col1, col2 = st.columns([3, 1])
with col1:
history_file_prep = st.selectbox("Select Active Dataset", [r[0] for r in db_datasets], key="prep_dataset")
with col2:
st.write("")
st.write("")
if st.button("Load Dataset", key="load_prep"):
loaded_df = load_dataset_from_db(history_file_prep)
if loaded_df is not None:
st.session_state.df = loaded_df
st.session_state.data_history[history_file_prep] = loaded_df
st.success(f"Dataset switched to {history_file_prep}")
st.rerun()
st.markdown("---")
if st.session_state.df is None:
st.warning("Please upload data first in the 'Upload Data' tab.")
else:
df = st.session_state.df
# Force conversion of object columns to numeric (coercing messy strings to NaN)
# This fixes issues where sensor data is accidentally parsed as text
for col in df.columns:
if df[col].dtype == 'object':
df[col] = pd.to_numeric(df[col], errors='coerce')
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
st.subheader("Variable Selection")
col_x, col_y = st.columns(2)
with col_x:
st.markdown("**Select Input Features (X)**")
# Select All / Deselect All for X
select_all_x = st.checkbox("Select All X", value=len(st.session_state.x_cols) == len(numeric_cols), key="sel_all_x")
x_cols = []
for col in numeric_cols:
default_checked = (col in st.session_state.x_cols) if not select_all_x else True
if st.checkbox(col, value=default_checked, key=f"x_{col}"):
x_cols.append(col)
with col_y:
st.markdown("**Select Target Variables (Y)**")
y_options = [c for c in numeric_cols if c not in x_cols]
select_all_y = st.checkbox("Select All Y", value=len(st.session_state.y_cols) == len(y_options) and len(y_options) > 0, key="sel_all_y")
y_cols = []
for col in y_options:
default_checked = (col in st.session_state.y_cols) if not select_all_y else True
if st.checkbox(col, value=default_checked, key=f"y_{col}"):
y_cols.append(col)
st.subheader("Missing Data Imputation & Outliers")
col_f1, col_f2 = st.columns(2)
with col_f1:
imputation_method = st.selectbox("Missing Value Imputation Method", ["Mean", "Median", "Zero"])
with col_f2:
outlier_method = st.radio("Select Outlier Treatment Method", ["None", "IQR Capping", "Min-Max Percentile Capping (1% - 99%)"])
# --- Custom Min-Max Filter ---
st.subheader("πŸ”§ Custom Min-Max Filter (Per Tag)")
st.caption("Select specific features and set custom min/max bounds. Data outside these limits will be clipped.")
all_selected = x_cols + y_cols
custom_filter_tags = st.multiselect("Select Tags to Apply Custom Min-Max Filter", all_selected, default=[], key="custom_filter_tags")
custom_filters = {}
if len(custom_filter_tags) > 0:
filter_cols = st.columns(3)
for idx, tag in enumerate(custom_filter_tags):
tag_min = float(df[tag].min())
tag_max = float(df[tag].max())
with filter_cols[idx % 3]:
st.markdown(f"**{tag}**")
st.caption(f"Data Range: {tag_min:.4f} β€” {tag_max:.4f}")
c1, c2 = st.columns(2)
with c1:
user_min = st.number_input(f"Min", value=tag_min, format="%.4f", key=f"fmin_{tag}")
with c2:
user_max = st.number_input(f"Max", value=tag_max, format="%.4f", key=f"fmax_{tag}")
custom_filters[tag] = {"min": user_min, "max": user_max}
if st.button("Apply Preprocessing"):
if len(x_cols) == 0 or len(y_cols) == 0:
st.error("Please select at least one X and one Y variable.")
else:
st.session_state.x_cols = x_cols
st.session_state.y_cols = y_cols
data_x = df[x_cols].copy()
data_y = df[y_cols].copy()
# Show Feature Statistics
st.markdown("### Feature-wise Statistics (Before Imputation)")
stats_df = pd.DataFrame({
'Missing Count': data_x.isnull().sum(),
'Missing %': (data_x.isnull().sum() / len(data_x) * 100).round(2),
'Min': data_x.min(),
'Mean': data_x.mean(),
'Max': data_x.max()
})
st.dataframe(stats_df)
# Impute NaNs
if imputation_method == "Mean":
data_x = data_x.fillna(data_x.mean())
data_y = data_y.fillna(data_y.mean())
elif imputation_method == "Median":
data_x = data_x.fillna(data_x.median())
data_y = data_y.fillna(data_y.median())
elif imputation_method == "Zero":
data_x = data_x.fillna(0)
data_y = data_y.fillna(0)
# 4. Outlier Handling
if outlier_method == "IQR Capping":
for col in data_x.columns:
Q1 = data_x[col].quantile(0.25)
Q3 = data_x[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_x[col] = np.clip(data_x[col], lower_bound, upper_bound)
for col in data_y.columns:
Q1 = data_y[col].quantile(0.25)
Q3 = data_y[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_y[col] = np.clip(data_y[col], lower_bound, upper_bound)
elif outlier_method == "Min-Max Percentile Capping (1% - 99%)":
for col in data_x.columns:
lower_bound = data_x[col].quantile(0.01)
upper_bound = data_x[col].quantile(0.99)
data_x[col] = np.clip(data_x[col], lower_bound, upper_bound)
for col in data_y.columns:
lower_bound = data_y[col].quantile(0.01)
upper_bound = data_y[col].quantile(0.99)
data_y[col] = np.clip(data_y[col], lower_bound, upper_bound)
# 5. Apply Custom Min-Max Filters
for tag, bounds in custom_filters.items():
if tag in data_x.columns:
data_x[tag] = np.clip(data_x[tag], bounds['min'], bounds['max'])
if tag in data_y.columns:
data_y[tag] = np.clip(data_y[tag], bounds['min'], bounds['max'])
st.markdown("### Feature-wise Statistics (After Preprocessing)")
stats_after_df = pd.DataFrame({
'Missing Count': data_x.isnull().sum(),
'Missing %': (data_x.isnull().sum() / len(data_x) * 100).round(2),
'Min': data_x.min(),
'Mean': data_x.mean(),
'Max': data_x.max()
})
st.dataframe(stats_after_df)
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=42)
scaler_x = StandardScaler()
scaler_y = StandardScaler()
st.session_state.X_train = scaler_x.fit_transform(X_train)
st.session_state.X_test = scaler_x.transform(X_test)
st.session_state.y_train = scaler_y.fit_transform(y_train)
st.session_state.y_test = scaler_y.transform(y_test)
st.session_state.scaler_x = scaler_x
st.session_state.scaler_y = scaler_y
st.session_state.y_test_raw = y_test
st.success(f"Preprocessing complete! Applied {outlier_method}. Train/Test split created and features scaled.")
elif selected == "Train Model":
st.title("Train Model (Industrial DAE)")
# --- Load a previously saved model ---
saved_models = list_saved_models()
if len(saved_models) > 0:
with st.expander("πŸ“‚ Load a Previously Saved Model", expanded=False):
model_meta_df = pd.DataFrame(saved_models)[['name', 'saved_at', 'input_dim', 'output_dim']]
model_meta_df.columns = ['Model Name', 'Saved At', 'Input Features', 'Output Targets']
st.dataframe(model_meta_df, width='stretch')
sel_model_name = st.selectbox("Select Model to Load", [m['name'] for m in saved_models])
if st.button("Load Selected Model"):
loaded_model, loaded_sx, loaded_sy, loaded_x, loaded_y = load_model_from_disk(sel_model_name)
st.session_state.model = loaded_model
st.session_state.scaler_x = loaded_sx
st.session_state.scaler_y = loaded_sy
st.session_state.x_cols = loaded_x
st.session_state.y_cols = loaded_y
st.session_state.model_trained = True
st.success(f"βœ… Model **{sel_model_name}** loaded! You can now use Predict and What-If tabs.")
st.rerun()
st.markdown("---")
if st.session_state.X_train is None:
st.warning("Please preprocess data first in the 'Preprocess' tab.")
else:
st.subheader("Hyperparameters")
col1, col2 = st.columns(2)
with col1:
masking_ratio = st.slider("Masking Ratio (Corruption)", 0.0, 0.5, 0.10)
epochs = st.number_input("Epochs", 10, 1000, 150)
lr = st.number_input("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
auto_train = st.checkbox("Auto-Train (Until R2 > 0.85 & MAE lower)", value=False)
with col2:
latent_dim = st.slider("Latent Dimension", 2, max(2, len(st.session_state.x_cols)), 15)
dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
weight_to_pred = st.number_input("Weight to Predictor Loss", 0.1, 10.0, 5.0)
batch_size = st.selectbox("Batch Size", [16, 32, 64, 128, 256], index=3)
if st.button("Train"):
X_train_t = torch.tensor(st.session_state.X_train, dtype=torch.float32)
y_train_t = torch.tensor(st.session_state.y_train, dtype=torch.float32)
X_test_t = torch.tensor(st.session_state.X_test, dtype=torch.float32)
y_test_t = torch.tensor(st.session_state.y_test, dtype=torch.float32)
train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
input_dim = X_train_t.shape[1]
output_dim = y_train_t.shape[1]
model = IndustrialDAE(input_dim=input_dim, latent_dim=latent_dim, output_dim=output_dim, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_recon = nn.MSELoss()
criterion_pred = nn.HuberLoss()
progress_bar = st.progress(0)
status_text = st.empty()
epoch_recon_losses = []
epoch_pred_losses = []
val_recon_losses = []
val_pred_losses = []
y_test_raw = st.session_state.y_test_raw
scaler_y = st.session_state.scaler_y
max_train_epochs = 2000 if auto_train else epochs
best_r2 = -float('inf')
best_mae = float('inf')
for epoch in range(max_train_epochs):
model.train()
batch_recon_loss = 0
batch_pred_loss = 0
for batch_x, batch_y in train_loader:
clean_x = batch_x
# Apply Masking
random_probabilities = torch.rand(clean_x.shape)
mask = random_probabilities < masking_ratio
noised_x = clean_x.clone()
noised_x[mask] = 0.0
recon_x, pred_y = model(noised_x)
loss_recon = criterion_recon(recon_x, clean_x)
loss_pred = criterion_pred(pred_y, batch_y)
total_loss = loss_recon + (weight_to_pred * loss_pred)
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
batch_recon_loss += loss_recon.item()
batch_pred_loss += loss_pred.item()
epoch_recon_losses.append(batch_recon_loss / len(train_loader))
epoch_pred_losses.append(batch_pred_loss / len(train_loader))
# Validation Pass
model.eval()
with torch.no_grad():
# Evaluate on clean test set without masking (standard practice for evaluation)
val_recon, val_pred = model(X_test_t)
v_loss_recon = criterion_recon(val_recon, X_test_t)
v_loss_pred = criterion_pred(val_pred, y_test_t)
val_recon_losses.append(v_loss_recon.item())
val_pred_losses.append(v_loss_pred.item())
if auto_train:
if (epoch + 1) % 10 == 0:
with torch.no_grad():
preds_test_scaled = val_pred
preds_test = scaler_y.inverse_transform(preds_test_scaled.numpy())
r2_vals = [r2_score(y_test_raw[col], preds_test[:, i]) for i, col in enumerate(st.session_state.y_cols)]
mae_vals = [mean_absolute_error(y_test_raw[col], preds_test[:, i]) for i, col in enumerate(st.session_state.y_cols)]
avg_r2 = np.mean(r2_vals)
avg_mae = np.mean(mae_vals)
status_text.text(f"Auto-Training... Epoch {epoch+1} | Avg R2: {avg_r2:.4f} | Avg MAE: {avg_mae:.4f}")
if avg_r2 > 0.85 and avg_mae <= best_mae:
status_text.text(f"Reached Target! Stopped at Epoch {epoch+1} with Avg R2 = {avg_r2:.4f}, Avg MAE = {avg_mae:.4f}")
break
if avg_r2 > best_r2: best_r2 = avg_r2
if avg_mae < best_mae: best_mae = avg_mae
else:
progress_bar.progress((epoch + 1) / epochs)
if not auto_train:
status_text.text("Training Complete!")
st.session_state.model = model
st.session_state.model_trained = True
# Final Evaluation
model.eval()
with torch.no_grad():
_, val_pred = model(X_test_t)
preds_test = scaler_y.inverse_transform(val_pred.numpy())
metrics_df = pd.DataFrame(index=st.session_state.y_cols, columns=['RMSE', 'MAE', 'R2 Score'])
for i, col in enumerate(st.session_state.y_cols):
mse = mean_squared_error(y_test_raw[col], preds_test[:, i])
metrics_df.loc[col, 'RMSE'] = np.sqrt(mse)
metrics_df.loc[col, 'MAE'] = mean_absolute_error(y_test_raw[col], preds_test[:, i])
metrics_df.loc[col, 'R2 Score'] = r2_score(y_test_raw[col], preds_test[:, i])
avg_rmse = metrics_df['RMSE'].mean()
run_id = len(st.session_state.history) + 1
st.session_state.history.append({
"Run ID": run_id,
"Masking": masking_ratio,
"Latent Dim": latent_dim,
"Epochs": len(epoch_pred_losses),
"Avg Test RMSE": avg_rmse,
"Model": model
})
# Auto-save model to disk
model_name = f"DAE_Run{run_id}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
save_model_to_disk(model, st.session_state.scaler_x, st.session_state.scaler_y,
st.session_state.x_cols, st.session_state.y_cols, model_name)
st.success(f"βœ… Model trained, saved as **{model_name}**, and added to History! (Epochs: {len(epoch_pred_losses)})")
st.subheader("Training Post-Evaluation Metrics")
st.dataframe(metrics_df)
col1, col2 = st.columns(2)
with col1:
st.subheader("DAE Reconstruction Loss (MSE)")
fig, ax = plt.subplots()
ax.plot(epoch_recon_losses, color='blue', label='Train Loss')
ax.plot(val_recon_losses, color='cyan', label='Validation Loss')
ax.legend()
st.pyplot(fig)
with col2:
st.subheader("Predictor Loss (Huber)")
fig, ax = plt.subplots()
ax.plot(epoch_pred_losses, color='orange', label='Train Loss')
ax.plot(val_pred_losses, color='red', label='Validation Loss')
ax.legend()
st.pyplot(fig)
elif selected == "Predict":
st.title("Predict & Evaluate")
if not st.session_state.model_trained:
st.warning("Please train the model first in the 'Train Model' tab.")
else:
model = st.session_state.model
scaler_y = st.session_state.scaler_y
X_test_t = torch.tensor(st.session_state.X_test, dtype=torch.float32)
y_test = st.session_state.y_test_raw
model.eval()
with torch.no_grad():
_, val_pred = model(X_test_t)
preds_test = scaler_y.inverse_transform(val_pred.numpy())
st.subheader("Test Set Metrics")
metrics_df = pd.DataFrame(index=st.session_state.y_cols, columns=['RMSE', 'MAE', 'R2 Score', 'MAPE (%)'])
for i, col in enumerate(st.session_state.y_cols):
actual = y_test[col].values
predicted = preds_test[:, i]
mse = mean_squared_error(actual, predicted)
metrics_df.loc[col, 'RMSE'] = np.sqrt(mse)
metrics_df.loc[col, 'MAE'] = mean_absolute_error(actual, predicted)
metrics_df.loc[col, 'R2 Score'] = r2_score(actual, predicted)
# MAPE - handle zeros
nonzero_mask = actual != 0
if nonzero_mask.sum() > 0:
metrics_df.loc[col, 'MAPE (%)'] = np.mean(np.abs((actual[nonzero_mask] - predicted[nonzero_mask]) / actual[nonzero_mask])) * 100
else:
metrics_df.loc[col, 'MAPE (%)'] = 0.0
st.dataframe(metrics_df, width='stretch')
# --- KPI Summary Cards ---
st.subheader("πŸ“Š Model Performance Summary")
kpi_cols = st.columns(len(st.session_state.y_cols))
for i, col in enumerate(st.session_state.y_cols):
r2_val = float(metrics_df.loc[col, 'R2 Score'])
mae_val = float(metrics_df.loc[col, 'MAE'])
with kpi_cols[i]:
if r2_val >= 0.90:
emoji = "🟒"
elif r2_val >= 0.75:
emoji = "🟑"
else:
emoji = "πŸ”΄"
st.metric(label=f"{emoji} {col}", value=f"RΒ² = {r2_val:.4f}", delta=f"MAE = {mae_val:.4f}")
# --- Actual vs Predicted Line Charts ---
st.subheader("πŸ“ˆ Actual vs Predicted (All Y Features)")
pts = min(100, len(y_test))
for i, col in enumerate(st.session_state.y_cols):
r2_val = float(metrics_df.loc[col, 'R2 Score'])
chart_df = pd.DataFrame({
'Sample Index': range(pts),
'Actual': y_test[col].values[:pts],
'Predicted': preds_test[:pts, i]
})
chart_df_melted = chart_df.melt(id_vars=['Sample Index'], value_vars=['Actual', 'Predicted'], var_name='Type', value_name='Value')
fig = px.line(chart_df_melted, x='Sample Index', y='Value', color='Type',
title=f"{col} | RΒ² = {r2_val:.4f}")
fig.update_layout(yaxis=dict(autorange=True))
st.plotly_chart(fig, width='stretch')
# --- Scatter Plot: Actual vs Predicted with 45Β° line ---
st.subheader("🎯 Scatter Plot: Actual vs Predicted")
scatter_cols = st.columns(min(len(st.session_state.y_cols), 3))
for i, col in enumerate(st.session_state.y_cols):
actual = y_test[col].values
predicted = preds_test[:, i]
r2_val = float(metrics_df.loc[col, 'R2 Score'])
with scatter_cols[i % 3]:
fig = px.scatter(x=actual, y=predicted, labels={'x': 'Actual', 'y': 'Predicted'},
title=f"{col} | RΒ² = {r2_val:.4f}", opacity=0.5)
# Add 45-degree ideal line
min_val = min(actual.min(), predicted.min())
max_val = max(actual.max(), predicted.max())
fig.add_shape(type="line", x0=min_val, y0=min_val, x1=max_val, y1=max_val,
line=dict(color="red", dash="dash", width=2))
fig.update_layout(yaxis=dict(autorange=True), height=400)
st.plotly_chart(fig, width='stretch')
# --- Residual Analysis ---
st.subheader("πŸ“‰ Residual Analysis (Error Distribution)")
residual_cols = st.columns(min(len(st.session_state.y_cols), 3))
for i, col in enumerate(st.session_state.y_cols):
actual = y_test[col].values
predicted = preds_test[:, i]
residuals = actual - predicted
with residual_cols[i % 3]:
fig = px.histogram(residuals, nbins=30, title=f"Residuals: {col}",
labels={'value': 'Error (Actual - Predicted)', 'count': 'Frequency'})
fig.update_layout(showlegend=False, height=350)
st.plotly_chart(fig, width='stretch')
# --- Download Predictions ---
# Export the test targets next to their predictions and signed errors
# (actual - predicted) as a single CSV.
st.subheader("📥 Export Predictions")
export_df = y_test.copy().reset_index(drop=True)
for i, col in enumerate(st.session_state.y_cols):
    export_df[f"Predicted_{col}"] = preds_test[:, i]
    # Read the actual values from the reset-index copy so rows line up
    # positionally with the prediction array.
    export_df[f"Error_{col}"] = export_df[col].values - preds_test[:, i]
csv_pred = export_df.to_csv(index=False).encode('utf-8')
st.download_button(
    label="📥 Download Full Predictions with Errors (CSV)",
    data=csv_pred,
    file_name="Predictions_with_Errors.csv",
    mime="text/csv",
)
elif selected == "What-If":
st.title("What-If Simulator & Sensitivity Analysis")
if not st.session_state.model_trained:
st.warning("Please train the model first in the 'Train Model' tab.")
else:
df = st.session_state.df
model = st.session_state.model
scaler_x = st.session_state.scaler_x
scaler_y = st.session_state.scaler_y
# --- STEP 1: Select Y targets to observe ---
st.markdown("### 1. Select Y Targets to Observe")
target_y_cols = st.multiselect("Select one or more Y features to see impact on",
st.session_state.y_cols, default=st.session_state.y_cols[:1])
if len(target_y_cols) == 0:
st.warning("Please select at least one Y target.")
else:
# --- STEP 2: Configure each X feature as Constant or Vary ---
st.markdown("### 2. Configure X Features (Constant / Vary)")
st.caption("For each X feature, choose whether to keep it constant at a fixed value or vary it with a step change.")
feature_config = {}
num_cols_per_row = 2
x_cols_list = st.session_state.x_cols
for row_start in range(0, len(x_cols_list), num_cols_per_row):
row_cols = st.columns(num_cols_per_row)
for j in range(num_cols_per_row):
idx = row_start + j
if idx >= len(x_cols_list):
break
feat = x_cols_list[idx]
with row_cols[j]:
with st.expander(f"**{feat}**", expanded=False):
mode = st.radio(f"Mode for {feat}", ["Constant", "Vary"],
key=f"mode_{feat}", horizontal=True)
if mode == "Constant":
if st.session_state.loaded_sim is not None and feat in st.session_state.loaded_sim.get('constants', {}):
def_val = float(st.session_state.loaded_sim['constants'][feat])
else:
def_val = float(df[feat].mean())
val = st.number_input(f"Value for {feat}", value=def_val,
format="%.4f", key=f"const_{feat}")
feature_config[feat] = {"mode": "Constant", "value": val}
else:
feat_min = float(df[feat].min())
feat_max = float(df[feat].max())
default_ss = float((feat_max - feat_min) / 20.0)
if default_ss == 0: default_ss = 1.0
ss = st.number_input(f"Step Size for {feat}", value=default_ss,
min_value=0.000001, format="%.6f", key=f"step_{feat}")
feature_config[feat] = {"mode": "Vary", "step_size": ss,
"min": feat_min, "max": feat_max}
# --- STEP 3: Run Simulation ---
if st.button("πŸš€ Run What-If Simulation"):
varying_features = {k: v for k, v in feature_config.items() if v["mode"] == "Vary"}
constant_features = {k: v for k, v in feature_config.items() if v["mode"] == "Constant"}
if len(varying_features) == 0:
st.error("Please set at least one X feature to 'Vary' mode.")
else:
# Build sweep arrays for each varying feature
sweep_arrays = {}
for feat, cfg in varying_features.items():
mn, mx, ss = cfg["min"], cfg["max"], cfg["step_size"]
if mn == mx:
mn -= 1.0
mx += 1.0
arr = np.arange(mn, mx + ss, ss)
if len(arr) > 500:
arr = arr[:500]
sweep_arrays[feat] = arr
# If single varying feature: simple 1D sweep
if len(varying_features) == 1:
vary_feat = list(varying_features.keys())[0]
sweep_vals = sweep_arrays[vary_feat]
sim_df = pd.DataFrame()
sim_df[vary_feat] = sweep_vals
for col in st.session_state.x_cols:
if col != vary_feat:
sim_df[col] = constant_features[col]["value"]
sim_df = sim_df[st.session_state.x_cols]
input_scaled = scaler_x.transform(sim_df)
input_t = torch.tensor(input_scaled, dtype=torch.float32)
model.eval()
with torch.no_grad():
_, pred_sim_scaled = model(input_t)
pred_sim = scaler_y.inverse_transform(pred_sim_scaled.numpy())
# Build results
results_df = pd.DataFrame({vary_feat: sweep_vals})
for ty in target_y_cols:
y_idx = st.session_state.y_cols.index(ty)
preds = pred_sim[:, y_idx]
results_df[f"Predicted {ty}"] = preds
# Trend
trends = ["-"]
for i in range(1, len(preds)):
diff = preds[i] - preds[i-1]
if diff > 1e-5: trends.append("Increasing πŸ“ˆ")
elif diff < -1e-5: trends.append("Decreasing πŸ“‰")
else: trends.append("Constant βž–")
results_df[f"Trend {ty}"] = trends
st.markdown(f"### Simulation Results")
# Plotly chart for each Y
for ty in target_y_cols:
fig = px.line(results_df, x=vary_feat, y=f"Predicted {ty}",
title=f"{vary_feat} β†’ {ty}")
fig.update_layout(yaxis=dict(autorange=True))
st.plotly_chart(fig, width='stretch')
st.dataframe(results_df, width='stretch')
else:
# Multiple varying features: sweep each one independently while others stay at constant/mean
st.markdown("### Simulation Results (Per-Feature Sweep)")
all_results = []
for vary_feat, arr in sweep_arrays.items():
sim_df = pd.DataFrame()
sim_df[vary_feat] = arr
for col in st.session_state.x_cols:
if col != vary_feat:
if col in constant_features:
sim_df[col] = constant_features[col]["value"]
elif col in varying_features:
# Other varying features held at their mean during this sweep
sim_df[col] = float(df[col].mean())
sim_df = sim_df[st.session_state.x_cols]
input_scaled = scaler_x.transform(sim_df)
input_t = torch.tensor(input_scaled, dtype=torch.float32)
model.eval()
with torch.no_grad():
_, pred_sim_scaled = model(input_t)
pred_sim = scaler_y.inverse_transform(pred_sim_scaled.numpy())
for ty in target_y_cols:
y_idx = st.session_state.y_cols.index(ty)
preds = pred_sim[:, y_idx]
trends = ["-"]
for i in range(1, len(preds)):
diff = preds[i] - preds[i-1]
if diff > 1e-5: trends.append("Increasing πŸ“ˆ")
elif diff < -1e-5: trends.append("Decreasing πŸ“‰")
else: trends.append("Constant βž–")
res_df = pd.DataFrame({
vary_feat: arr,
f"Predicted {ty}": preds,
"Trend": trends
})
all_results.append({"x": vary_feat, "y": ty, "df": res_df})
fig = px.line(res_df, x=vary_feat, y=f"Predicted {ty}",
title=f"{vary_feat} β†’ {ty}")
fig.update_layout(yaxis=dict(autorange=True))
st.plotly_chart(fig, width='stretch')
# Combined download
combined = pd.DataFrame()
for r in all_results:
temp = r["df"].copy()
temp["Varied X"] = r["x"]
temp["Target Y"] = r["y"]
combined = pd.concat([combined, temp], ignore_index=True)
st.dataframe(combined, use_container_width=True)
# Download button
if len(varying_features) == 1:
csv_data = results_df.to_csv(index=False).encode('utf-8')
else:
csv_data = combined.to_csv(index=False).encode('utf-8')
st.download_button(
label="πŸ“₯ Download Simulation Results (CSV)",
data=csv_data,
file_name="WhatIf_Simulation_Results.csv",
mime="text/csv",
)
# Save to sim history
const_dict = {k: v["value"] for k, v in constant_features.items()}
vary_dict = {k: v["step_size"] for k, v in varying_features.items()}
st.session_state.sim_history.append({
"Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"Varying Features": ", ".join(varying_features.keys()),
"Target KPIs": ", ".join(target_y_cols),
"Step Sizes": str(vary_dict),
"constants": const_dict
})
st.success("βœ… Simulation Run Saved to Action History!")
st.markdown("---")
st.markdown("### πŸ•’ Simulation Action History")
if len(st.session_state.sim_history) == 0:
st.info("No actions performed yet. Run a simulation to save it to history.")
else:
history_df = pd.DataFrame(st.session_state.sim_history).drop(columns=['constants'])
st.dataframe(history_df, use_container_width=True)
st.markdown("**Load a Past Action Scenario:**")
selected_timestamp = st.selectbox("Select Action by Timestamp", [h['Timestamp'] for h in reversed(st.session_state.sim_history)])
if st.button("Load Selected Scenario"):
scenario = next(h for h in st.session_state.sim_history if h['Timestamp'] == selected_timestamp)
st.session_state.loaded_sim = scenario
st.success(f"Scenario from {selected_timestamp} loaded! The constant feature inputs have been updated.")
st.rerun()
elif selected == "History":
st.title("Training History")
if len(st.session_state.history) == 0:
st.info("No training history available. Train a model first.")
else:
history_df = pd.DataFrame(st.session_state.history).drop(columns=['Model'])
st.dataframe(history_df)
load_run = st.selectbox("Select a Run ID to load as active model", history_df['Run ID'].tolist())
if st.button("Load Model"):
run_data = next(item for item in st.session_state.history if item["Run ID"] == load_run)
st.session_state.model = run_data["Model"]
st.session_state.model_trained = True
st.success(f"Model from Run {load_run} loaded successfully!")
elif selected == "Comparison":
st.title("Model Comparison")
if len(st.session_state.history) < 2:
st.info("Need at least 2 training runs to compare. Go to 'Train Model' and try different hyperparameters.")
else:
history_df = pd.DataFrame(st.session_state.history)
st.subheader("Average Test Metric Comparison")
fig, ax = plt.subplots(figsize=(8, 4))
metric_col = 'Avg Test RMSE' if 'Avg Test RMSE' in history_df.columns else 'Avg Test MSE'
ax.bar(history_df['Run ID'].astype(str), history_df[metric_col], color='skyblue')
ax.set_xlabel('Run ID')
ax.set_ylabel(metric_col)
st.pyplot(fig)