import streamlit as st from streamlit_option_menu import option_menu import pandas as pd import numpy as np import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader, TensorDataset from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score import matplotlib.pyplot as plt import plotly.express as px import io import os import sqlite3 import datetime # ---- SQLite Database Helper Functions ---- DB_PATH = "dashboard.db" def init_db(): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute('''CREATE TABLE IF NOT EXISTS datasets ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT UNIQUE, upload_time TEXT, num_rows INTEGER, num_cols INTEGER, data BLOB )''') conn.commit() conn.close() def save_dataset_to_db(name, df): conn = sqlite3.connect(DB_PATH) c = conn.cursor() blob = df.to_parquet() now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") c.execute('''INSERT OR REPLACE INTO datasets (name, upload_time, num_rows, num_cols, data) VALUES (?, ?, ?, ?, ?)''', (name, now, len(df), len(df.columns), blob)) conn.commit() conn.close() def list_datasets_from_db(): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute('SELECT name, upload_time, num_rows, num_cols FROM datasets ORDER BY upload_time DESC') rows = c.fetchall() conn.close() return rows def load_dataset_from_db(name): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute('SELECT data FROM datasets WHERE name = ?', (name,)) row = c.fetchone() conn.close() if row: return pd.read_parquet(io.BytesIO(row[0])) return None def delete_dataset_from_db(name): conn = sqlite3.connect(DB_PATH) c = conn.cursor() c.execute('DELETE FROM datasets WHERE name = ?', (name,)) conn.commit() conn.close() init_db() # ---- Model Persistence Helpers ---- import pickle MODEL_DIR = "saved_models" os.makedirs(MODEL_DIR, exist_ok=True) def save_model_to_disk(model, scaler_x, scaler_y, x_cols, y_cols, model_name): """Save model weights, scalers, and column config to disk.""" save_path = os.path.join(MODEL_DIR, model_name) os.makedirs(save_path, exist_ok=True) # Save model state dict torch.save({ 'state_dict': model.state_dict(), 'input_dim': len(x_cols), 'latent_dim': model.encoder[-1].out_features, 'output_dim': model.predictor[-1].out_features, }, os.path.join(save_path, 'model.pth')) # Save scalers with open(os.path.join(save_path, 'scaler_x.pkl'), 'wb') as f: pickle.dump(scaler_x, f) with open(os.path.join(save_path, 'scaler_y.pkl'), 'wb') as f: pickle.dump(scaler_y, f) # Save column config with open(os.path.join(save_path, 'columns.pkl'), 'wb') as f: pickle.dump({'x_cols': x_cols, 'y_cols': y_cols}, f) # Save metadata meta = { 'name': model_name, 'saved_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'input_dim': len(x_cols), 'output_dim': len(y_cols), 'x_cols': x_cols, 'y_cols': y_cols, } with open(os.path.join(save_path, 'metadata.pkl'), 'wb') as f: pickle.dump(meta, f) def list_saved_models(): """List all saved model directories with their metadata.""" models = [] if not os.path.exists(MODEL_DIR): return models for name in os.listdir(MODEL_DIR): meta_path = os.path.join(MODEL_DIR, name, 'metadata.pkl') if os.path.exists(meta_path): with open(meta_path, 'rb') as f: meta = pickle.load(f) models.append(meta) return models def load_model_from_disk(model_name): """Load model, scalers, and column config from disk.""" load_path = os.path.join(MODEL_DIR, model_name) checkpoint = torch.load(os.path.join(load_path, 'model.pth'), weights_only=False) model = IndustrialDAE( input_dim=checkpoint['input_dim'], latent_dim=checkpoint['latent_dim'], output_dim=checkpoint['output_dim'] ) model.load_state_dict(checkpoint['state_dict']) model.eval() with open(os.path.join(load_path, 'scaler_x.pkl'), 'rb') as f: scaler_x = pickle.load(f) with open(os.path.join(load_path, 'scaler_y.pkl'), 'rb') as f: scaler_y = pickle.load(f) with open(os.path.join(load_path, 'columns.pkl'), 'rb') as f: cols = pickle.load(f) return model, scaler_x, scaler_y, cols['x_cols'], cols['y_cols'] st.set_page_config(page_title="Multi X-Y | Industrial DAE", layout="wide", initial_sidebar_state="expanded") # ---- Premium CSS Styling ---- st.markdown(""" """, unsafe_allow_html=True) # Session State Initialization if 'df' not in st.session_state: st.session_state.df = None if 'data_history' not in st.session_state: st.session_state.data_history = {} if 'x_cols' not in st.session_state: st.session_state.x_cols = [] if 'y_cols' not in st.session_state: st.session_state.y_cols = [] if 'X_train' not in st.session_state: st.session_state.X_train = None if 'X_test' not in st.session_state: st.session_state.X_test = None if 'y_train' not in st.session_state: st.session_state.y_train = None if 'y_test' not in st.session_state: st.session_state.y_test = None if 'scaler_x' not in st.session_state: st.session_state.scaler_x = None if 'scaler_y' not in st.session_state: st.session_state.scaler_y = None if 'model_trained' not in st.session_state: st.session_state.model_trained = False if 'history' not in st.session_state: st.session_state.history = [] if 'sim_history' not in st.session_state: st.session_state.sim_history = [] if 'loaded_sim' not in st.session_state: st.session_state.loaded_sim = None # Helper Classes for PyTorch class IndustrialDAE(nn.Module): def __init__(self, input_dim=41, latent_dim=15, output_dim=5, dropout_rate=0.2): super(IndustrialDAE, self).__init__() # --- ENCODER: Learns the "Hidden Physics" --- self.encoder = nn.Sequential( nn.Linear(input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout_rate), nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, latent_dim) # Compressed State ) # --- DECODER: Reconstructs/Heals all features --- self.decoder = nn.Sequential( nn.Linear(latent_dim, 64), nn.ReLU(), nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, input_dim) ) # --- PREDICTOR: Specifically targets the KPIs --- self.predictor = nn.Sequential( nn.Linear(latent_dim, 32), nn.ReLU(), nn.Linear(32, 16), nn.ReLU(), nn.Linear(16, output_dim) ) def forward(self, x): z = self.encoder(x) reconstructed_x = self.decoder(z) predicted_y = self.predictor(z) return reconstructed_x, predicted_y # Sidebar Navigation with st.sidebar: st.markdown("

Multi X-Y

", unsafe_allow_html=True) st.markdown("

ML Dashboard

", unsafe_allow_html=True) st.markdown("---") selected = option_menu( menu_title=None, options=["Overview", "Upload Data", "Preprocess", "Train Model", "Predict", "What-If", "History", "Comparison"], icons=["graph-up", "upload", "gear", "diagram-3", "graph-up-arrow", "magic", "clock-history", "bar-chart"], menu_icon="cast", default_index=0, styles={ "container": {"padding": "0!important", "background-color": "transparent"}, "icon": {"color": "white", "font-size": "18px"}, "nav-link": {"font-size": "16px", "text-align": "left", "margin":"0px", "--hover-color": "#2d3748"}, "nav-link-selected": {"background-color": "#2b6cb0"}, } ) if selected == "Overview": st.title("🏭 Industrial DAE β€” Multi X-Y Dashboard") st.caption("End-to-end Denoising Autoencoder for Sensor Reconstruction & KPI Prediction") # --- System Status Cards --- st.markdown("### πŸ”„ System Status") s1, s2, s3, s4 = st.columns(4) with s1: if st.session_state.df is not None: st.success(f"βœ… Data Loaded\n\n**{st.session_state.df.shape[0]}** rows Γ— **{st.session_state.df.shape[1]}** cols") else: st.warning("⚠️ No Data\n\nUpload data to begin") with s2: if len(st.session_state.x_cols) > 0: st.success(f"βœ… Preprocessed\n\n**{len(st.session_state.x_cols)}** X | **{len(st.session_state.y_cols)}** Y") else: st.warning("⚠️ Not Preprocessed") with s3: if st.session_state.model_trained: st.success("βœ… Model Trained\n\nReady for Prediction") else: st.warning("⚠️ No Model\n\nTrain a model first") with s4: saved_models_list = list_saved_models() st.info(f"πŸ’Ύ Saved Models\n\n**{len(saved_models_list)}** model(s) on disk") st.markdown("---") # --- Current Model Performance --- if st.session_state.model_trained and st.session_state.X_test is not None: st.markdown("### πŸ“Š Current Model Performance") model = st.session_state.model scaler_y = st.session_state.scaler_y X_test_t = torch.tensor(st.session_state.X_test, dtype=torch.float32) y_test = st.session_state.y_test_raw model.eval() with torch.no_grad(): _, preds_test_scaled = model(X_test_t) preds_test = scaler_y.inverse_transform(preds_test_scaled.numpy()) # Per-feature KPI cards kpi_cols = st.columns(len(st.session_state.y_cols)) r2_vals = [] for i, col in enumerate(st.session_state.y_cols): r2_val = r2_score(y_test[col], preds_test[:, i]) mae_val = mean_absolute_error(y_test[col], preds_test[:, i]) r2_vals.append(r2_val) with kpi_cols[i]: if r2_val >= 0.90: emoji = "🟒" elif r2_val >= 0.75: emoji = "🟑" else: emoji = "πŸ”΄" st.metric(label=f"{emoji} {col}", value=f"RΒ² = {r2_val:.4f}", delta=f"MAE = {mae_val:.4f}") avg_r2 = np.mean(r2_vals) if avg_r2 >= 0.90: grade = "Excellent 🟒" elif avg_r2 >= 0.75: grade = "Good 🟑" else: grade = "Needs Improvement πŸ”΄" st.markdown(f"**Overall Average RΒ²:** `{avg_r2:.4f}` β€” **{grade}**") st.markdown("---") # Feature lists col_info1, col_info2 = st.columns(2) with col_info1: st.markdown("**Input Features (X)**") for c in st.session_state.x_cols: st.markdown(f"- `{c}`") with col_info2: st.markdown("**Target Features (Y)**") for c in st.session_state.y_cols: st.markdown(f"- `{c}`") else: st.info("Upload data, preprocess, and train a model to see performance metrics here.") st.markdown("---") # --- Database & Saved Models Summary --- db_col1, db_col2 = st.columns(2) with db_col1: st.markdown("### πŸ“¦ Datasets in Database") db_ds = list_datasets_from_db() if len(db_ds) > 0: inv_df = pd.DataFrame(db_ds, columns=['Name', 'Uploaded', 'Rows', 'Cols']) st.dataframe(inv_df, width='stretch') else: st.caption("No datasets stored yet.") with db_col2: st.markdown("### πŸ’Ύ Saved Models") if len(saved_models_list) > 0: model_df = pd.DataFrame(saved_models_list)[['name', 'saved_at', 'input_dim', 'output_dim']] model_df.columns = ['Name', 'Saved At', 'X Features', 'Y Targets'] st.dataframe(model_df, width='stretch') else: st.caption("No models saved yet.") st.markdown("---") # --- Workflow Guide --- st.markdown("### πŸ—ΊοΈ Workflow Guide") st.markdown(""" | Step | Tab | Action | |------|-----|--------| | 1 | **Upload Data** | Upload Excel dataset or load from database | | 2 | **Preprocess** | Select X/Y features, impute missing data, handle outliers | | 3 | **Train Model** | Configure hyperparameters, train DAE, or load a saved model | | 4 | **Predict** | Evaluate on test data β€” metrics, scatter plots, residual analysis | | 5 | **What-If** | Sensitivity analysis with step changes & trend detection | | 6 | **History** | Review all training runs | | 7 | **Comparison** | Compare metrics across different model runs | """) elif selected == "Upload Data": st.title("Upload Data") col1, col2 = st.columns(2) with col1: st.subheader("Upload New File") uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx", "xls"]) if uploaded_file is not None: @st.cache_data def load_data_from_bytes(file_bytes): return pd.read_excel(file_bytes) df = load_data_from_bytes(uploaded_file) save_dataset_to_db(uploaded_file.name, df) st.session_state.df = df st.session_state.data_history[uploaded_file.name] = df st.success(f"βœ… Data saved to database as **{uploaded_file.name}**!") with col2: st.subheader("Load from Database") db_datasets = list_datasets_from_db() if len(db_datasets) > 0: dataset_names = [r[0] for r in db_datasets] history_file = st.selectbox("Select previously uploaded data", dataset_names) if st.button("Load Selected Data"): loaded_df = load_dataset_from_db(history_file) if loaded_df is not None: st.session_state.df = loaded_df st.session_state.data_history[history_file] = loaded_df st.success(f"Data switched to **{history_file}** successfully!") else: st.error("Failed to load dataset from database.") else: st.info("No datasets in database yet. Upload a file to get started.") st.markdown("---") # Show database inventory db_datasets = list_datasets_from_db() if len(db_datasets) > 0: st.subheader("πŸ“¦ Database Inventory") inv_df = pd.DataFrame(db_datasets, columns=['Dataset Name', 'Uploaded On', 'Rows', 'Columns']) st.dataframe(inv_df, width='stretch') del_name = st.selectbox("Select dataset to delete", [r[0] for r in db_datasets], key="del_ds") if st.button("πŸ—‘οΈ Delete Selected Dataset"): delete_dataset_from_db(del_name) if del_name in st.session_state.data_history: del st.session_state.data_history[del_name] st.success(f"Deleted **{del_name}** from database.") st.rerun() st.markdown("---") if st.session_state.df is not None: st.subheader("Current Data Overview") st.dataframe(st.session_state.df.head()) st.write(f"**Shape:** {st.session_state.df.shape}") elif selected == "Preprocess": st.title("Preprocess Data") db_datasets = list_datasets_from_db() if len(db_datasets) > 0: col1, col2 = st.columns([3, 1]) with col1: history_file_prep = st.selectbox("Select Active Dataset", [r[0] for r in db_datasets], key="prep_dataset") with col2: st.write("") st.write("") if st.button("Load Dataset", key="load_prep"): loaded_df = load_dataset_from_db(history_file_prep) if loaded_df is not None: st.session_state.df = loaded_df st.session_state.data_history[history_file_prep] = loaded_df st.success(f"Dataset switched to {history_file_prep}") st.rerun() st.markdown("---") if st.session_state.df is None: st.warning("Please upload data first in the 'Upload Data' tab.") else: df = st.session_state.df # Force conversion of object columns to numeric (coercing messy strings to NaN) # This fixes issues where sensor data is accidentally parsed as text for col in df.columns: if df[col].dtype == 'object': df[col] = pd.to_numeric(df[col], errors='coerce') numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() st.subheader("Variable Selection") col_x, col_y = st.columns(2) with col_x: st.markdown("**Select Input Features (X)**") # Select All / Deselect All for X select_all_x = st.checkbox("Select All X", value=len(st.session_state.x_cols) == len(numeric_cols), key="sel_all_x") x_cols = [] for col in numeric_cols: default_checked = (col in st.session_state.x_cols) if not select_all_x else True if st.checkbox(col, value=default_checked, key=f"x_{col}"): x_cols.append(col) with col_y: st.markdown("**Select Target Variables (Y)**") y_options = [c for c in numeric_cols if c not in x_cols] select_all_y = st.checkbox("Select All Y", value=len(st.session_state.y_cols) == len(y_options) and len(y_options) > 0, key="sel_all_y") y_cols = [] for col in y_options: default_checked = (col in st.session_state.y_cols) if not select_all_y else True if st.checkbox(col, value=default_checked, key=f"y_{col}"): y_cols.append(col) st.subheader("Missing Data Imputation & Outliers") col_f1, col_f2 = st.columns(2) with col_f1: imputation_method = st.selectbox("Missing Value Imputation Method", ["Mean", "Median", "Zero"]) with col_f2: outlier_method = st.radio("Select Outlier Treatment Method", ["None", "IQR Capping", "Min-Max Percentile Capping (1% - 99%)"]) # --- Custom Min-Max Filter --- st.subheader("πŸ”§ Custom Min-Max Filter (Per Tag)") st.caption("Select specific features and set custom min/max bounds. Data outside these limits will be clipped.") all_selected = x_cols + y_cols custom_filter_tags = st.multiselect("Select Tags to Apply Custom Min-Max Filter", all_selected, default=[], key="custom_filter_tags") custom_filters = {} if len(custom_filter_tags) > 0: filter_cols = st.columns(3) for idx, tag in enumerate(custom_filter_tags): tag_min = float(df[tag].min()) tag_max = float(df[tag].max()) with filter_cols[idx % 3]: st.markdown(f"**{tag}**") st.caption(f"Data Range: {tag_min:.4f} β€” {tag_max:.4f}") c1, c2 = st.columns(2) with c1: user_min = st.number_input(f"Min", value=tag_min, format="%.4f", key=f"fmin_{tag}") with c2: user_max = st.number_input(f"Max", value=tag_max, format="%.4f", key=f"fmax_{tag}") custom_filters[tag] = {"min": user_min, "max": user_max} if st.button("Apply Preprocessing"): if len(x_cols) == 0 or len(y_cols) == 0: st.error("Please select at least one X and one Y variable.") else: st.session_state.x_cols = x_cols st.session_state.y_cols = y_cols data_x = df[x_cols].copy() data_y = df[y_cols].copy() # Show Feature Statistics st.markdown("### Feature-wise Statistics (Before Imputation)") stats_df = pd.DataFrame({ 'Missing Count': data_x.isnull().sum(), 'Missing %': (data_x.isnull().sum() / len(data_x) * 100).round(2), 'Min': data_x.min(), 'Mean': data_x.mean(), 'Max': data_x.max() }) st.dataframe(stats_df) # Impute NaNs if imputation_method == "Mean": data_x = data_x.fillna(data_x.mean()) data_y = data_y.fillna(data_y.mean()) elif imputation_method == "Median": data_x = data_x.fillna(data_x.median()) data_y = data_y.fillna(data_y.median()) elif imputation_method == "Zero": data_x = data_x.fillna(0) data_y = data_y.fillna(0) # 4. Outlier Handling if outlier_method == "IQR Capping": for col in data_x.columns: Q1 = data_x[col].quantile(0.25) Q3 = data_x[col].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR data_x[col] = np.clip(data_x[col], lower_bound, upper_bound) for col in data_y.columns: Q1 = data_y[col].quantile(0.25) Q3 = data_y[col].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR data_y[col] = np.clip(data_y[col], lower_bound, upper_bound) elif outlier_method == "Min-Max Percentile Capping (1% - 99%)": for col in data_x.columns: lower_bound = data_x[col].quantile(0.01) upper_bound = data_x[col].quantile(0.99) data_x[col] = np.clip(data_x[col], lower_bound, upper_bound) for col in data_y.columns: lower_bound = data_y[col].quantile(0.01) upper_bound = data_y[col].quantile(0.99) data_y[col] = np.clip(data_y[col], lower_bound, upper_bound) # 5. Apply Custom Min-Max Filters for tag, bounds in custom_filters.items(): if tag in data_x.columns: data_x[tag] = np.clip(data_x[tag], bounds['min'], bounds['max']) if tag in data_y.columns: data_y[tag] = np.clip(data_y[tag], bounds['min'], bounds['max']) st.markdown("### Feature-wise Statistics (After Preprocessing)") stats_after_df = pd.DataFrame({ 'Missing Count': data_x.isnull().sum(), 'Missing %': (data_x.isnull().sum() / len(data_x) * 100).round(2), 'Min': data_x.min(), 'Mean': data_x.mean(), 'Max': data_x.max() }) st.dataframe(stats_after_df) X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=42) scaler_x = StandardScaler() scaler_y = StandardScaler() st.session_state.X_train = scaler_x.fit_transform(X_train) st.session_state.X_test = scaler_x.transform(X_test) st.session_state.y_train = scaler_y.fit_transform(y_train) st.session_state.y_test = scaler_y.transform(y_test) st.session_state.scaler_x = scaler_x st.session_state.scaler_y = scaler_y st.session_state.y_test_raw = y_test st.success(f"Preprocessing complete! Applied {outlier_method}. Train/Test split created and features scaled.") elif selected == "Train Model": st.title("Train Model (Industrial DAE)") # --- Load a previously saved model --- saved_models = list_saved_models() if len(saved_models) > 0: with st.expander("πŸ“‚ Load a Previously Saved Model", expanded=False): model_meta_df = pd.DataFrame(saved_models)[['name', 'saved_at', 'input_dim', 'output_dim']] model_meta_df.columns = ['Model Name', 'Saved At', 'Input Features', 'Output Targets'] st.dataframe(model_meta_df, width='stretch') sel_model_name = st.selectbox("Select Model to Load", [m['name'] for m in saved_models]) if st.button("Load Selected Model"): loaded_model, loaded_sx, loaded_sy, loaded_x, loaded_y = load_model_from_disk(sel_model_name) st.session_state.model = loaded_model st.session_state.scaler_x = loaded_sx st.session_state.scaler_y = loaded_sy st.session_state.x_cols = loaded_x st.session_state.y_cols = loaded_y st.session_state.model_trained = True st.success(f"βœ… Model **{sel_model_name}** loaded! You can now use Predict and What-If tabs.") st.rerun() st.markdown("---") if st.session_state.X_train is None: st.warning("Please preprocess data first in the 'Preprocess' tab.") else: st.subheader("Hyperparameters") col1, col2 = st.columns(2) with col1: masking_ratio = st.slider("Masking Ratio (Corruption)", 0.0, 0.5, 0.10) epochs = st.number_input("Epochs", 10, 1000, 150) lr = st.number_input("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f") auto_train = st.checkbox("Auto-Train (Until R2 > 0.85 & MAE lower)", value=False) with col2: latent_dim = st.slider("Latent Dimension", 2, max(2, len(st.session_state.x_cols)), 15) dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2) weight_to_pred = st.number_input("Weight to Predictor Loss", 0.1, 10.0, 5.0) batch_size = st.selectbox("Batch Size", [16, 32, 64, 128, 256], index=3) if st.button("Train"): X_train_t = torch.tensor(st.session_state.X_train, dtype=torch.float32) y_train_t = torch.tensor(st.session_state.y_train, dtype=torch.float32) X_test_t = torch.tensor(st.session_state.X_test, dtype=torch.float32) y_test_t = torch.tensor(st.session_state.y_test, dtype=torch.float32) train_dataset = TensorDataset(X_train_t, y_train_t) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) input_dim = X_train_t.shape[1] output_dim = y_train_t.shape[1] model = IndustrialDAE(input_dim=input_dim, latent_dim=latent_dim, output_dim=output_dim, dropout_rate=dropout_rate) optimizer = optim.Adam(model.parameters(), lr=lr) criterion_recon = nn.MSELoss() criterion_pred = nn.HuberLoss() progress_bar = st.progress(0) status_text = st.empty() epoch_recon_losses = [] epoch_pred_losses = [] val_recon_losses = [] val_pred_losses = [] y_test_raw = st.session_state.y_test_raw scaler_y = st.session_state.scaler_y max_train_epochs = 2000 if auto_train else epochs best_r2 = -float('inf') best_mae = float('inf') for epoch in range(max_train_epochs): model.train() batch_recon_loss = 0 batch_pred_loss = 0 for batch_x, batch_y in train_loader: clean_x = batch_x # Apply Masking random_probabilities = torch.rand(clean_x.shape) mask = random_probabilities < masking_ratio noised_x = clean_x.clone() noised_x[mask] = 0.0 recon_x, pred_y = model(noised_x) loss_recon = criterion_recon(recon_x, clean_x) loss_pred = criterion_pred(pred_y, batch_y) total_loss = loss_recon + (weight_to_pred * loss_pred) optimizer.zero_grad() total_loss.backward() optimizer.step() batch_recon_loss += loss_recon.item() batch_pred_loss += loss_pred.item() epoch_recon_losses.append(batch_recon_loss / len(train_loader)) epoch_pred_losses.append(batch_pred_loss / len(train_loader)) # Validation Pass model.eval() with torch.no_grad(): # Evaluate on clean test set without masking (standard practice for evaluation) val_recon, val_pred = model(X_test_t) v_loss_recon = criterion_recon(val_recon, X_test_t) v_loss_pred = criterion_pred(val_pred, y_test_t) val_recon_losses.append(v_loss_recon.item()) val_pred_losses.append(v_loss_pred.item()) if auto_train: if (epoch + 1) % 10 == 0: with torch.no_grad(): preds_test_scaled = val_pred preds_test = scaler_y.inverse_transform(preds_test_scaled.numpy()) r2_vals = [r2_score(y_test_raw[col], preds_test[:, i]) for i, col in enumerate(st.session_state.y_cols)] mae_vals = [mean_absolute_error(y_test_raw[col], preds_test[:, i]) for i, col in enumerate(st.session_state.y_cols)] avg_r2 = np.mean(r2_vals) avg_mae = np.mean(mae_vals) status_text.text(f"Auto-Training... Epoch {epoch+1} | Avg R2: {avg_r2:.4f} | Avg MAE: {avg_mae:.4f}") if avg_r2 > 0.85 and avg_mae <= best_mae: status_text.text(f"Reached Target! Stopped at Epoch {epoch+1} with Avg R2 = {avg_r2:.4f}, Avg MAE = {avg_mae:.4f}") break if avg_r2 > best_r2: best_r2 = avg_r2 if avg_mae < best_mae: best_mae = avg_mae else: progress_bar.progress((epoch + 1) / epochs) if not auto_train: status_text.text("Training Complete!") st.session_state.model = model st.session_state.model_trained = True # Final Evaluation model.eval() with torch.no_grad(): _, val_pred = model(X_test_t) preds_test = scaler_y.inverse_transform(val_pred.numpy()) metrics_df = pd.DataFrame(index=st.session_state.y_cols, columns=['RMSE', 'MAE', 'R2 Score']) for i, col in enumerate(st.session_state.y_cols): mse = mean_squared_error(y_test_raw[col], preds_test[:, i]) metrics_df.loc[col, 'RMSE'] = np.sqrt(mse) metrics_df.loc[col, 'MAE'] = mean_absolute_error(y_test_raw[col], preds_test[:, i]) metrics_df.loc[col, 'R2 Score'] = r2_score(y_test_raw[col], preds_test[:, i]) avg_rmse = metrics_df['RMSE'].mean() run_id = len(st.session_state.history) + 1 st.session_state.history.append({ "Run ID": run_id, "Masking": masking_ratio, "Latent Dim": latent_dim, "Epochs": len(epoch_pred_losses), "Avg Test RMSE": avg_rmse, "Model": model }) # Auto-save model to disk model_name = f"DAE_Run{run_id}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}" save_model_to_disk(model, st.session_state.scaler_x, st.session_state.scaler_y, st.session_state.x_cols, st.session_state.y_cols, model_name) st.success(f"βœ… Model trained, saved as **{model_name}**, and added to History! (Epochs: {len(epoch_pred_losses)})") st.subheader("Training Post-Evaluation Metrics") st.dataframe(metrics_df) col1, col2 = st.columns(2) with col1: st.subheader("DAE Reconstruction Loss (MSE)") fig, ax = plt.subplots() ax.plot(epoch_recon_losses, color='blue', label='Train Loss') ax.plot(val_recon_losses, color='cyan', label='Validation Loss') ax.legend() st.pyplot(fig) with col2: st.subheader("Predictor Loss (Huber)") fig, ax = plt.subplots() ax.plot(epoch_pred_losses, color='orange', label='Train Loss') ax.plot(val_pred_losses, color='red', label='Validation Loss') ax.legend() st.pyplot(fig) elif selected == "Predict": st.title("Predict & Evaluate") if not st.session_state.model_trained: st.warning("Please train the model first in the 'Train Model' tab.") else: model = st.session_state.model scaler_y = st.session_state.scaler_y X_test_t = torch.tensor(st.session_state.X_test, dtype=torch.float32) y_test = st.session_state.y_test_raw model.eval() with torch.no_grad(): _, val_pred = model(X_test_t) preds_test = scaler_y.inverse_transform(val_pred.numpy()) st.subheader("Test Set Metrics") metrics_df = pd.DataFrame(index=st.session_state.y_cols, columns=['RMSE', 'MAE', 'R2 Score', 'MAPE (%)']) for i, col in enumerate(st.session_state.y_cols): actual = y_test[col].values predicted = preds_test[:, i] mse = mean_squared_error(actual, predicted) metrics_df.loc[col, 'RMSE'] = np.sqrt(mse) metrics_df.loc[col, 'MAE'] = mean_absolute_error(actual, predicted) metrics_df.loc[col, 'R2 Score'] = r2_score(actual, predicted) # MAPE - handle zeros nonzero_mask = actual != 0 if nonzero_mask.sum() > 0: metrics_df.loc[col, 'MAPE (%)'] = np.mean(np.abs((actual[nonzero_mask] - predicted[nonzero_mask]) / actual[nonzero_mask])) * 100 else: metrics_df.loc[col, 'MAPE (%)'] = 0.0 st.dataframe(metrics_df, width='stretch') # --- KPI Summary Cards --- st.subheader("πŸ“Š Model Performance Summary") kpi_cols = st.columns(len(st.session_state.y_cols)) for i, col in enumerate(st.session_state.y_cols): r2_val = float(metrics_df.loc[col, 'R2 Score']) mae_val = float(metrics_df.loc[col, 'MAE']) with kpi_cols[i]: if r2_val >= 0.90: emoji = "🟒" elif r2_val >= 0.75: emoji = "🟑" else: emoji = "πŸ”΄" st.metric(label=f"{emoji} {col}", value=f"RΒ² = {r2_val:.4f}", delta=f"MAE = {mae_val:.4f}") # --- Actual vs Predicted Line Charts --- st.subheader("πŸ“ˆ Actual vs Predicted (All Y Features)") pts = min(100, len(y_test)) for i, col in enumerate(st.session_state.y_cols): r2_val = float(metrics_df.loc[col, 'R2 Score']) chart_df = pd.DataFrame({ 'Sample Index': range(pts), 'Actual': y_test[col].values[:pts], 'Predicted': preds_test[:pts, i] }) chart_df_melted = chart_df.melt(id_vars=['Sample Index'], value_vars=['Actual', 'Predicted'], var_name='Type', value_name='Value') fig = px.line(chart_df_melted, x='Sample Index', y='Value', color='Type', title=f"{col} | RΒ² = {r2_val:.4f}") fig.update_layout(yaxis=dict(autorange=True)) st.plotly_chart(fig, width='stretch') # --- Scatter Plot: Actual vs Predicted with 45Β° line --- st.subheader("🎯 Scatter Plot: Actual vs Predicted") scatter_cols = st.columns(min(len(st.session_state.y_cols), 3)) for i, col in enumerate(st.session_state.y_cols): actual = y_test[col].values predicted = preds_test[:, i] r2_val = float(metrics_df.loc[col, 'R2 Score']) with scatter_cols[i % 3]: fig = px.scatter(x=actual, y=predicted, labels={'x': 'Actual', 'y': 'Predicted'}, title=f"{col} | RΒ² = {r2_val:.4f}", opacity=0.5) # Add 45-degree ideal line min_val = min(actual.min(), predicted.min()) max_val = max(actual.max(), predicted.max()) fig.add_shape(type="line", x0=min_val, y0=min_val, x1=max_val, y1=max_val, line=dict(color="red", dash="dash", width=2)) fig.update_layout(yaxis=dict(autorange=True), height=400) st.plotly_chart(fig, width='stretch') # --- Residual Analysis --- st.subheader("πŸ“‰ Residual Analysis (Error Distribution)") residual_cols = st.columns(min(len(st.session_state.y_cols), 3)) for i, col in enumerate(st.session_state.y_cols): actual = y_test[col].values predicted = preds_test[:, i] residuals = actual - predicted with residual_cols[i % 3]: fig = px.histogram(residuals, nbins=30, title=f"Residuals: {col}", labels={'value': 'Error (Actual - Predicted)', 'count': 'Frequency'}) fig.update_layout(showlegend=False, height=350) st.plotly_chart(fig, width='stretch') # --- Download Predictions --- st.subheader("πŸ“₯ Export Predictions") export_df = y_test.copy().reset_index(drop=True) for i, col in enumerate(st.session_state.y_cols): export_df[f"Predicted_{col}"] = preds_test[:, i] export_df[f"Error_{col}"] = y_test[col].values - preds_test[:, i] csv_pred = export_df.to_csv(index=False).encode('utf-8') st.download_button( label="πŸ“₯ Download Full Predictions with Errors (CSV)", data=csv_pred, file_name="Predictions_with_Errors.csv", mime="text/csv", ) elif selected == "What-If": st.title("What-If Simulator & Sensitivity Analysis") if not st.session_state.model_trained: st.warning("Please train the model first in the 'Train Model' tab.") else: df = st.session_state.df model = st.session_state.model scaler_x = st.session_state.scaler_x scaler_y = st.session_state.scaler_y # --- STEP 1: Select Y targets to observe --- st.markdown("### 1. Select Y Targets to Observe") target_y_cols = st.multiselect("Select one or more Y features to see impact on", st.session_state.y_cols, default=st.session_state.y_cols[:1]) if len(target_y_cols) == 0: st.warning("Please select at least one Y target.") else: # --- STEP 2: Configure each X feature as Constant or Vary --- st.markdown("### 2. Configure X Features (Constant / Vary)") st.caption("For each X feature, choose whether to keep it constant at a fixed value or vary it with a step change.") feature_config = {} num_cols_per_row = 2 x_cols_list = st.session_state.x_cols for row_start in range(0, len(x_cols_list), num_cols_per_row): row_cols = st.columns(num_cols_per_row) for j in range(num_cols_per_row): idx = row_start + j if idx >= len(x_cols_list): break feat = x_cols_list[idx] with row_cols[j]: with st.expander(f"**{feat}**", expanded=False): mode = st.radio(f"Mode for {feat}", ["Constant", "Vary"], key=f"mode_{feat}", horizontal=True) if mode == "Constant": if st.session_state.loaded_sim is not None and feat in st.session_state.loaded_sim.get('constants', {}): def_val = float(st.session_state.loaded_sim['constants'][feat]) else: def_val = float(df[feat].mean()) val = st.number_input(f"Value for {feat}", value=def_val, format="%.4f", key=f"const_{feat}") feature_config[feat] = {"mode": "Constant", "value": val} else: feat_min = float(df[feat].min()) feat_max = float(df[feat].max()) default_ss = float((feat_max - feat_min) / 20.0) if default_ss == 0: default_ss = 1.0 ss = st.number_input(f"Step Size for {feat}", value=default_ss, min_value=0.000001, format="%.6f", key=f"step_{feat}") feature_config[feat] = {"mode": "Vary", "step_size": ss, "min": feat_min, "max": feat_max} # --- STEP 3: Run Simulation --- if st.button("πŸš€ Run What-If Simulation"): varying_features = {k: v for k, v in feature_config.items() if v["mode"] == "Vary"} constant_features = {k: v for k, v in feature_config.items() if v["mode"] == "Constant"} if len(varying_features) == 0: st.error("Please set at least one X feature to 'Vary' mode.") else: # Build sweep arrays for each varying feature sweep_arrays = {} for feat, cfg in varying_features.items(): mn, mx, ss = cfg["min"], cfg["max"], cfg["step_size"] if mn == mx: mn -= 1.0 mx += 1.0 arr = np.arange(mn, mx + ss, ss) if len(arr) > 500: arr = arr[:500] sweep_arrays[feat] = arr # If single varying feature: simple 1D sweep if len(varying_features) == 1: vary_feat = list(varying_features.keys())[0] sweep_vals = sweep_arrays[vary_feat] sim_df = pd.DataFrame() sim_df[vary_feat] = sweep_vals for col in st.session_state.x_cols: if col != vary_feat: sim_df[col] = constant_features[col]["value"] sim_df = sim_df[st.session_state.x_cols] input_scaled = scaler_x.transform(sim_df) input_t = torch.tensor(input_scaled, dtype=torch.float32) model.eval() with torch.no_grad(): _, pred_sim_scaled = model(input_t) pred_sim = scaler_y.inverse_transform(pred_sim_scaled.numpy()) # Build results results_df = pd.DataFrame({vary_feat: sweep_vals}) for ty in target_y_cols: y_idx = st.session_state.y_cols.index(ty) preds = pred_sim[:, y_idx] results_df[f"Predicted {ty}"] = preds # Trend trends = ["-"] for i in range(1, len(preds)): diff = preds[i] - preds[i-1] if diff > 1e-5: trends.append("Increasing πŸ“ˆ") elif diff < -1e-5: trends.append("Decreasing πŸ“‰") else: trends.append("Constant βž–") results_df[f"Trend {ty}"] = trends st.markdown(f"### Simulation Results") # Plotly chart for each Y for ty in target_y_cols: fig = px.line(results_df, x=vary_feat, y=f"Predicted {ty}", title=f"{vary_feat} β†’ {ty}") fig.update_layout(yaxis=dict(autorange=True)) st.plotly_chart(fig, width='stretch') st.dataframe(results_df, width='stretch') else: # Multiple varying features: sweep each one independently while others stay at constant/mean st.markdown("### Simulation Results (Per-Feature Sweep)") all_results = [] for vary_feat, arr in sweep_arrays.items(): sim_df = pd.DataFrame() sim_df[vary_feat] = arr for col in st.session_state.x_cols: if col != vary_feat: if col in constant_features: sim_df[col] = constant_features[col]["value"] elif col in varying_features: # Other varying features held at their mean during this sweep sim_df[col] = float(df[col].mean()) sim_df = sim_df[st.session_state.x_cols] input_scaled = scaler_x.transform(sim_df) input_t = torch.tensor(input_scaled, dtype=torch.float32) model.eval() with torch.no_grad(): _, pred_sim_scaled = model(input_t) pred_sim = scaler_y.inverse_transform(pred_sim_scaled.numpy()) for ty in target_y_cols: y_idx = st.session_state.y_cols.index(ty) preds = pred_sim[:, y_idx] trends = ["-"] for i in range(1, len(preds)): diff = preds[i] - preds[i-1] if diff > 1e-5: trends.append("Increasing πŸ“ˆ") elif diff < -1e-5: trends.append("Decreasing πŸ“‰") else: trends.append("Constant βž–") res_df = pd.DataFrame({ vary_feat: arr, f"Predicted {ty}": preds, "Trend": trends }) all_results.append({"x": vary_feat, "y": ty, "df": res_df}) fig = px.line(res_df, x=vary_feat, y=f"Predicted {ty}", title=f"{vary_feat} β†’ {ty}") fig.update_layout(yaxis=dict(autorange=True)) st.plotly_chart(fig, width='stretch') # Combined download combined = pd.DataFrame() for r in all_results: temp = r["df"].copy() temp["Varied X"] = r["x"] temp["Target Y"] = r["y"] combined = pd.concat([combined, temp], ignore_index=True) st.dataframe(combined, use_container_width=True) # Download button if len(varying_features) == 1: csv_data = results_df.to_csv(index=False).encode('utf-8') else: csv_data = combined.to_csv(index=False).encode('utf-8') st.download_button( label="πŸ“₯ Download Simulation Results (CSV)", data=csv_data, file_name="WhatIf_Simulation_Results.csv", mime="text/csv", ) # Save to sim history const_dict = {k: v["value"] for k, v in constant_features.items()} vary_dict = {k: v["step_size"] for k, v in varying_features.items()} st.session_state.sim_history.append({ "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Varying Features": ", ".join(varying_features.keys()), "Target KPIs": ", ".join(target_y_cols), "Step Sizes": str(vary_dict), "constants": const_dict }) st.success("βœ… Simulation Run Saved to Action History!") st.markdown("---") st.markdown("### πŸ•’ Simulation Action History") if len(st.session_state.sim_history) == 0: st.info("No actions performed yet. Run a simulation to save it to history.") else: history_df = pd.DataFrame(st.session_state.sim_history).drop(columns=['constants']) st.dataframe(history_df, use_container_width=True) st.markdown("**Load a Past Action Scenario:**") selected_timestamp = st.selectbox("Select Action by Timestamp", [h['Timestamp'] for h in reversed(st.session_state.sim_history)]) if st.button("Load Selected Scenario"): scenario = next(h for h in st.session_state.sim_history if h['Timestamp'] == selected_timestamp) st.session_state.loaded_sim = scenario st.success(f"Scenario from {selected_timestamp} loaded! The constant feature inputs have been updated.") st.rerun() elif selected == "History": st.title("Training History") if len(st.session_state.history) == 0: st.info("No training history available. Train a model first.") else: history_df = pd.DataFrame(st.session_state.history).drop(columns=['Model']) st.dataframe(history_df) load_run = st.selectbox("Select a Run ID to load as active model", history_df['Run ID'].tolist()) if st.button("Load Model"): run_data = next(item for item in st.session_state.history if item["Run ID"] == load_run) st.session_state.model = run_data["Model"] st.session_state.model_trained = True st.success(f"Model from Run {load_run} loaded successfully!") elif selected == "Comparison": st.title("Model Comparison") if len(st.session_state.history) < 2: st.info("Need at least 2 training runs to compare. Go to 'Train Model' and try different hyperparameters.") else: history_df = pd.DataFrame(st.session_state.history) st.subheader("Average Test Metric Comparison") fig, ax = plt.subplots(figsize=(8, 4)) metric_col = 'Avg Test RMSE' if 'Avg Test RMSE' in history_df.columns else 'Avg Test MSE' ax.bar(history_df['Run ID'].astype(str), history_df[metric_col], color='skyblue') ax.set_xlabel('Run ID') ax.set_ylabel(metric_col) st.pyplot(fig)