# test9998 / src/streamlit_app.py
# Author: gaetanbrison — "Update src/streamlit_app.py" (commit e768e32, verified)
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_auc_score
)
warnings.filterwarnings('ignore')
# MLflow and experiment tracking (optional dependency).
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False
    # NOTE: do not call st.warning() here — st.set_page_config() below must
    # be the first Streamlit command in the script, and any earlier st.*
    # call raises StreamlitAPIException when mlflow is missing.
    # Pages that need MLflow check MLFLOW_AVAILABLE instead.
# PyCaret imports (optional dependency for the AutoML page).
try:
    from pycaret.classification import setup as cls_setup, compare_models as cls_compare, create_model as cls_create
    from pycaret.classification import tune_model as cls_tune, finalize_model as cls_finalize, predict_model as cls_predict
    from pycaret.classification import pull as cls_pull, plot_model as cls_plot, evaluate_model as cls_evaluate
    from pycaret.regression import setup as reg_setup, compare_models as reg_compare, create_model as reg_create
    from pycaret.regression import tune_model as reg_tune, finalize_model as reg_finalize, predict_model as reg_predict
    from pycaret.regression import pull as reg_pull, plot_model as reg_plot, evaluate_model as reg_evaluate
    PYCARET_AVAILABLE = True
except ImportError:
    PYCARET_AVAILABLE = False
    # NOTE: no st.warning() here — st.set_page_config() below must be the
    # first Streamlit command; an early st.* call would raise.
    # The AutoML page reports the missing dependency itself.
# Data profiling
#try:
# from ydata_profiling import ProfileReport
# from streamlit_pandas_profiling import st_profile_report
# PROFILING_AVAILABLE = True
#except ImportError:
# PROFILING_AVAILABLE = False
# PyTorch for deep learning (optional dependency).
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    TORCH_AVAILABLE = True
except ImportError:
    # Deep-learning features are disabled when torch is not installed.
    TORCH_AVAILABLE = False
# SHAP for explainability (optional dependency).
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    # The Explainability page degrades gracefully without shap.
    SHAP_AVAILABLE = False
# NOTE: a duplicate scikit-learn import block was removed here — every name
# it brought in (LabelEncoder, StandardScaler, the linear/tree/ensemble
# models and the regression/classification metrics) is already imported
# once at the top of this file.
# ================== CUSTOM CSS & STYLING ==================
# Must be the first Streamlit command executed in the script.
st.set_page_config(
page_title="🚀 Super Data Science App",
layout="wide",
initial_sidebar_state="expanded",
page_icon="🚀"
)
# Inject global CSS: gradient background, styled buttons/metrics/headers.
st.markdown("""
<style>
/* Main styling */
.main {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
font-family: 'Arial', sans-serif;
}
/* Sidebar styling */
.sidebar .sidebar-content {
background: linear-gradient(180deg, #2C3E50, #3498DB);
color: white;
}
/* Button styling */
.stButton > button {
background: linear-gradient(45deg, #FF6B6B, #4ECDC4);
color: white;
border: none;
border-radius: 25px;
padding: 0.6rem 1.5rem;
font-weight: bold;
transition: all 0.3s ease;
box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}
.stButton > button:hover {
transform: translateY(-2px);
box-shadow: 0 8px 25px 0 rgba(31, 38, 135, 0.37);
}
/* Metric styling */
.metric-container {
background: rgba(255, 255, 255, 0.1);
backdrop-filter: blur(10px);
border-radius: 15px;
padding: 1rem;
margin: 0.5rem 0;
border: 1px solid rgba(255, 255, 255, 0.2);
}
/* Header styling */
.main-header {
text-align: center;
padding: 2rem 0;
background: rgba(255, 255, 255, 0.1);
backdrop-filter: blur(10px);
border-radius: 20px;
margin-bottom: 2rem;
border: 1px solid rgba(255, 255, 255, 0.2);
}
/* Success/Error messages */
.stSuccess, .stError, .stWarning {
border-radius: 10px;
border: none;
}
</style>
""", unsafe_allow_html=True)
# ================== HEADER ==================
# Hero banner rendered as raw HTML inside the styled .main-header container.
st.markdown("""
<div class="main-header">
<h1 style="color: white; font-size: 3rem; margin-bottom: 0;">🚀 Super Data Science App</h1>
<p style="color: rgba(255,255,255,0.8); font-size: 1.2rem;">
Complete ML Pipeline: EDA → Modeling → AutoML → Explainability → Deployment
</p>
</div>
""", unsafe_allow_html=True)
# ================== AUTHENTICATION ==================
def check_authentication():
    """Gate the whole app behind a password or a one-click demo mode.

    Persists the flag in ``st.session_state.authenticated`` and halts the
    script with ``st.stop()`` until the user authenticates; ``st.rerun()``
    re-executes the script after a successful login.
    """
    if 'authenticated' not in st.session_state:
        st.session_state.authenticated = False
    if not st.session_state.authenticated:
        with st.sidebar:
            st.header("🔒 Authentication")
            password = st.text_input("Enter Password", type="password", key="auth_password")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🔑 Login", key="login_btn"):
                    # NOTE(review): hard-coded password — acceptable for a
                    # demo, but should come from st.secrets/env in production.
                    if password == "ds4everyone":
                        st.session_state.authenticated = True
                        st.success("✅ Access Granted!")
                        st.rerun()
                    else:
                        st.error("❌ Incorrect Password")
            with col2:
                if st.button("👤 Demo Mode", key="demo_btn"):
                    # Demo mode grants access without a password.
                    st.session_state.authenticated = True
                    st.session_state.demo_mode = True
                    st.info("📊 Demo Mode Activated")
                    st.rerun()
        st.info("🔐 Please authenticate to access the application")
        st.stop()
check_authentication()
# ================== SESSION STATE INITIALIZATION ==================
# Table-driven defaults for every session_state key the pages read later.
# 'model_comparison' is initialized here as well: it was previously only
# set deep inside the Data Loading page, so reading it on the PyCaret page
# could raise AttributeError if that page was never visited.
_SESSION_DEFAULTS = {
    'df': None,
    'trained_models': {},
    'pycaret_setup_done': False,
    'best_model': None,
    'dl_models': {},
    'training_history': {},
    'model_comparison': None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# ================== SIDEBAR NAVIGATION ==================
# One entry per page; the big if/elif chain below dispatches on the label.
st.sidebar.title("🧭 Navigation")
pages = [
"🏠 Home",
"📊 Data Loading",
"🔍 EDA & Profiling",
"📈 Visualization",
"🤖 Classical ML",
"⚡ PyCaret AutoML",
"🧠 Deep Learning",
"🎯 Model Evaluation",
"🔬 Explainability",
"📋 MLflow Tracking",
"🚀 Model Deployment"
]
selected_page = st.sidebar.selectbox("Select Page", pages, key="page_selector")
# ================== UTILITY FUNCTIONS ==================
def load_sample_data(dataset_name):
    """Load a built-in sample dataset by name.

    Parameters
    ----------
    dataset_name : str
        One of "California Housing", "Iris", "Wine Quality", "Titanic".

    Returns
    -------
    pandas.DataFrame or None
        The dataset (subsampled for performance where large), or ``None``
        when a remote dataset cannot be fetched or the name is unknown.
    """
    if dataset_name == "California Housing":
        from sklearn.datasets import fetch_california_housing
        data = fetch_california_housing(as_frame=True)
        df = pd.concat([data.data, data.target.rename('MedHouseVal')], axis=1)
        return df.sample(n=min(2000, len(df)))  # Limit for performance
    elif dataset_name == "Iris":
        from sklearn.datasets import load_iris
        data = load_iris(as_frame=True)
        df = pd.concat([data.data, data.target.rename('species')], axis=1)
        return df
    elif dataset_name == "Wine Quality":
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
        try:
            df = pd.read_csv(url, sep=';')
            return df.sample(n=min(1000, len(df)))
        # Was a bare `except:` — that also swallows SystemExit/KeyboardInterrupt.
        except Exception:
            st.error("Could not load Wine Quality dataset")
            return None
    elif dataset_name == "Titanic":
        url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
        try:
            df = pd.read_csv(url)
            return df
        except Exception:
            st.error("Could not load Titanic dataset")
            return None
    # Unknown dataset name: make the implicit fall-through explicit.
    return None
def get_dataset_info(df):
    """Summarize a DataFrame for display.

    Returns a dict with shape, column list, dtypes, per-column missing
    counts, approximate deep memory footprint, and the numeric vs.
    non-numeric column split.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(exclude=[np.number]).columns.tolist()
    mem_mb = df.memory_usage(deep=True).sum() / 1024 ** 2
    return {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'dtypes': df.dtypes.to_dict(),
        'missing_values': df.isnull().sum().to_dict(),
        'memory_usage': f"{mem_mb:.2f} MB",
        'numeric_columns': numeric,
        'categorical_columns': categorical,
    }
# ================== PAGE CONTENT ==================
# Home page: feature overview plus quick stats when data is loaded.
if selected_page == "🏠 Home":
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.markdown("""
## Welcome to the Super Data Science App! 🎉
This comprehensive application provides a complete machine learning pipeline:
""")
        features = [
            "📊 **Data Loading**: Upload CSV or use sample datasets",
            "🔍 **EDA & Profiling**: Automated data profiling and exploration",
            "📈 **Visualization**: Interactive charts with Plotly and Seaborn",
            "🤖 **Classical ML**: Scikit-learn models with hyperparameter tuning",
            "⚡ **PyCaret AutoML**: Automated machine learning with model comparison",
            "🎯 **Model Evaluation**: Comprehensive model performance analysis",
            "🔬 **Explainability**: SHAP values and feature importance",
            "📋 **MLflow Tracking**: Experiment tracking and model versioning",
            "🚀 **Model Deployment**: Model export and deployment preparation"
        ]
        for feature in features:
            st.markdown(feature)
        st.markdown("---")
        # Quick stats — only shown once a dataset has been loaded.
        if st.session_state.df is not None:
            col_a, col_b, col_c, col_d = st.columns(4)
            with col_a:
                st.metric("📊 Rows", f"{st.session_state.df.shape[0]:,}")
            with col_b:
                st.metric("📋 Columns", f"{st.session_state.df.shape[1]:,}")
            with col_c:
                st.metric("🤖 Models Trained", len(st.session_state.trained_models))
            with col_d:
                st.metric("✅ Setup Complete", "Ready" if st.session_state.pycaret_setup_done else "Pending")
# Data Loading page: CSV upload, sample datasets, and a data preview.
elif selected_page == "📊 Data Loading":
    st.header("📊 Data Loading & Management")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Data Source")
        data_source = st.radio(
            "Choose data source:",
            ["📁 Upload CSV", "🎲 Sample Datasets", "📋 Current Data Info"]
        )
    with col2:
        if data_source == "📁 Upload CSV":
            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
            if uploaded_file is not None:
                try:
                    df = pd.read_csv(uploaded_file)
                    st.session_state.df = df
                    st.success(f"✅ Successfully loaded {df.shape[0]} rows and {df.shape[1]} columns")
                except Exception as e:
                    st.error(f"❌ Error loading file: {str(e)}")
            # NOTE(review): this session-state default probably belongs with
            # the other initializations near the top of the file — as written
            # it only runs when this branch of this page is rendered.
            if 'model_comparison' not in st.session_state:
                st.session_state.model_comparison = None
        elif data_source == "🎲 Sample Datasets":
            sample_options = ["California Housing", "Iris", "Wine Quality", "Titanic"]
            selected_sample = st.selectbox("Choose sample dataset:", sample_options)
            if st.button(f"🔄 Load {selected_sample} Dataset"):
                with st.spinner(f"Loading {selected_sample}..."):
                    df = load_sample_data(selected_sample)
                    if df is not None:
                        st.session_state.df = df
                        st.success(f"✅ Loaded {selected_sample} dataset!")
        elif data_source == "📋 Current Data Info":
            if st.session_state.df is not None:
                info = get_dataset_info(st.session_state.df)
                col_a, col_b = st.columns(2)
                with col_a:
                    st.metric("📊 Rows", f"{info['shape'][0]:,}")
                    st.metric("📋 Columns", f"{info['shape'][1]:,}")
                    st.metric("💾 Memory Usage", info['memory_usage'])
                with col_b:
                    st.metric("🔢 Numeric Columns", len(info['numeric_columns']))
                    st.metric("📝 Categorical Columns", len(info['categorical_columns']))
                    st.metric("❌ Missing Values", sum(info['missing_values'].values()))
            else:
                st.info("🔍 No data loaded yet")
    # Data Preview
    if st.session_state.df is not None:
        st.subheader("📋 Data Preview")
        col1, col2, col3 = st.columns(3)
        with col1:
            show_rows = st.slider("Rows to display", 5, 50, 10)
        with col2:
            show_info = st.checkbox("Show column info", value=True)
        with col3:
            if st.button("💾 Download Current Data"):
                csv = st.session_state.df.to_csv(index=False)
                # NOTE(review): a download_button nested inside a regular
                # button only survives a single rerun — confirm intended.
                st.download_button(
                    label="📥 Download CSV",
                    data=csv,
                    file_name=f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime='text/csv'
                )
        # Display data
        st.dataframe(st.session_state.df.head(show_rows), use_container_width=True)
        if show_info:
            st.subheader("📊 Column Information")
            # Per-column dtype / null-count / missing-percentage table.
            info_df = pd.DataFrame({
                'Column': st.session_state.df.columns,
                'Data Type': st.session_state.df.dtypes,
                'Non-Null Count': st.session_state.df.count(),
                'Missing Values': st.session_state.df.isnull().sum(),
                'Missing %': (st.session_state.df.isnull().sum() / len(st.session_state.df) * 100).round(2)
            })
            st.dataframe(info_df, use_container_width=True)
# EDA page: quick stats, missing-value analysis, distributions, correlations.
elif selected_page == "🔍 EDA & Profiling":
    st.header("🔍 Exploratory Data Analysis & Profiling")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Quick EDA
    st.subheader("📊 Quick Statistics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📏 Dataset Shape", f"{df.shape[0]} × {df.shape[1]}")
    with col2:
        st.metric("🔢 Numeric Columns", len(df.select_dtypes(include=[np.number]).columns))
    with col3:
        st.metric("📝 Text Columns", len(df.select_dtypes(exclude=[np.number]).columns))
    with col4:
        st.metric("❌ Missing Values", df.isnull().sum().sum())
    # Missing Values Analysis
    st.subheader("❌ Missing Values Analysis")
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Missing Count': df.isnull().sum(),
        'Missing Percentage': (df.isnull().sum() / len(df) * 100).round(2)
    }).sort_values('Missing Count', ascending=False)
    # Only show columns that actually have missing values.
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    if len(missing_df) > 0:
        st.dataframe(missing_df, use_container_width=True)
        # Missing values heatmap
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis')
        plt.title('Missing Values Heatmap')
        st.pyplot(fig)
    else:
        st.success("✅ No missing values found in the dataset!")
    # Statistical Summary
    st.subheader("📈 Statistical Summary")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        st.dataframe(df[numeric_cols].describe(), use_container_width=True)
        # Distribution plots
        st.subheader("📊 Distribution Analysis")
        selected_cols = st.multiselect("Select columns for distribution analysis:", numeric_cols, default=numeric_cols[:3])
        if selected_cols:
            # Lay histograms out in a 2-column grid.
            cols_per_row = 2
            n_rows = (len(selected_cols) + cols_per_row - 1) // cols_per_row
            fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 5*n_rows))
            # Normalize `axes` to a flat indexable sequence for any grid shape.
            if n_rows == 1:
                axes = [axes] if cols_per_row == 1 else axes
            else:
                axes = axes.flatten()
            for i, col in enumerate(selected_cols):
                sns.histplot(data=df, x=col, kde=True, ax=axes[i])
                axes[i].set_title(f'Distribution of {col}')
            # Hide empty subplots
            for i in range(len(selected_cols), len(axes)):
                axes[i].set_visible(False)
            plt.tight_layout()
            st.pyplot(fig)
    # Correlation Analysis
    if len(numeric_cols) > 1:
        st.subheader("🔗 Correlation Analysis")
        corr_matrix = df[numeric_cols].corr()
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', ax=ax)
        plt.title('Correlation Matrix')
        st.pyplot(fig)
    # Automated Profiling Report (disabled: ydata_profiling not installed)
    # if PROFILING_AVAILABLE:
    #     st.subheader("📋 Automated Profiling Report")
    #     if st.button("🔄 Generate Comprehensive Profile Report"):
    #         with st.spinner("Generating detailed profiling report..."):
    #             profile = ProfileReport(df, title="Dataset Profiling Report", explorative=True)
    #             st_profile_report(profile)
# Visualization page: user-selected interactive Plotly charts.
elif selected_page == "📈 Visualization":
    st.header("📈 Interactive Data Visualization")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    # Visualization controls
    st.subheader("🎛️ Visualization Controls")
    col1, col2, col3 = st.columns(3)
    with col1:
        viz_type = st.selectbox("Select visualization type:", [
            "📊 Histogram", "📈 Scatter Plot", "📦 Box Plot",
            "🔥 Heatmap", "📉 Line Plot", "🎯 Pair Plot"
        ])
    with col2:
        # Two-axis chart types need both x and y; others a single column.
        if viz_type in ["📈 Scatter Plot", "📉 Line Plot"]:
            x_col = st.selectbox("X-axis:", numeric_cols + categorical_cols)
            y_col = st.selectbox("Y-axis:", numeric_cols)
        else:
            selected_col = st.selectbox("Select column:", numeric_cols if viz_type != "📦 Box Plot" else df.columns)
    with col3:
        # Optional categorical hue for the chart types that support it.
        if categorical_cols and viz_type in ["📊 Histogram", "📈 Scatter Plot", "📦 Box Plot"]:
            color_col = st.selectbox("Color by (optional):", ["None"] + categorical_cols)
            color_col = None if color_col == "None" else color_col
        else:
            color_col = None
    # Generate visualizations
    st.subheader("📊 Visualization Output")
    try:
        if viz_type == "📊 Histogram":
            fig = px.histogram(df, x=selected_col, color=color_col,
                               title=f'Distribution of {selected_col}',
                               marginal="box")
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "📈 Scatter Plot":
            # OLS trendline only when no hue is chosen (one line, not per group).
            fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                             title=f'{y_col} vs {x_col}',
                             trendline="ols" if color_col is None else None)
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "📦 Box Plot":
            if color_col:
                fig = px.box(df, y=selected_col, x=color_col,
                             title=f'Box Plot of {selected_col} by {color_col}')
            else:
                fig = px.box(df, y=selected_col,
                             title=f'Box Plot of {selected_col}')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "🔥 Heatmap":
            if len(numeric_cols) > 1:
                corr_matrix = df[numeric_cols].corr()
                fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                title="Correlation Heatmap")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need at least 2 numeric columns for correlation heatmap")
        elif viz_type == "📉 Line Plot":
            # Sort by x so the line is monotone along the x-axis.
            fig = px.line(df.sort_values(x_col), x=x_col, y=y_col,
                          title=f'{y_col} vs {x_col} (Line Plot)')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "🎯 Pair Plot":
            if len(numeric_cols) >= 2:
                selected_numeric = st.multiselect("Select numeric columns for pair plot:",
                                                  numeric_cols, default=numeric_cols[:4])
                if len(selected_numeric) >= 2:
                    fig = px.scatter_matrix(df, dimensions=selected_numeric, color=color_col,
                                            title="Pair Plot Matrix")
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.warning("Please select at least 2 numeric columns")
            else:
                st.warning("Need at least 2 numeric columns for pair plot")
    except Exception as e:
        st.error(f"Error generating visualization: {str(e)}")
    # Additional visualizations
    st.subheader("📊 Additional Insights")
    # Value counts for categorical columns
    if categorical_cols:
        st.write("**Categorical Column Distributions:**")
        for col in categorical_cols[:3]:  # Limit to first 3
            if df[col].nunique() <= 20:  # Only show if not too many categories
                fig = px.bar(df[col].value_counts().head(10),
                             title=f'Top 10 values in {col}')
                st.plotly_chart(fig, use_container_width=True)
# Classical ML page: configure, train and report a single sklearn model.
elif selected_page == "🤖 Classical ML":
    st.header("🤖 Classical Machine Learning")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Model configuration
    st.subheader("⚙️ Model Configuration")
    col1, col2 = st.columns(2)
    with col1:
        # Target selection
        target_col = st.selectbox("🎯 Select target variable:", df.columns)
        # Feature selection
        available_features = [col for col in df.columns if col != target_col]
        selected_features = st.multiselect("📊 Select features:", available_features,
                                           default=available_features[:5])
    with col2:
        # Problem type detection: non-numeric target or <10 distinct values
        # is treated as classification.
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "Classification"
            st.info("🎯 Detected: Classification Problem")
            model_options = ["Logistic Regression", "Decision Tree", "Random Forest"]
        else:
            problem_type = "Regression"
            st.info("📈 Detected: Regression Problem")
            model_options = ["Linear Regression", "Decision Tree", "Random Forest"]
        selected_model = st.selectbox("🤖 Select model:", model_options)
        test_size = st.slider("🔄 Test set size:", 0.1, 0.5, 0.2, 0.05)
    if not selected_features:
        st.warning("⚠️ Please select at least one feature")
        st.stop()
    # Data preprocessing
    if st.button("🚀 Train Model"):
        with st.spinner("Training model..."):
            try:
                # Prepare data
                X = df[selected_features].copy()
                y = df[target_col].copy()
                # Handle missing values.
                # NOTE(review): frame-wide fillna(X.mean()) only fills numeric
                # columns; object columns keep their NaNs until label-encoded
                # as the string "nan" below — confirm this is intended.
                X = X.fillna(X.mean() if X.select_dtypes(include=[np.number]).shape[1] > 0 else X.mode().iloc[0])
                # Encode categorical variables (one LabelEncoder per column,
                # kept in le_dict for potential inverse transforms).
                le_dict = {}
                for col in X.select_dtypes(include=['object']).columns:
                    le = LabelEncoder()
                    X[col] = le.fit_transform(X[col].astype(str))
                    le_dict[col] = le
                # Encode target if classification
                if problem_type == "Classification" and y.dtype == 'object':
                    target_le = LabelEncoder()
                    y = target_le.fit_transform(y)
                # Split data
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=42
                )
                # Instantiate the selected estimator.
                if selected_model == "Linear Regression":
                    model = LinearRegression()
                elif selected_model == "Logistic Regression":
                    model = LogisticRegression(random_state=42, max_iter=1000)
                elif selected_model == "Decision Tree":
                    if problem_type == "Classification":
                        model = DecisionTreeClassifier(random_state=42)
                    else:
                        model = DecisionTreeRegressor(random_state=42)
                elif selected_model == "Random Forest":
                    if problem_type == "Classification":
                        model = RandomForestClassifier(random_state=42, n_estimators=100)
                    else:
                        model = RandomForestRegressor(random_state=42, n_estimators=100)
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                # Store model and test artifacts for the evaluation page.
                st.session_state.trained_models[selected_model] = {
                    'model': model,
                    'X_test': X_test,
                    'y_test': y_test,
                    'predictions': predictions,
                    'features': selected_features,
                    'target': target_col,
                    'problem_type': problem_type
                }
                st.success("✅ Model trained successfully!")
                # Display results
                st.subheader("📊 Model Performance")
                if problem_type == "Regression":
                    mse = mean_squared_error(y_test, predictions)
                    mae = mean_absolute_error(y_test, predictions)
                    r2 = r2_score(y_test, predictions)
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("MSE", f"{mse:.4f}")
                    with col2:
                        st.metric("MAE", f"{mae:.4f}")
                    with col3:
                        st.metric("R² Score", f"{r2:.4f}")
                    # Actual vs Predicted plot with a y=x reference line.
                    fig = px.scatter(x=y_test, y=predictions,
                                     labels={'x': 'Actual', 'y': 'Predicted'},
                                     title='Actual vs Predicted Values')
                    fig.add_shape(type="line", x0=y_test.min(), y0=y_test.min(),
                                  x1=y_test.max(), y1=y_test.max(),
                                  line=dict(color="red", dash="dash"))
                    st.plotly_chart(fig, use_container_width=True)
                else:  # Classification
                    accuracy = accuracy_score(y_test, predictions)
                    # Weighted averaging handles multiclass targets.
                    precision = precision_score(y_test, predictions, average='weighted')
                    recall = recall_score(y_test, predictions, average='weighted')
                    f1 = f1_score(y_test, predictions, average='weighted')
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Accuracy", f"{accuracy:.4f}")
                    with col2:
                        st.metric("Precision", f"{precision:.4f}")
                    with col3:
                        st.metric("Recall", f"{recall:.4f}")
                    with col4:
                        st.metric("F1-Score", f"{f1:.4f}")
                    # Confusion Matrix
                    cm = confusion_matrix(y_test, predictions)
                    fig, ax = plt.subplots(figsize=(8, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                    ax.set_title('Confusion Matrix')
                    ax.set_xlabel('Predicted')
                    ax.set_ylabel('Actual')
                    st.pyplot(fig)
                # Feature importance (for tree-based models)
                if hasattr(model, 'feature_importances_'):
                    st.subheader("📊 Feature Importance")
                    importance_df = pd.DataFrame({
                        'Feature': selected_features,
                        'Importance': model.feature_importances_
                    }).sort_values('Importance', ascending=False)
                    fig = px.bar(importance_df, x='Importance', y='Feature',
                                 orientation='h', title='Feature Importance')
                    st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"❌ Error training model: {str(e)}")
# PyCaret AutoML page: setup → compare → create → tune → finalize pipeline.
elif selected_page == "⚡ PyCaret AutoML":
    st.header("⚡ PyCaret AutoML")
    if not PYCARET_AVAILABLE:
        st.error("❌ PyCaret is not installed. Please install it to use AutoML features.")
        st.stop()
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # AutoML Configuration
    st.subheader("⚙️ AutoML Configuration")
    col1, col2 = st.columns(2)
    with col1:
        target_col = st.selectbox("🎯 Select target variable:", df.columns, key="pycaret_target")
        # Auto-detect problem type (same heuristic as the Classical ML page).
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "classification"
            st.info("🎯 Detected: Classification Problem")
        else:
            problem_type = "regression"
            st.info("📈 Detected: Regression Problem")
    with col2:
        train_size = st.slider("🔄 Training set size:", 0.5, 0.9, 0.8, 0.05)
        sample_size = st.slider("📊 Sample size (for performance):", 500, min(5000, len(df)), min(2000, len(df)))
    # Subsample large datasets so setup/compare stay responsive.
    if len(df) > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
        st.info(f"📊 Using {sample_size} samples for faster processing")
    else:
        df_sample = df.copy()
    # Advanced settings
    with st.expander("🔧 Advanced Settings"):
        col1, col2 = st.columns(2)
        with col1:
            cross_validation = st.checkbox("🔄 Cross Validation", value=True)
            normalize = st.checkbox("📏 Normalize Features", value=True)
        with col2:
            remove_outliers = st.checkbox("🚫 Remove Outliers", value=False)
            feature_selection = st.checkbox("🎯 Feature Selection", value=False)
    # Setup PyCaret Environment
    if st.button("🚀 Setup PyCaret Environment"):
        with st.spinner("Setting up PyCaret environment..."):
            try:
                # NOTE(review): `silent=True` was removed in PyCaret 3.x —
                # confirm the installed PyCaret version accepts it.
                if problem_type == "classification":
                    st.session_state.pycaret_exp = cls_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                else:
                    st.session_state.pycaret_exp = reg_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                st.session_state.pycaret_setup_done = True
                st.session_state.pycaret_problem_type = problem_type
                st.success("✅ PyCaret environment setup complete!")
            except Exception as e:
                st.error(f"❌ Error setting up PyCaret: {str(e)}")
    # Model Comparison — only meaningful after setup succeeded.
    if st.session_state.pycaret_setup_done:
        st.subheader("📊 Model Comparison")
        if st.button("🔄 Compare Models"):
            with st.spinner("Comparing multiple models..."):
                try:
                    if st.session_state.pycaret_problem_type == "classification":
                        comparison_df = cls_compare(
                            include=['lr', 'rf', 'et', 'nb', 'dt', 'svm'],
                            sort='Accuracy',
                            n_select=5
                        )
                        # pull() retrieves the last scoring grid as a DataFrame.
                        st.session_state.model_comparison = cls_pull()
                    else:
                        comparison_df = reg_compare(
                            include=['lr', 'rf', 'et', 'dt', 'huber'],
                            sort='R2',
                            n_select=5
                        )
                        st.session_state.model_comparison = reg_pull()
                    st.success("✅ Model comparison complete!")
                except Exception as e:
                    st.error(f"❌ Error comparing models: {str(e)}")
        # Display comparison results
        # NOTE(review): assumes st.session_state.model_comparison exists;
        # raises AttributeError if it was never initialized — verify.
        if st.session_state.model_comparison is not None:
            st.subheader("📈 Model Comparison Results")
            st.dataframe(st.session_state.model_comparison, use_container_width=True)
            # Select best model
            best_model_name = st.selectbox(
                "🏆 Select model for tuning:",
                ['lr', 'rf', 'et', 'dt', 'nb', 'svm'] if st.session_state.pycaret_problem_type == "classification"
                else ['lr', 'rf', 'et', 'dt', 'huber']
            )
            # Create and tune model
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🎯 Create Model"):
                    with st.spinner("Creating model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                model = cls_create(best_model_name)
                            else:
                                model = reg_create(best_model_name)
                            st.session_state.pycaret_model = model
                            st.success("✅ Model created successfully!")
                        except Exception as e:
                            st.error(f"❌ Error creating model: {str(e)}")
            with col2:
                if st.button("⚡ Tune Hyperparameters"):
                    if 'pycaret_model' in st.session_state:
                        with st.spinner("Tuning hyperparameters..."):
                            try:
                                if st.session_state.pycaret_problem_type == "classification":
                                    tuned_model = cls_tune(st.session_state.pycaret_model,
                                                           optimize='Accuracy', n_iter=10)
                                else:
                                    tuned_model = reg_tune(st.session_state.pycaret_model,
                                                           optimize='R2', n_iter=10)
                                st.session_state.tuned_model = tuned_model
                                st.success("✅ Hyperparameter tuning complete!")
                            except Exception as e:
                                st.error(f"❌ Error tuning model: {str(e)}")
                    else:
                        st.warning("⚠️ Please create a model first")
            # Finalize model: prefer the tuned model, fall back to the
            # un-tuned one, otherwise prompt the user.
            if st.button("🏁 Finalize Best Model"):
                if 'tuned_model' in st.session_state:
                    model_to_finalize = st.session_state.tuned_model
                elif 'pycaret_model' in st.session_state:
                    model_to_finalize = st.session_state.pycaret_model
                else:
                    st.warning("⚠️ Please create a model first")
                    model_to_finalize = None
                if model_to_finalize is not None:
                    with st.spinner("Finalizing model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                final_model = cls_finalize(model_to_finalize)
                            else:
                                final_model = reg_finalize(model_to_finalize)
                            st.session_state.best_model = final_model
                            st.success("✅ Model finalized successfully!")
                        except Exception as e:
                            st.error(f"❌ Error finalizing model: {str(e)}")
# Model Evaluation page: metrics/plots for classical or PyCaret models.
elif selected_page == "🎯 Model Evaluation":
    st.header("🎯 Advanced Model Evaluation")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first")
        st.stop()
    # Check for available models
    available_models = []
    if st.session_state.trained_models:
        available_models.extend(list(st.session_state.trained_models.keys()))
    if 'best_model' in st.session_state and st.session_state.best_model is not None:
        available_models.append("PyCaret Best Model")
    if not available_models:
        st.warning("⚠️ No trained models available. Please train a model first.")
        st.stop()
    selected_model_name = st.selectbox("📊 Select model to evaluate:", available_models)
    if selected_model_name == "PyCaret Best Model":
        if 'best_model' not in st.session_state:
            st.error("❌ PyCaret model not available")
            st.stop()
        model_info = st.session_state.best_model
        problem_type = st.session_state.get('pycaret_problem_type', 'regression')
        st.subheader("📈 PyCaret Model Evaluation")
        # PyCaret built-in plots
        if PYCARET_AVAILABLE:
            col1, col2 = st.columns(2)
            with col1:
                # Plot menus differ per task type.
                plot_types_cls = ['auc', 'confusion_matrix', 'class_report', 'pr', 'feature']
                plot_types_reg = ['residuals', 'feature', 'rfe', 'learning', 'vc']
                plot_types = plot_types_cls if problem_type == "classification" else plot_types_reg
                selected_plot = st.selectbox("📊 Select evaluation plot:", plot_types)
            with col2:
                if st.button("📊 Generate Plot"):
                    try:
                        with st.spinner("Generating plot..."):
                            if problem_type == "classification":
                                cls_plot(model_info, plot=selected_plot, display_format='streamlit')
                            else:
                                reg_plot(model_info, plot=selected_plot, display_format='streamlit')
                    except Exception as e:
                        st.error(f"❌ Error generating plot: {str(e)}")
        # Model predictions (on PyCaret's held-out set) with CSV download.
        if st.button("🔮 Generate Predictions"):
            try:
                with st.spinner("Generating predictions..."):
                    if problem_type == "classification":
                        predictions_df = cls_predict(model_info)
                    else:
                        predictions_df = reg_predict(model_info)
                    st.subheader("🔮 Model Predictions")
                    st.dataframe(predictions_df.head(20), use_container_width=True)
                    # Download predictions
                    csv = predictions_df.to_csv(index=False)
                    st.download_button(
                        label="📥 Download Predictions",
                        data=csv,
                        file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime='text/csv'
                    )
            except Exception as e:
                st.error(f"❌ Error generating predictions: {str(e)}")
    else:
        # Classical ML model evaluation — artifacts were stored on the
        # Classical ML page under st.session_state.trained_models.
        model_data = st.session_state.trained_models[selected_model_name]
        model = model_data['model']
        X_test = model_data['X_test']
        y_test = model_data['y_test']
        predictions = model_data['predictions']
        problem_type = model_data['problem_type']
        st.subheader(f"📊 {selected_model_name} Evaluation")
        if problem_type == "Regression":
            # Regression metrics
            mse = mean_squared_error(y_test, predictions)
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)
            rmse = np.sqrt(mse)
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("RMSE", f"{rmse:.4f}")
            with col2:
                st.metric("MAE", f"{mae:.4f}")
            with col3:
                st.metric("R² Score", f"{r2:.4f}")
            with col4:
                st.metric("MSE", f"{mse:.4f}")
            # Residual analysis
            residuals = y_test - predictions
            col1, col2 = st.columns(2)
            with col1:
                # Residual plot
                fig = px.scatter(x=predictions, y=residuals,
                                 labels={'x': 'Predicted', 'y': 'Residuals'},
                                 title='Residual Plot')
                fig.add_hline(y=0, line_dash="dash", line_color="red")
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                # Residual distribution
                fig = px.histogram(residuals, title='Residual Distribution',
                                   labels={'value': 'Residuals', 'count': 'Frequency'})
                st.plotly_chart(fig, use_container_width=True)
        else:
            # Classification metrics (weighted averaging for multiclass).
            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions, average='weighted')
            recall = recall_score(y_test, predictions, average='weighted')
            f1 = f1_score(y_test, predictions, average='weighted')
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Accuracy", f"{accuracy:.4f}")
            with col2:
                st.metric("Precision", f"{precision:.4f}")
            with col3:
                st.metric("Recall", f"{recall:.4f}")
            with col4:
                st.metric("F1-Score", f"{f1:.4f}")
# ---- Page: SHAP-based model explainability for classical (non-PyCaret) models.
elif selected_page == "🔬 Explainability":
st.header("🔬 Model Explainability with SHAP")
if not SHAP_AVAILABLE:
st.warning("⚠️ SHAP is not installed. Explainability features are limited.")
st.stop()
if st.session_state.df is None:
st.warning("⚠️ Please load data first")
st.stop()
# Check for available models
if not st.session_state.trained_models and 'best_model' not in st.session_state:
st.warning("⚠️ No trained models available. Please train a model first.")
st.stop()
# Select model for explanation
available_models = list(st.session_state.trained_models.keys())
if 'best_model' in st.session_state:
available_models.append("PyCaret Best Model")
selected_model = st.selectbox("🤖 Select model to explain:", available_models)
# SHAP analysis is only wired up for classical models; the PyCaret option falls through.
if selected_model != "PyCaret Best Model":
model_data = st.session_state.trained_models[selected_model]
model = model_data['model']
features = model_data['features']
X_test = model_data['X_test']
# SHAP Explanation
st.subheader("🔬 SHAP Analysis")
try:
# Create SHAP explainer
with st.spinner("Creating SHAP explainer..."):
explainer = shap.Explainer(model, X_test.iloc[:100])  # Use subset for performance
shap_values = explainer(X_test.iloc[:100])
# Global feature importance
st.subheader("🌍 Global Feature Importance")
# NOTE(review): not every shap.plots.* function accepts an `ax` kwarg in all
# shap releases — verify against the installed version; failures land in the
# except handler below.
fig, ax = plt.subplots()
shap.plots.bar(shap_values, ax=ax, show=False)
st.pyplot(fig)
# Summary plot
st.subheader("📊 Feature Impact Summary")
fig, ax = plt.subplots()
shap.plots.beeswarm(shap_values, ax=ax, show=False)
st.pyplot(fig)
# Individual prediction explanation
st.subheader("🔍 Individual Prediction Explanation")
# NOTE(review): shap_values only covers the first 100 rows (computed above),
# but this slider allows indices up to len(X_test)-1 — selecting an index
# >= 100 raises IndexError (swallowed by the except below). Cap the slider
# at min(len(X_test), 100) - 1 to fix.
instance_idx = st.slider("Select instance:", 0, len(X_test)-1, 0)
fig, ax = plt.subplots()
shap.plots.waterfall(shap_values[instance_idx], ax=ax, show=False)
st.pyplot(fig)
# Feature dependence
if len(features) > 1:
st.subheader("📈 Feature Dependence")
feature_for_dependence = st.selectbox("Select feature:", features)
if feature_for_dependence in X_test.columns:
fig, ax = plt.subplots()
shap.plots.scatter(shap_values[:, feature_for_dependence], ax=ax, show=False)
st.pyplot(fig)
except Exception as e:
st.error(f"❌ Error generating SHAP explanations: {str(e)}")
st.info("💡 SHAP works best with tree-based models (Random Forest, XGBoost, etc.)")
# ---- Page: MLflow experiment tracking — configure a tracking server, log
# trained models with their params/metrics, and list recent runs.
elif selected_page == "📋 MLflow Tracking":
st.header("📋 MLflow Experiment Tracking")
if not MLFLOW_AVAILABLE:
st.warning("⚠️ MLflow is not installed. Install it to use experiment tracking.")
st.stop()
# MLflow Configuration
st.subheader("⚙️ MLflow Configuration")
col1, col2 = st.columns(2)
with col1:
tracking_uri = st.text_input("🔗 Tracking URI:", "http://localhost:5000")
experiment_name = st.text_input("🧪 Experiment Name:", "super_app_experiments")
with col2:
if st.button("🔧 Set MLflow Configuration"):
try:
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
st.success("✅ MLflow configuration set!")
except Exception as e:
st.error(f"❌ Error setting MLflow: {str(e)}")
# Log current models
st.subheader("📊 Log Models to MLflow")
if st.session_state.trained_models:
model_to_log = st.selectbox("Select model to log:", list(st.session_state.trained_models.keys()))
if st.button("📤 Log Model"):
try:
# One run per click; run name embeds a timestamp for uniqueness.
with mlflow.start_run(run_name=f"{model_to_log}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
model_data = st.session_state.trained_models[model_to_log]
model = model_data['model']
# Log model
mlflow.sklearn.log_model(model, "model")
# Log parameters
mlflow.log_param("model_type", model_to_log)
mlflow.log_param("features", model_data['features'])
mlflow.log_param("target", model_data['target'])
# Log metrics (if available)
if 'predictions' in model_data:
y_test = model_data['y_test']
predictions = model_data['predictions']
if model_data['problem_type'] == "Regression":
mlflow.log_metric("mse", mean_squared_error(y_test, predictions))
mlflow.log_metric("mae", mean_absolute_error(y_test, predictions))
mlflow.log_metric("r2", r2_score(y_test, predictions))
else:
mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))
st.success("✅ Model logged to MLflow!")
except Exception as e:
st.error(f"❌ Error logging model: {str(e)}")
# Display recent runs
st.subheader("📈 Recent Experiment Runs")
if st.button("🔄 Refresh Runs"):
try:
runs = mlflow.search_runs(order_by=["start_time desc"])
if not runs.empty:
# NOTE(review): these param/metric columns only exist once some run has
# logged them; selecting a missing column raises KeyError (caught below).
# Intersect with runs.columns to make this robust.
st.dataframe(runs[['run_id', 'status', 'start_time', 'params.model_type',
'metrics.mse', 'metrics.r2', 'metrics.accuracy']],
use_container_width=True)
else:
st.info("📊 No runs found. Start logging some models!")
except Exception as e:
st.error(f"❌ Error fetching runs: {str(e)}")
# ---- Page: model deployment — export a pickled model plus generated
# prediction-script / Dockerfile / Flask-API scaffolding for serving it.
elif selected_page == "🚀 Model Deployment":
st.header("🚀 Model Deployment & Export")
if not st.session_state.trained_models and 'best_model' not in st.session_state:
st.warning("⚠️ No trained models available for deployment.")
st.stop()
# Model selection for deployment
available_models = list(st.session_state.trained_models.keys())
if 'best_model' in st.session_state:
available_models.append("PyCaret Best Model")
selected_model = st.selectbox("🤖 Select model for deployment:", available_models)
# Model export options
st.subheader("💾 Export Options")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("📦 Export Model (Pickle)"):
try:
import pickle
if selected_model == "PyCaret Best Model":
model_to_export = st.session_state.best_model
else:
model_to_export = st.session_state.trained_models[selected_model]['model']
# Serialize model
# Security note: pickle files must only ever be loaded from trusted
# sources — unpickling executes arbitrary code.
model_bytes = pickle.dumps(model_to_export)
st.download_button(
label="📥 Download Model",
data=model_bytes,
file_name=f"{selected_model.replace(' ', '_')}_model.pkl",
mime="application/octet-stream"
)
st.success("✅ Model ready for download!")
except Exception as e:
st.error(f"❌ Error exporting model: {str(e)}")
with col2:
if st.button("📄 Generate Prediction Script"):
# Generate Python script for predictions
# NOTE(review): this f-string contains no interpolation fields; the doubled
# braces are only needed because of the f prefix — a plain string would do.
script_content = f'''
import pandas as pd
import pickle
import numpy as np
# Load the trained model
def load_model(model_path):
with open(model_path, 'rb') as f:
model = pickle.load(f)
return model
# Make predictions
def predict(model, input_data):
"""
Make predictions using the trained model
Parameters:
model: Trained model object
input_data: pandas DataFrame with features
Returns:
predictions: numpy array of predictions
"""
predictions = model.predict(input_data)
return predictions
# Example usage
if __name__ == "__main__":
# Load your model
model = load_model("path_to_your_model.pkl")
# Create sample input data (replace with your actual data)
sample_data = pd.DataFrame({{
# Add your feature columns here
# 'feature1': [value1],
# 'feature2': [value2],
}})
# Make predictions
predictions = predict(model, sample_data)
print("Predictions:", predictions)
'''
st.download_button(
label="📥 Download Script",
data=script_content,
file_name=f"{selected_model.replace(' ', '_')}_prediction_script.py",
mime="text/plain"
)
st.success("✅ Prediction script ready!")
with col3:
if st.button("🐳 Generate Dockerfile"):
# NOTE(review): the Dockerfile copies `model.pkl` / `app.py`, but the pickle
# download above is named `<model>_model.pkl` — the user must rename files
# before `docker build`; consider aligning the names.
dockerfile_content = '''
FROM python:3.9-slim
WORKDIR /app
# Copy requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy model and script
COPY model.pkl .
COPY app.py .
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "app.py"]
'''
requirements_content = '''
pandas==1.5.3
scikit-learn==1.3.0
numpy==1.24.3
flask==2.3.2
'''
col_a, col_b = st.columns(2)
with col_a:
st.download_button(
label="📥 Download Dockerfile",
data=dockerfile_content,
file_name="Dockerfile",
mime="text/plain"
)
with col_b:
st.download_button(
label="📥 Download Requirements",
data=requirements_content,
file_name="requirements.txt",
mime="text/plain"
)
st.success("✅ Docker files ready!")
# Model API endpoint generator
st.subheader("🌐 API Endpoint Generator")
if st.button("🔧 Generate Flask API"):
# Minimal Flask service: loads model.pkl once at startup, exposes /predict
# (POST, JSON body → prediction list) and /health. The f prefix forces the
# {{ }} escapes; no values are actually interpolated.
api_code = f'''
from flask import Flask, request, jsonify
import pandas as pd
import pickle
import numpy as np
app = Flask(__name__)
# Load model at startup
model = None
def load_model():
global model
with open('model.pkl', 'rb') as f:
model = pickle.load(f)
@app.route('/predict', methods=['POST'])
def predict():
try:
# Get data from request
data = request.get_json()
# Convert to DataFrame
df = pd.DataFrame([data])
# Make prediction
prediction = model.predict(df)
# Return result
return jsonify({{
'prediction': prediction.tolist(),
'status': 'success'
}})
except Exception as e:
return jsonify({{
'error': str(e),
'status': 'error'
}}), 400
@app.route('/health', methods=['GET'])
def health():
return jsonify({{'status': 'healthy'}})
if __name__ == '__main__':
load_model()
app.run(host='0.0.0.0', port=8000, debug=False)
'''
st.download_button(
label="📥 Download Flask API",
data=api_code,
file_name="app.py",
mime="text/plain"
)
st.success("✅ Flask API code ready!")
# Deployment instructions
st.subheader("📋 Deployment Instructions")
st.markdown("""
### 🚀 Deployment Steps:
1. **Local Deployment:**
- Download the model pickle file
- Download the prediction script or Flask API
- Install required dependencies: `pip install -r requirements.txt`
- Run the application: `python app.py`
2. **Docker Deployment:**
- Download all generated files (Dockerfile, requirements.txt, app.py, model.pkl)
- Build image: `docker build -t my-ml-app .`
- Run container: `docker run -p 8000:8000 my-ml-app`
3. **Cloud Deployment:**
- **AWS**: Upload to EC2 or use ECS with the Docker image
- **GCP**: Deploy to Google Cloud Run or App Engine
- **Azure**: Use Azure Container Instances or App Service
- **Heroku**: Push Docker image to Heroku Container Registry
4. **API Usage Example:**
```bash
curl -X POST http://localhost:8000/predict \
-H "Content-Type: application/json" \
-d '{"feature1": 1.0, "feature2": 2.0}'
```
""")
# Model performance summary
# Quick recap of the chosen model's metadata and hold-out metrics.
if selected_model != "PyCaret Best Model" and selected_model in st.session_state.trained_models:
st.subheader("📊 Model Summary for Deployment")
model_data = st.session_state.trained_models[selected_model]
col1, col2 = st.columns(2)
with col1:
st.write("**Model Details:**")
st.write(f"- **Type:** {selected_model}")
st.write(f"- **Problem Type:** {model_data['problem_type']}")
st.write(f"- **Features:** {len(model_data['features'])}")
st.write(f"- **Target:** {model_data['target']}")
with col2:
if 'predictions' in model_data:
y_test = model_data['y_test']
predictions = model_data['predictions']
st.write("**Performance Metrics:**")
if model_data['problem_type'] == "Regression":
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
st.write(f"- **R² Score:** {r2:.4f}")
st.write(f"- **MAE:** {mae:.4f}")
else:
accuracy = accuracy_score(y_test, predictions)
st.write(f"- **Accuracy:** {accuracy:.4f}")
# ================== FOOTER ==================
# Three-column footer: dataset stats, destructive reset action, app info.
st.markdown("---")
col1, col2, col3 = st.columns(3)
with col1:
st.markdown("### 📊 Quick Stats")
if st.session_state.df is not None:
st.write(f"Dataset: {st.session_state.df.shape[0]} rows × {st.session_state.df.shape[1]} cols")
st.write(f"Models Trained: {len(st.session_state.trained_models)}")
with col2:
st.markdown("### 🔗 Quick Actions")
# Clears everything except the auth/demo flags, then reruns the script.
if st.button("🔄 Reset All Data", key="footer_reset"):
for key in list(st.session_state.keys()):
if key not in ['authenticated', 'demo_mode']:
del st.session_state[key]
st.success("✅ All data reset!")
st.rerun()
with col3:
st.markdown("### ℹ️ App Info")
st.write("Super Data Science App v2.0")
st.write(f"Session: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
# ================== SIDEBAR STATUS ==================
st.sidebar.markdown("---")
st.sidebar.subheader("📊 Current Status")
# Data status
if st.session_state.df is not None:
st.sidebar.success(f"✅ Data Loaded ({st.session_state.df.shape[0]} rows)")
else:
st.sidebar.warning("⚠️ No Data Loaded")
# Models status
if st.session_state.trained_models:
st.sidebar.success(f"✅ {len(st.session_state.trained_models)} Classical Models")
else:
st.sidebar.info("ℹ️ No Classical Models")
if st.session_state.pycaret_setup_done:
st.sidebar.success("✅ PyCaret Setup Complete")
else:
st.sidebar.info("ℹ️ PyCaret Not Setup")
if st.session_state.dl_models:
st.sidebar.success(f"✅ {len(st.session_state.dl_models)} Deep Learning Models")
else:
st.sidebar.info("ℹ️ No Deep Learning Models")
# Available libraries status
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Libraries Status")
st.sidebar.write(f"PyCaret: {'✅' if PYCARET_AVAILABLE else '❌'}")
st.sidebar.write(f"PyTorch: {'✅' if TORCH_AVAILABLE else '❌'}")
st.sidebar.write(f"MLflow: {'✅' if MLFLOW_AVAILABLE else '❌'}")
st.sidebar.write(f"SHAP: {'✅' if SHAP_AVAILABLE else '❌'}")
#st.sidebar.write(f"Profiling: {'✅' if PROFILING_AVAILABLE else '❌'}")
# Help section
# Static quick-start guide and usage tips rendered as sidebar markdown.
st.sidebar.markdown("---")
st.sidebar.subheader("❓ Need Help?")
st.sidebar.markdown("""
**Quick Start:**
1. 📊 Load data (sample or upload)
2. 🔍 Explore with EDA
3. 🤖 Train models (Classical or AutoML)
4. 🎯 Evaluate performance
5. 🚀 Deploy your model
**Tips:**
- Use sample data for quick testing
- PyCaret AutoML for best results
- Export models for production use
""")
# Advanced features hint
# On-demand info panel listing the app's power-user features.
if st.sidebar.button("🎯 Show Advanced Tips"):
st.sidebar.info("""
**Advanced Features:**
- Feature engineering in EDA
- Hyperparameter tuning in Classical ML
- Cross-validation in PyCaret
- SHAP explanations for interpretability
- MLflow for experiment tracking
- Docker deployment ready
""")
# Debug mode for development: list the non-private session-state keys so a
# developer can see what the app has accumulated this session.
if st.sidebar.checkbox("🐛 Debug Mode", key="debug_mode"):
    st.sidebar.subheader("🔧 Debug Info")
    st.sidebar.write("Session State Keys:")
    visible_keys = [k for k in st.session_state.keys() if not k.startswith('_')]
    for state_key in visible_keys:
        st.sidebar.write(f"- {state_key}")
# Performance optimization note
st.sidebar.markdown("---")
st.sidebar.caption("💡 For large datasets, consider using data sampling for faster processing")
# Timestamp refreshes on every Streamlit rerun, not on a wall-clock timer.
st.sidebar.caption(f"⏰ Last updated: {datetime.now().strftime('%H:%M:%S')}")
# Auto-refresh data (for development)
# Manual rerun trigger: Streamlit re-executes the entire script from the top.
if st.sidebar.button("🔄 Auto Refresh", key="auto_refresh"):
st.rerun()
# Export session state
# Offer a small downloadable summary of the current session.
if st.sidebar.button("💾 Export Session", key="export_session"):
    import json  # stdlib; imported locally to keep this fix self-contained

    session_data = {
        'trained_models_count': len(st.session_state.trained_models),
        'data_loaded': st.session_state.df is not None,
        'pycaret_setup': st.session_state.pycaret_setup_done,
        'timestamp': datetime.now().isoformat()
    }
    # Bug fix: str(dict) emitted Python-repr text (single quotes, True/False)
    # that no downstream tool can parse reliably; emit proper JSON instead,
    # with matching file extension and MIME type.
    st.sidebar.download_button(
        label="📥 Download Session Info",
        data=json.dumps(session_data, ensure_ascii=False, indent=2),
        file_name=f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json"
    )
# Success message for completion
# Celebrate once data is loaded and both the classical-ML and PyCaret
# pipelines have been run in this session.
pipeline_complete = (
    st.session_state.df is not None
    and bool(st.session_state.trained_models)
    and bool(st.session_state.pycaret_setup_done)
)
if pipeline_complete:
    st.sidebar.success("🎉 Full Pipeline Complete!")
    st.sidebar.balloons()
# Warning for missing dependencies: collect every optional package whose
# availability flag is False and surface a ready-to-copy pip command.
missing_deps = [
    package
    for available, package in (
        (PYCARET_AVAILABLE, "pycaret"),
        (MLFLOW_AVAILABLE, "mlflow"),
        (SHAP_AVAILABLE, "shap"),
        # (PROFILING_AVAILABLE, "ydata-profiling"),  # intentionally disabled
    )
    if not available
]
if missing_deps:
    st.sidebar.warning(f"⚠️ Missing: {', '.join(missing_deps)}")
    st.sidebar.code(f"pip install {' '.join(missing_deps)}")
import random

# Fun facts: a pool of one-line ML tips surfaced on demand from the sidebar.
fun_facts = [
    "🧠 Machine Learning can predict with 95%+ accuracy in many domains",
    "🚀 AutoML can save 80% of model development time",
    "📊 Feature engineering often provides the biggest performance boost",
    "🔬 Model explainability is crucial for production deployment",
    "⚡ Ensemble methods usually outperform single models",
    "📈 Cross-validation prevents overfitting better than simple train/test split",
]

if st.sidebar.button("💡 Random ML Tip", key="random_tip"):
    tip = random.choice(fun_facts)
    st.sidebar.info(tip)
# Resource links
# External documentation for the main libraries this app builds on.
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Resources")
st.sidebar.markdown("""
- [PyCaret Documentation](https://pycaret.org/)
- [MLflow Documentation](https://mlflow.org/)
- [SHAP Tutorials](https://shap.readthedocs.io/)
- [Scikit-learn Guide](https://scikit-learn.org/)
""")
# Version info and credits
# Static footer captions at the bottom of the sidebar.
st.sidebar.markdown("---")
st.sidebar.caption("🚀 Super Data Science App")
st.sidebar.caption("Version 2.0 - Full Pipeline")
st.sidebar.caption("Built with Streamlit ❤️")