import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_auc_score
)
warnings.filterwarnings('ignore')
# MLflow and experiment tracking
try:
import mlflow
import mlflow.sklearn
MLFLOW_AVAILABLE = True
except ImportError:
MLFLOW_AVAILABLE = False
st.warning("MLflow not installed. Some features may be limited.")
# PyCaret imports
try:
from pycaret.classification import setup as cls_setup, compare_models as cls_compare, create_model as cls_create
from pycaret.classification import tune_model as cls_tune, finalize_model as cls_finalize, predict_model as cls_predict
from pycaret.classification import pull as cls_pull, plot_model as cls_plot, evaluate_model as cls_evaluate
from pycaret.regression import setup as reg_setup, compare_models as reg_compare, create_model as reg_create
from pycaret.regression import tune_model as reg_tune, finalize_model as reg_finalize, predict_model as reg_predict
from pycaret.regression import pull as reg_pull, plot_model as reg_plot, evaluate_model as reg_evaluate
PYCARET_AVAILABLE = True
except ImportError:
PYCARET_AVAILABLE = False
st.warning("PyCaret not installed. AutoML features will be limited.")
# Data profiling
#try:
# from ydata_profiling import ProfileReport
# from streamlit_pandas_profiling import st_profile_report
# PROFILING_AVAILABLE = True
#except ImportError:
# PROFILING_AVAILABLE = False
# PyTorch for deep learning
try:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
# SHAP for explainability
try:
import shap
SHAP_AVAILABLE = True
except ImportError:
SHAP_AVAILABLE = False
# Scikit-learn imports
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ================== CUSTOM CSS & STYLING ==================
# Streamlit page-level configuration; must run before any other st.* output call.
st.set_page_config(
    page_title="๐ Super Data Science App",  # NOTE(review): emoji text appears mojibake-encoded — verify source file encoding
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="๐"
)
# Placeholder for injected custom CSS (currently an empty markdown block).
st.markdown("""
""", unsafe_allow_html=True)
# ================== HEADER ==================
# App title / tagline banner rendered as raw markdown/HTML.
st.markdown("""
๐ Super Data Science App
Complete ML Pipeline: EDA โ Modeling โ AutoML โ Explainability โ Deployment
""", unsafe_allow_html=True)
# ================== AUTHENTICATION ==================
def check_authentication():
    """Gate the app behind a simple password prompt or a demo-mode bypass.

    Uses ``st.session_state.authenticated`` as the login flag.  Until it is
    set, the sidebar shows a password field plus a "Demo Mode" button, and
    ``st.stop()`` halts the rest of the script on every rerun.
    """
    import hmac  # local import: used only for the constant-time comparison below

    if 'authenticated' not in st.session_state:
        st.session_state.authenticated = False
    if not st.session_state.authenticated:
        with st.sidebar:
            st.header("๐ Authentication")
            password = st.text_input("Enter Password", type="password", key="auth_password")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("๐ Login", key="login_btn"):
                    # Constant-time comparison avoids a timing side-channel.
                    # NOTE(review): the password is still hard-coded in plain
                    # text — move it to an env var / st.secrets before shipping.
                    if hmac.compare_digest(password, "ds4everyone"):
                        st.session_state.authenticated = True
                        st.success("โ Access Granted!")
                        st.rerun()
                    else:
                        st.error("โ Incorrect Password")
            with col2:
                if st.button("๐ค Demo Mode", key="demo_btn"):
                    # Demo mode skips the password entirely.
                    st.session_state.authenticated = True
                    st.session_state.demo_mode = True
                    st.info("๐ Demo Mode Activated")
                    st.rerun()
        st.info("๐ Please authenticate to access the application")
        st.stop()  # prevent the rest of the page from rendering while locked
check_authentication()
# ================== SESSION STATE INITIALIZATION ==================
# Seed every cross-page session key exactly once so later pages can read them
# without guarding.  'model_comparison' is included here because the PyCaret
# page reads st.session_state.model_comparison unconditionally, but the
# original code only initialized it inside a file-upload *error* handler —
# a fresh session that never hit that path would crash with AttributeError.
_SESSION_DEFAULTS = {
    'df': None,                  # the active DataFrame shared by all pages
    'trained_models': {},        # name -> dict(model, splits, predictions, ...)
    'pycaret_setup_done': False, # True once cls_setup/reg_setup has run
    'best_model': None,          # finalized PyCaret model
    'dl_models': {},             # reserved for the Deep Learning page
    'training_history': {},      # reserved for the Deep Learning page
    'model_comparison': None,    # PyCaret compare_models leaderboard (pull())
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# ================== SIDEBAR NAVIGATION ==================
# Single-select page router; the chosen label drives the big if/elif
# dispatch that makes up the rest of the script.
st.sidebar.title("๐งญ Navigation")
pages = [
    "๐ Home",
    "๐ Data Loading",
    "๐ EDA & Profiling",
    "๐ Visualization",
    "๐ค Classical ML",
    "โก PyCaret AutoML",
    "๐ง Deep Learning",
    "๐ฏ Model Evaluation",
    "๐ฌ Explainability",
    "๐ MLflow Tracking",
    "๐ Model Deployment"
]
selected_page = st.sidebar.selectbox("Select Page", pages, key="page_selector")
# ================== UTILITY FUNCTIONS ==================
def load_sample_data(dataset_name):
    """Return a small demo DataFrame for the given dataset name.

    Known names: "California Housing", "Iris", "Wine Quality", "Titanic".
    The sklearn datasets load locally; the others are fetched over HTTP.
    Large datasets are down-sampled for UI responsiveness.

    Returns None (after surfacing a Streamlit error where applicable) when
    the name is unknown or a remote download fails.
    """
    if dataset_name == "California Housing":
        from sklearn.datasets import fetch_california_housing
        data = fetch_california_housing(as_frame=True)
        df = pd.concat([data.data, data.target.rename('MedHouseVal')], axis=1)
        return df.sample(n=min(2000, len(df)))  # limit for performance
    elif dataset_name == "Iris":
        from sklearn.datasets import load_iris
        data = load_iris(as_frame=True)
        df = pd.concat([data.data, data.target.rename('species')], axis=1)
        return df
    elif dataset_name == "Wine Quality":
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
        try:
            df = pd.read_csv(url, sep=';')
            return df.sample(n=min(1000, len(df)))
        # was a bare `except:` — catch Exception only and report the cause
        except Exception as e:
            st.error(f"Could not load Wine Quality dataset: {e}")
            return None
    elif dataset_name == "Titanic":
        url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
        try:
            df = pd.read_csv(url)
            return df
        except Exception as e:
            st.error(f"Could not load Titanic dataset: {e}")
            return None
    # Unknown dataset name — make the previously-implicit fallthrough explicit.
    return None
def get_dataset_info(df):
    """Summarize a DataFrame for display.

    Returns a dict with the shape, column names, dtypes, per-column missing
    counts, a human-readable deep memory-usage string, and the split of
    columns into numeric vs. non-numeric.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(exclude=[np.number]).columns.tolist()
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2
    return {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'dtypes': df.dtypes.to_dict(),
        'missing_values': df.isnull().sum().to_dict(),
        'memory_usage': f"{mem_mb:.2f} MB",
        'numeric_columns': numeric,
        'categorical_columns': categorical,
    }
# ================== PAGE CONTENT ==================
# Page router: exactly one branch below renders, selected by the sidebar.
if selected_page == "๐ Home":
    # Landing page: feature list plus quick dataset / model-count metrics.
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.markdown("""
## Welcome to the Super Data Science App! ๐
This comprehensive application provides a complete machine learning pipeline:
""")
        features = [
            "๐ **Data Loading**: Upload CSV or use sample datasets",
            "๐ **EDA & Profiling**: Automated data profiling and exploration",
            "๐ **Visualization**: Interactive charts with Plotly and Seaborn",
            "๐ค **Classical ML**: Scikit-learn models with hyperparameter tuning",
            "โก **PyCaret AutoML**: Automated machine learning with model comparison",
            "๐ฏ **Model Evaluation**: Comprehensive model performance analysis",
            "๐ฌ **Explainability**: SHAP values and feature importance",
            "๐ **MLflow Tracking**: Experiment tracking and model versioning",
            "๐ **Model Deployment**: Model export and deployment preparation"
        ]
        for feature in features:
            st.markdown(feature)
        st.markdown("---")
        # Quick stats — only shown once a dataset has been loaded.
        if st.session_state.df is not None:
            col_a, col_b, col_c, col_d = st.columns(4)
            with col_a:
                st.metric("๐ Rows", f"{st.session_state.df.shape[0]:,}")
            with col_b:
                st.metric("๐ Columns", f"{st.session_state.df.shape[1]:,}")
            with col_c:
                st.metric("๐ค Models Trained", len(st.session_state.trained_models))
            with col_d:
                st.metric("โ Setup Complete", "Ready" if st.session_state.pycaret_setup_done else "Pending")
elif selected_page == "๐ Data Loading":
    # Data Loading page: CSV upload, bundled sample datasets, and a summary
    # of whatever is currently held in st.session_state.df.
    st.header("๐ Data Loading & Management")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Data Source")
        data_source = st.radio(
            "Choose data source:",
            ["๐ Upload CSV", "๐ฒ Sample Datasets", "๐ Current Data Info"]
        )
    with col2:
        if data_source == "๐ Upload CSV":
            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
            if uploaded_file is not None:
                try:
                    df = pd.read_csv(uploaded_file)
                    st.session_state.df = df
                    st.success(f"โ Successfully loaded {df.shape[0]} rows and {df.shape[1]} columns")
                except Exception as e:
                    st.error(f"โ Error loading file: {str(e)}")
                    # NOTE(review): this seed only runs on a *failed* upload, yet
                    # the PyCaret page reads model_comparison unconditionally —
                    # it belongs with the startup session-state initializers.
                    if 'model_comparison' not in st.session_state:
                        st.session_state.model_comparison = None
                    #st.error(f"โ Error loading file: {str(e)}")
        elif data_source == "๐ฒ Sample Datasets":
            sample_options = ["California Housing", "Iris", "Wine Quality", "Titanic"]
            selected_sample = st.selectbox("Choose sample dataset:", sample_options)
            if st.button(f"๐ Load {selected_sample} Dataset"):
                with st.spinner(f"Loading {selected_sample}..."):
                    # load_sample_data returns None on download failure.
                    df = load_sample_data(selected_sample)
                    if df is not None:
                        st.session_state.df = df
                        st.success(f"โ Loaded {selected_sample} dataset!")
        elif data_source == "๐ Current Data Info":
            if st.session_state.df is not None:
                info = get_dataset_info(st.session_state.df)
                col_a, col_b = st.columns(2)
                with col_a:
                    st.metric("๐ Rows", f"{info['shape'][0]:,}")
                    st.metric("๐ Columns", f"{info['shape'][1]:,}")
                    st.metric("๐พ Memory Usage", info['memory_usage'])
                with col_b:
                    st.metric("๐ข Numeric Columns", len(info['numeric_columns']))
                    st.metric("๐ Categorical Columns", len(info['categorical_columns']))
                    st.metric("โ Missing Values", sum(info['missing_values'].values()))
            else:
                st.info("๐ No data loaded yet")
    # Data Preview — shown for every data source once a DataFrame exists.
    if st.session_state.df is not None:
        st.subheader("๐ Data Preview")
        col1, col2, col3 = st.columns(3)
        with col1:
            show_rows = st.slider("Rows to display", 5, 50, 10)
        with col2:
            show_info = st.checkbox("Show column info", value=True)
        with col3:
            if st.button("๐พ Download Current Data"):
                csv = st.session_state.df.to_csv(index=False)
                st.download_button(
                    label="๐ฅ Download CSV",
                    data=csv,
                    file_name=f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime='text/csv'
                )
        # Display data
        st.dataframe(st.session_state.df.head(show_rows), use_container_width=True)
        if show_info:
            st.subheader("๐ Column Information")
            info_df = pd.DataFrame({
                'Column': st.session_state.df.columns,
                'Data Type': st.session_state.df.dtypes,
                'Non-Null Count': st.session_state.df.count(),
                'Missing Values': st.session_state.df.isnull().sum(),
                'Missing %': (st.session_state.df.isnull().sum() / len(st.session_state.df) * 100).round(2)
            })
            st.dataframe(info_df, use_container_width=True)
elif selected_page == "๐ EDA & Profiling":
    # EDA page: headline stats, missing-value report, describe() summary,
    # per-column distributions, and a correlation heatmap.
    st.header("๐ Exploratory Data Analysis & Profiling")
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Quick EDA
    st.subheader("๐ Quick Statistics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("๐ Dataset Shape", f"{df.shape[0]} ร {df.shape[1]}")
    with col2:
        st.metric("๐ข Numeric Columns", len(df.select_dtypes(include=[np.number]).columns))
    with col3:
        st.metric("๐ Text Columns", len(df.select_dtypes(exclude=[np.number]).columns))
    with col4:
        st.metric("โ Missing Values", df.isnull().sum().sum())
    # Missing Values Analysis — table + heatmap only for columns with gaps.
    st.subheader("โ Missing Values Analysis")
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Missing Count': df.isnull().sum(),
        'Missing Percentage': (df.isnull().sum() / len(df) * 100).round(2)
    }).sort_values('Missing Count', ascending=False)
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    if len(missing_df) > 0:
        st.dataframe(missing_df, use_container_width=True)
        # Missing values heatmap
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis')
        plt.title('Missing Values Heatmap')
        st.pyplot(fig)
    else:
        st.success("โ No missing values found in the dataset!")
    # Statistical Summary
    st.subheader("๐ Statistical Summary")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        st.dataframe(df[numeric_cols].describe(), use_container_width=True)
        # Distribution plots — histogram + KDE in a 2-column subplot grid.
        st.subheader("๐ Distribution Analysis")
        selected_cols = st.multiselect("Select columns for distribution analysis:", numeric_cols, default=numeric_cols[:3])
        if selected_cols:
            cols_per_row = 2
            # Ceiling division to size the subplot grid.
            n_rows = (len(selected_cols) + cols_per_row - 1) // cols_per_row
            fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 5*n_rows))
            # Normalize `axes` to a flat indexable sequence regardless of grid shape.
            if n_rows == 1:
                axes = [axes] if cols_per_row == 1 else axes
            else:
                axes = axes.flatten()
            for i, col in enumerate(selected_cols):
                sns.histplot(data=df, x=col, kde=True, ax=axes[i])
                axes[i].set_title(f'Distribution of {col}')
            # Hide empty subplots
            for i in range(len(selected_cols), len(axes)):
                axes[i].set_visible(False)
            plt.tight_layout()
            st.pyplot(fig)
    # Correlation Analysis — needs at least two numeric columns.
    if len(numeric_cols) > 1:
        st.subheader("๐ Correlation Analysis")
        corr_matrix = df[numeric_cols].corr()
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', ax=ax)
        plt.title('Correlation Matrix')
        st.pyplot(fig)
    # Automated Profiling Report — disabled; requires ydata_profiling (see
    # the commented-out import block near the top of the file).
    # if PROFILING_AVAILABLE:
    #     st.subheader("๐ Automated Profiling Report")
    #     if st.button("๐ Generate Comprehensive Profile Report"):
    #         with st.spinner("Generating detailed profiling report..."):
    #             profile = ProfileReport(df, title="Dataset Profiling Report", explorative=True)
    #             st_profile_report(profile)
elif selected_page == "๐ Visualization":
    # Visualization page: one Plotly chart chosen by `viz_type`, plus
    # value-count bars for low-cardinality categorical columns.
    st.header("๐ Interactive Data Visualization")
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    # Visualization controls
    st.subheader("๐๏ธ Visualization Controls")
    col1, col2, col3 = st.columns(3)
    with col1:
        viz_type = st.selectbox("Select visualization type:", [
            "๐ Histogram", "๐ Scatter Plot", "๐ฆ Box Plot",
            "๐ฅ Heatmap", "๐ Line Plot", "๐ฏ Pair Plot"
        ])
    with col2:
        # Two-axis charts need x and y; the rest need a single column.
        if viz_type in ["๐ Scatter Plot", "๐ Line Plot"]:
            x_col = st.selectbox("X-axis:", numeric_cols + categorical_cols)
            y_col = st.selectbox("Y-axis:", numeric_cols)
        else:
            selected_col = st.selectbox("Select column:", numeric_cols if viz_type != "๐ฆ Box Plot" else df.columns)
    with col3:
        # Optional hue column, only for chart types that support it.
        if categorical_cols and viz_type in ["๐ Histogram", "๐ Scatter Plot", "๐ฆ Box Plot"]:
            color_col = st.selectbox("Color by (optional):", ["None"] + categorical_cols)
            color_col = None if color_col == "None" else color_col
        else:
            color_col = None
    # Generate visualizations
    st.subheader("๐ Visualization Output")
    try:
        if viz_type == "๐ Histogram":
            fig = px.histogram(df, x=selected_col, color=color_col,
                               title=f'Distribution of {selected_col}',
                               marginal="box")
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "๐ Scatter Plot":
            # OLS trendline only when not split by color (one global fit).
            fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                             title=f'{y_col} vs {x_col}',
                             trendline="ols" if color_col is None else None)
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "๐ฆ Box Plot":
            if color_col:
                fig = px.box(df, y=selected_col, x=color_col,
                             title=f'Box Plot of {selected_col} by {color_col}')
            else:
                fig = px.box(df, y=selected_col,
                             title=f'Box Plot of {selected_col}')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "๐ฅ Heatmap":
            if len(numeric_cols) > 1:
                corr_matrix = df[numeric_cols].corr()
                fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                title="Correlation Heatmap")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need at least 2 numeric columns for correlation heatmap")
        elif viz_type == "๐ Line Plot":
            # Sort by x so the line is drawn left-to-right.
            fig = px.line(df.sort_values(x_col), x=x_col, y=y_col,
                          title=f'{y_col} vs {x_col} (Line Plot)')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "๐ฏ Pair Plot":
            if len(numeric_cols) >= 2:
                selected_numeric = st.multiselect("Select numeric columns for pair plot:",
                                                  numeric_cols, default=numeric_cols[:4])
                if len(selected_numeric) >= 2:
                    fig = px.scatter_matrix(df, dimensions=selected_numeric, color=color_col,
                                            title="Pair Plot Matrix")
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.warning("Please select at least 2 numeric columns")
            else:
                st.warning("Need at least 2 numeric columns for pair plot")
    except Exception as e:
        st.error(f"Error generating visualization: {str(e)}")
    # Additional visualizations
    st.subheader("๐ Additional Insights")
    # Value counts for categorical columns
    if categorical_cols:
        st.write("**Categorical Column Distributions:**")
        for col in categorical_cols[:3]:  # Limit to first 3
            if df[col].nunique() <= 20:  # Only show if not too many categories
                fig = px.bar(df[col].value_counts().head(10),
                             title=f'Top 10 values in {col}')
                st.plotly_chart(fig, use_container_width=True)
elif selected_page == "๐ค Classical ML":
    # Classical ML page: configure target/features, auto-detect the problem
    # type, train one sklearn model, report metrics, and stash everything in
    # st.session_state.trained_models for the Evaluation/Explainability pages.
    st.header("๐ค Classical Machine Learning")
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Model configuration
    st.subheader("โ๏ธ Model Configuration")
    col1, col2 = st.columns(2)
    with col1:
        # Target selection
        target_col = st.selectbox("๐ฏ Select target variable:", df.columns)
        # Feature selection
        available_features = [col for col in df.columns if col != target_col]
        selected_features = st.multiselect("๐ Select features:", available_features,
                                           default=available_features[:5])
    with col2:
        # Problem type detection
        # Heuristic: object/bool dtype or <10 distinct values => classification.
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "Classification"
            st.info("๐ฏ Detected: Classification Problem")
            model_options = ["Logistic Regression", "Decision Tree", "Random Forest"]
        else:
            problem_type = "Regression"
            st.info("๐ Detected: Regression Problem")
            model_options = ["Linear Regression", "Decision Tree", "Random Forest"]
        selected_model = st.selectbox("๐ค Select model:", model_options)
        test_size = st.slider("๐ Test set size:", 0.1, 0.5, 0.2, 0.05)
    if not selected_features:
        st.warning("โ ๏ธ Please select at least one feature")
        st.stop()
    # Data preprocessing
    if st.button("๐ Train Model"):
        with st.spinner("Training model..."):
            try:
                # Prepare data
                X = df[selected_features].copy()
                y = df[target_col].copy()
                # Handle missing values
                # NOTE(review): X.mean() only fills numeric columns; NaNs in
                # object columns survive until astype(str) below turns them
                # into the literal string 'nan' — confirm that is intended.
                X = X.fillna(X.mean() if X.select_dtypes(include=[np.number]).shape[1] > 0 else X.mode().iloc[0])
                # Encode categorical variables (encoders kept for potential inverse use)
                le_dict = {}
                for col in X.select_dtypes(include=['object']).columns:
                    le = LabelEncoder()
                    X[col] = le.fit_transform(X[col].astype(str))
                    le_dict[col] = le
                # Encode target if classification
                if problem_type == "Classification" and y.dtype == 'object':
                    target_le = LabelEncoder()
                    y = target_le.fit_transform(y)
                # Split data (fixed seed for reproducibility)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=42
                )
                # Train model
                if selected_model == "Linear Regression":
                    model = LinearRegression()
                elif selected_model == "Logistic Regression":
                    model = LogisticRegression(random_state=42, max_iter=1000)
                elif selected_model == "Decision Tree":
                    if problem_type == "Classification":
                        model = DecisionTreeClassifier(random_state=42)
                    else:
                        model = DecisionTreeRegressor(random_state=42)
                elif selected_model == "Random Forest":
                    if problem_type == "Classification":
                        model = RandomForestClassifier(random_state=42, n_estimators=100)
                    else:
                        model = RandomForestRegressor(random_state=42, n_estimators=100)
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                # Store model + artifacts for the Evaluation/Explainability pages.
                st.session_state.trained_models[selected_model] = {
                    'model': model,
                    'X_test': X_test,
                    'y_test': y_test,
                    'predictions': predictions,
                    'features': selected_features,
                    'target': target_col,
                    'problem_type': problem_type
                }
                st.success("โ Model trained successfully!")
                # Display results
                st.subheader("๐ Model Performance")
                if problem_type == "Regression":
                    mse = mean_squared_error(y_test, predictions)
                    mae = mean_absolute_error(y_test, predictions)
                    r2 = r2_score(y_test, predictions)
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("MSE", f"{mse:.4f}")
                    with col2:
                        st.metric("MAE", f"{mae:.4f}")
                    with col3:
                        st.metric("Rยฒ Score", f"{r2:.4f}")
                    # Actual vs Predicted plot with an identity reference line.
                    fig = px.scatter(x=y_test, y=predictions,
                                     labels={'x': 'Actual', 'y': 'Predicted'},
                                     title='Actual vs Predicted Values')
                    fig.add_shape(type="line", x0=y_test.min(), y0=y_test.min(),
                                  x1=y_test.max(), y1=y_test.max(),
                                  line=dict(color="red", dash="dash"))
                    st.plotly_chart(fig, use_container_width=True)
                else:  # Classification
                    accuracy = accuracy_score(y_test, predictions)
                    precision = precision_score(y_test, predictions, average='weighted')
                    recall = recall_score(y_test, predictions, average='weighted')
                    f1 = f1_score(y_test, predictions, average='weighted')
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Accuracy", f"{accuracy:.4f}")
                    with col2:
                        st.metric("Precision", f"{precision:.4f}")
                    with col3:
                        st.metric("Recall", f"{recall:.4f}")
                    with col4:
                        st.metric("F1-Score", f"{f1:.4f}")
                    # Confusion Matrix
                    cm = confusion_matrix(y_test, predictions)
                    fig, ax = plt.subplots(figsize=(8, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                    ax.set_title('Confusion Matrix')
                    ax.set_xlabel('Predicted')
                    ax.set_ylabel('Actual')
                    st.pyplot(fig)
                # Feature importance (for tree-based models)
                if hasattr(model, 'feature_importances_'):
                    st.subheader("๐ Feature Importance")
                    importance_df = pd.DataFrame({
                        'Feature': selected_features,
                        'Importance': model.feature_importances_
                    }).sort_values('Importance', ascending=False)
                    fig = px.bar(importance_df, x='Importance', y='Feature',
                                 orientation='h', title='Feature Importance')
                    st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"โ Error training model: {str(e)}")
elif selected_page == "โก PyCaret AutoML":
    # PyCaret AutoML page: setup -> compare -> create -> tune -> finalize.
    # Intermediate artifacts live in st.session_state (pycaret_exp,
    # model_comparison, pycaret_model, tuned_model, best_model).
    st.header("โก PyCaret AutoML")
    if not PYCARET_AVAILABLE:
        st.error("โ PyCaret is not installed. Please install it to use AutoML features.")
        st.stop()
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # AutoML Configuration
    st.subheader("โ๏ธ AutoML Configuration")
    col1, col2 = st.columns(2)
    with col1:
        target_col = st.selectbox("๐ฏ Select target variable:", df.columns, key="pycaret_target")
        # Auto-detect problem type (same heuristic as the Classical ML page).
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "classification"
            st.info("๐ฏ Detected: Classification Problem")
        else:
            problem_type = "regression"
            st.info("๐ Detected: Regression Problem")
    with col2:
        train_size = st.slider("๐ Training set size:", 0.5, 0.9, 0.8, 0.05)
        sample_size = st.slider("๐ Sample size (for performance):", 500, min(5000, len(df)), min(2000, len(df)))
    # Down-sample large datasets before handing them to PyCaret.
    if len(df) > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
        st.info(f"๐ Using {sample_size} samples for faster processing")
    else:
        df_sample = df.copy()
    # Advanced settings
    # NOTE(review): cross_validation is collected but never passed to setup().
    with st.expander("๐ง Advanced Settings"):
        col1, col2 = st.columns(2)
        with col1:
            cross_validation = st.checkbox("๐ Cross Validation", value=True)
            normalize = st.checkbox("๐ Normalize Features", value=True)
        with col2:
            remove_outliers = st.checkbox("๐ซ Remove Outliers", value=False)
            feature_selection = st.checkbox("๐ฏ Feature Selection", value=False)
    # Setup PyCaret Environment
    if st.button("๐ Setup PyCaret Environment"):
        with st.spinner("Setting up PyCaret environment..."):
            try:
                # NOTE(review): the `silent` argument was removed in PyCaret 3.x —
                # confirm which PyCaret version this app targets.
                if problem_type == "classification":
                    st.session_state.pycaret_exp = cls_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                else:
                    st.session_state.pycaret_exp = reg_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                st.session_state.pycaret_setup_done = True
                st.session_state.pycaret_problem_type = problem_type
                st.success("โ PyCaret environment setup complete!")
            except Exception as e:
                st.error(f"โ Error setting up PyCaret: {str(e)}")
    # Model Comparison — only available once setup() has succeeded.
    if st.session_state.pycaret_setup_done:
        st.subheader("๐ Model Comparison")
        if st.button("๐ Compare Models"):
            with st.spinner("Comparing multiple models..."):
                try:
                    if st.session_state.pycaret_problem_type == "classification":
                        comparison_df = cls_compare(
                            include=['lr', 'rf', 'et', 'nb', 'dt', 'svm'],
                            sort='Accuracy',
                            n_select=5
                        )
                        # pull() grabs the leaderboard of the last command.
                        st.session_state.model_comparison = cls_pull()
                    else:
                        comparison_df = reg_compare(
                            include=['lr', 'rf', 'et', 'dt', 'huber'],
                            sort='R2',
                            n_select=5
                        )
                        st.session_state.model_comparison = reg_pull()
                    st.success("โ Model comparison complete!")
                except Exception as e:
                    st.error(f"โ Error comparing models: {str(e)}")
        # Display comparison results
        # NOTE(review): on a fresh session this attribute is only initialized
        # inside a file-upload error handler on the Data Loading page — reading
        # it here can raise AttributeError before a compare run; it should be
        # seeded with the other session-state defaults at startup.
        if st.session_state.model_comparison is not None:
            st.subheader("๐ Model Comparison Results")
            st.dataframe(st.session_state.model_comparison, use_container_width=True)
            # Select best model (PyCaret short model codes)
            best_model_name = st.selectbox(
                "๐ Select model for tuning:",
                ['lr', 'rf', 'et', 'dt', 'nb', 'svm'] if st.session_state.pycaret_problem_type == "classification"
                else ['lr', 'rf', 'et', 'dt', 'huber']
            )
            # Create and tune model
            col1, col2 = st.columns(2)
            with col1:
                if st.button("๐ฏ Create Model"):
                    with st.spinner("Creating model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                model = cls_create(best_model_name)
                            else:
                                model = reg_create(best_model_name)
                            st.session_state.pycaret_model = model
                            st.success("โ Model created successfully!")
                        except Exception as e:
                            st.error(f"โ Error creating model: {str(e)}")
            with col2:
                if st.button("โก Tune Hyperparameters"):
                    if 'pycaret_model' in st.session_state:
                        with st.spinner("Tuning hyperparameters..."):
                            try:
                                if st.session_state.pycaret_problem_type == "classification":
                                    tuned_model = cls_tune(st.session_state.pycaret_model,
                                                           optimize='Accuracy', n_iter=10)
                                else:
                                    tuned_model = reg_tune(st.session_state.pycaret_model,
                                                           optimize='R2', n_iter=10)
                                st.session_state.tuned_model = tuned_model
                                st.success("โ Hyperparameter tuning complete!")
                            except Exception as e:
                                st.error(f"โ Error tuning model: {str(e)}")
                    else:
                        st.warning("โ ๏ธ Please create a model first")
            # Finalize model — prefers the tuned model, falls back to created one.
            if st.button("๐ Finalize Best Model"):
                if 'tuned_model' in st.session_state:
                    model_to_finalize = st.session_state.tuned_model
                elif 'pycaret_model' in st.session_state:
                    model_to_finalize = st.session_state.pycaret_model
                else:
                    st.warning("โ ๏ธ Please create a model first")
                    model_to_finalize = None
                if model_to_finalize is not None:
                    with st.spinner("Finalizing model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                final_model = cls_finalize(model_to_finalize)
                            else:
                                final_model = reg_finalize(model_to_finalize)
                            st.session_state.best_model = final_model
                            st.success("โ Model finalized successfully!")
                        except Exception as e:
                            st.error(f"โ Error finalizing model: {str(e)}")
elif selected_page == "๐ฏ Model Evaluation":
    # Model Evaluation page: either PyCaret's built-in plots/predictions for
    # the finalized AutoML model, or metric dashboards for a stored sklearn
    # model from the Classical ML page.
    st.header("๐ฏ Advanced Model Evaluation")
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first")
        st.stop()
    # Check for available models
    available_models = []
    if st.session_state.trained_models:
        available_models.extend(list(st.session_state.trained_models.keys()))
    if 'best_model' in st.session_state and st.session_state.best_model is not None:
        available_models.append("PyCaret Best Model")
    if not available_models:
        st.warning("โ ๏ธ No trained models available. Please train a model first.")
        st.stop()
    selected_model_name = st.selectbox("๐ Select model to evaluate:", available_models)
    if selected_model_name == "PyCaret Best Model":
        if 'best_model' not in st.session_state:
            st.error("โ PyCaret model not available")
            st.stop()
        model_info = st.session_state.best_model
        # Defaults to 'regression' if setup never recorded the problem type.
        problem_type = st.session_state.get('pycaret_problem_type', 'regression')
        st.subheader("๐ PyCaret Model Evaluation")
        # PyCaret built-in plots
        if PYCARET_AVAILABLE:
            col1, col2 = st.columns(2)
            with col1:
                plot_types_cls = ['auc', 'confusion_matrix', 'class_report', 'pr', 'feature']
                plot_types_reg = ['residuals', 'feature', 'rfe', 'learning', 'vc']
                plot_types = plot_types_cls if problem_type == "classification" else plot_types_reg
                selected_plot = st.selectbox("๐ Select evaluation plot:", plot_types)
            with col2:
                if st.button("๐ Generate Plot"):
                    try:
                        with st.spinner("Generating plot..."):
                            if problem_type == "classification":
                                cls_plot(model_info, plot=selected_plot, display_format='streamlit')
                            else:
                                reg_plot(model_info, plot=selected_plot, display_format='streamlit')
                    except Exception as e:
                        st.error(f"โ Error generating plot: {str(e)}")
            # Model predictions — predict_model on the hold-out set, with CSV export.
            if st.button("๐ฎ Generate Predictions"):
                try:
                    with st.spinner("Generating predictions..."):
                        if problem_type == "classification":
                            predictions_df = cls_predict(model_info)
                        else:
                            predictions_df = reg_predict(model_info)
                        st.subheader("๐ฎ Model Predictions")
                        st.dataframe(predictions_df.head(20), use_container_width=True)
                        # Download predictions
                        csv = predictions_df.to_csv(index=False)
                        st.download_button(
                            label="๐ฅ Download Predictions",
                            data=csv,
                            file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                            mime='text/csv'
                        )
                except Exception as e:
                    st.error(f"โ Error generating predictions: {str(e)}")
    else:
        # Classical ML model evaluation — uses the artifacts stored at training time.
        model_data = st.session_state.trained_models[selected_model_name]
        model = model_data['model']
        X_test = model_data['X_test']
        y_test = model_data['y_test']
        predictions = model_data['predictions']
        problem_type = model_data['problem_type']
        st.subheader(f"๐ {selected_model_name} Evaluation")
        if problem_type == "Regression":
            # Regression metrics
            mse = mean_squared_error(y_test, predictions)
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)
            rmse = np.sqrt(mse)
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("RMSE", f"{rmse:.4f}")
            with col2:
                st.metric("MAE", f"{mae:.4f}")
            with col3:
                st.metric("Rยฒ Score", f"{r2:.4f}")
            with col4:
                st.metric("MSE", f"{mse:.4f}")
            # Residual analysis
            residuals = y_test - predictions
            col1, col2 = st.columns(2)
            with col1:
                # Residual plot — points should scatter evenly around y=0.
                fig = px.scatter(x=predictions, y=residuals,
                                 labels={'x': 'Predicted', 'y': 'Residuals'},
                                 title='Residual Plot')
                fig.add_hline(y=0, line_dash="dash", line_color="red")
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                # Residual distribution
                fig = px.histogram(residuals, title='Residual Distribution',
                                   labels={'value': 'Residuals', 'count': 'Frequency'})
                st.plotly_chart(fig, use_container_width=True)
        else:
            # Classification metrics (weighted averages across classes)
            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions, average='weighted')
            recall = recall_score(y_test, predictions, average='weighted')
            f1 = f1_score(y_test, predictions, average='weighted')
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Accuracy", f"{accuracy:.4f}")
            with col2:
                st.metric("Precision", f"{precision:.4f}")
            with col3:
                st.metric("Recall", f"{recall:.4f}")
            with col4:
                st.metric("F1-Score", f"{f1:.4f}")
elif selected_page == "๐ฌ Explainability":
    # Explainability page: SHAP global importance, beeswarm summary,
    # per-instance waterfall, and a dependence scatter for one feature.
    st.header("๐ฌ Model Explainability with SHAP")
    if not SHAP_AVAILABLE:
        st.warning("โ ๏ธ SHAP is not installed. Explainability features are limited.")
        st.stop()
    if st.session_state.df is None:
        st.warning("โ ๏ธ Please load data first")
        st.stop()
    # Check for available models
    if not st.session_state.trained_models and 'best_model' not in st.session_state:
        st.warning("โ ๏ธ No trained models available. Please train a model first.")
        st.stop()
    # Select model for explanation
    available_models = list(st.session_state.trained_models.keys())
    if 'best_model' in st.session_state:
        available_models.append("PyCaret Best Model")
    selected_model = st.selectbox("๐ค Select model to explain:", available_models)
    # NOTE(review): only sklearn models are handled; choosing
    # "PyCaret Best Model" currently renders nothing on this page.
    if selected_model != "PyCaret Best Model":
        model_data = st.session_state.trained_models[selected_model]
        model = model_data['model']
        features = model_data['features']
        X_test = model_data['X_test']
        # SHAP Explanation
        st.subheader("๐ฌ SHAP Analysis")
        try:
            # Create SHAP explainer on a 100-row subset for performance.
            with st.spinner("Creating SHAP explainer..."):
                explainer = shap.Explainer(model, X_test.iloc[:100])
                shap_values = explainer(X_test.iloc[:100])
            # Global feature importance
            # NOTE(review): current shap.plots.* APIs do not take an ax=
            # keyword — if these raise TypeError it is caught below; verify
            # against the installed SHAP version.
            st.subheader("๐ Global Feature Importance")
            fig, ax = plt.subplots()
            shap.plots.bar(shap_values, ax=ax, show=False)
            st.pyplot(fig)
            # Summary plot
            st.subheader("๐ Feature Impact Summary")
            fig, ax = plt.subplots()
            shap.plots.beeswarm(shap_values, ax=ax, show=False)
            st.pyplot(fig)
            # Individual prediction explanation
            # NOTE(review): slider allows indices up to len(X_test)-1 but
            # shap_values only covers the first 100 rows — confirm bound.
            st.subheader("๐ Individual Prediction Explanation")
            instance_idx = st.slider("Select instance:", 0, len(X_test)-1, 0)
            fig, ax = plt.subplots()
            shap.plots.waterfall(shap_values[instance_idx], ax=ax, show=False)
            st.pyplot(fig)
            # Feature dependence
            if len(features) > 1:
                st.subheader("๐ Feature Dependence")
                feature_for_dependence = st.selectbox("Select feature:", features)
                if feature_for_dependence in X_test.columns:
                    fig, ax = plt.subplots()
                    shap.plots.scatter(shap_values[:, feature_for_dependence], ax=ax, show=False)
                    st.pyplot(fig)
        except Exception as e:
            st.error(f"โ Error generating SHAP explanations: {str(e)}")
            st.info("๐ก SHAP works best with tree-based models (Random Forest, XGBoost, etc.)")
elif selected_page == "๐ MLflow Tracking":
st.header("๐ MLflow Experiment Tracking")
if not MLFLOW_AVAILABLE:
st.warning("โ ๏ธ MLflow is not installed. Install it to use experiment tracking.")
st.stop()
# MLflow Configuration
st.subheader("โ๏ธ MLflow Configuration")
col1, col2 = st.columns(2)
with col1:
tracking_uri = st.text_input("๐ Tracking URI:", "http://localhost:5000")
experiment_name = st.text_input("๐งช Experiment Name:", "super_app_experiments")
with col2:
if st.button("๐ง Set MLflow Configuration"):
try:
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
st.success("โ
MLflow configuration set!")
except Exception as e:
st.error(f"โ Error setting MLflow: {str(e)}")
# Log current models
st.subheader("๐ Log Models to MLflow")
if st.session_state.trained_models:
model_to_log = st.selectbox("Select model to log:", list(st.session_state.trained_models.keys()))
if st.button("๐ค Log Model"):
try:
with mlflow.start_run(run_name=f"{model_to_log}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
model_data = st.session_state.trained_models[model_to_log]
model = model_data['model']
# Log model
mlflow.sklearn.log_model(model, "model")
# Log parameters
mlflow.log_param("model_type", model_to_log)
mlflow.log_param("features", model_data['features'])
mlflow.log_param("target", model_data['target'])
# Log metrics (if available)
if 'predictions' in model_data:
y_test = model_data['y_test']
predictions = model_data['predictions']
if model_data['problem_type'] == "Regression":
mlflow.log_metric("mse", mean_squared_error(y_test, predictions))
mlflow.log_metric("mae", mean_absolute_error(y_test, predictions))
mlflow.log_metric("r2", r2_score(y_test, predictions))
else:
mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))
st.success("โ
Model logged to MLflow!")
except Exception as e:
st.error(f"โ Error logging model: {str(e)}")
# Display recent runs
st.subheader("๐ Recent Experiment Runs")
if st.button("๐ Refresh Runs"):
try:
runs = mlflow.search_runs(order_by=["start_time desc"])
if not runs.empty:
st.dataframe(runs[['run_id', 'status', 'start_time', 'params.model_type',
'metrics.mse', 'metrics.r2', 'metrics.accuracy']],
use_container_width=True)
else:
st.info("๐ No runs found. Start logging some models!")
except Exception as e:
st.error(f"โ Error fetching runs: {str(e)}")
elif selected_page == "๐ Model Deployment":
st.header("๐ Model Deployment & Export")
if not st.session_state.trained_models and 'best_model' not in st.session_state:
st.warning("โ ๏ธ No trained models available for deployment.")
st.stop()
# Model selection for deployment
available_models = list(st.session_state.trained_models.keys())
if 'best_model' in st.session_state:
available_models.append("PyCaret Best Model")
selected_model = st.selectbox("๐ค Select model for deployment:", available_models)
# Model export options
st.subheader("๐พ Export Options")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("๐ฆ Export Model (Pickle)"):
try:
import pickle
if selected_model == "PyCaret Best Model":
model_to_export = st.session_state.best_model
else:
model_to_export = st.session_state.trained_models[selected_model]['model']
# Serialize model
model_bytes = pickle.dumps(model_to_export)
st.download_button(
label="๐ฅ Download Model",
data=model_bytes,
file_name=f"{selected_model.replace(' ', '_')}_model.pkl",
mime="application/octet-stream"
)
st.success("โ
Model ready for download!")
except Exception as e:
st.error(f"โ Error exporting model: {str(e)}")
with col2:
if st.button("๐ Generate Prediction Script"):
# Generate Python script for predictions
script_content = f'''
import pandas as pd
import pickle
import numpy as np
# Load the trained model
def load_model(model_path):
with open(model_path, 'rb') as f:
model = pickle.load(f)
return model
# Make predictions
def predict(model, input_data):
"""
Make predictions using the trained model
Parameters:
model: Trained model object
input_data: pandas DataFrame with features
Returns:
predictions: numpy array of predictions
"""
predictions = model.predict(input_data)
return predictions
# Example usage
if __name__ == "__main__":
# Load your model
model = load_model("path_to_your_model.pkl")
# Create sample input data (replace with your actual data)
sample_data = pd.DataFrame({{
# Add your feature columns here
# 'feature1': [value1],
# 'feature2': [value2],
}})
# Make predictions
predictions = predict(model, sample_data)
print("Predictions:", predictions)
'''
st.download_button(
label="๐ฅ Download Script",
data=script_content,
file_name=f"{selected_model.replace(' ', '_')}_prediction_script.py",
mime="text/plain"
)
st.success("โ
Prediction script ready!")
with col3:
if st.button("๐ณ Generate Dockerfile"):
dockerfile_content = '''
FROM python:3.9-slim
WORKDIR /app
# Copy requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy model and script
COPY model.pkl .
COPY app.py .
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "app.py"]
'''
requirements_content = '''
pandas==1.5.3
scikit-learn==1.3.0
numpy==1.24.3
flask==2.3.2
'''
col_a, col_b = st.columns(2)
with col_a:
st.download_button(
label="๐ฅ Download Dockerfile",
data=dockerfile_content,
file_name="Dockerfile",
mime="text/plain"
)
with col_b:
st.download_button(
label="๐ฅ Download Requirements",
data=requirements_content,
file_name="requirements.txt",
mime="text/plain"
)
st.success("โ
Docker files ready!")
# Model API endpoint generator
st.subheader("๐ API Endpoint Generator")
if st.button("๐ง Generate Flask API"):
api_code = f'''
from flask import Flask, request, jsonify
import pandas as pd
import pickle
import numpy as np
app = Flask(__name__)
# Load model at startup
model = None
def load_model():
global model
with open('model.pkl', 'rb') as f:
model = pickle.load(f)
@app.route('/predict', methods=['POST'])
def predict():
try:
# Get data from request
data = request.get_json()
# Convert to DataFrame
df = pd.DataFrame([data])
# Make prediction
prediction = model.predict(df)
# Return result
return jsonify({{
'prediction': prediction.tolist(),
'status': 'success'
}})
except Exception as e:
return jsonify({{
'error': str(e),
'status': 'error'
}}), 400
@app.route('/health', methods=['GET'])
def health():
return jsonify({{'status': 'healthy'}})
if __name__ == '__main__':
load_model()
app.run(host='0.0.0.0', port=8000, debug=False)
'''
st.download_button(
label="๐ฅ Download Flask API",
data=api_code,
file_name="app.py",
mime="text/plain"
)
st.success("โ
Flask API code ready!")
# Deployment instructions
st.subheader("๐ Deployment Instructions")
st.markdown("""
### ๐ Deployment Steps:
1. **Local Deployment:**
- Download the model pickle file
- Download the prediction script or Flask API
- Install required dependencies: `pip install -r requirements.txt`
- Run the application: `python app.py`
2. **Docker Deployment:**
- Download all generated files (Dockerfile, requirements.txt, app.py, model.pkl)
- Build image: `docker build -t my-ml-app .`
- Run container: `docker run -p 8000:8000 my-ml-app`
3. **Cloud Deployment:**
- **AWS**: Upload to EC2 or use ECS with the Docker image
- **GCP**: Deploy to Google Cloud Run or App Engine
- **Azure**: Use Azure Container Instances or App Service
- **Heroku**: Push Docker image to Heroku Container Registry
4. **API Usage Example:**
```bash
curl -X POST http://localhost:8000/predict \
-H "Content-Type: application/json" \
-d '{"feature1": 1.0, "feature2": 2.0}'
```
""")
# Model performance summary
if selected_model != "PyCaret Best Model" and selected_model in st.session_state.trained_models:
st.subheader("๐ Model Summary for Deployment")
model_data = st.session_state.trained_models[selected_model]
col1, col2 = st.columns(2)
with col1:
st.write("**Model Details:**")
st.write(f"- **Type:** {selected_model}")
st.write(f"- **Problem Type:** {model_data['problem_type']}")
st.write(f"- **Features:** {len(model_data['features'])}")
st.write(f"- **Target:** {model_data['target']}")
with col2:
if 'predictions' in model_data:
y_test = model_data['y_test']
predictions = model_data['predictions']
st.write("**Performance Metrics:**")
if model_data['problem_type'] == "Regression":
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
st.write(f"- **Rยฒ Score:** {r2:.4f}")
st.write(f"- **MAE:** {mae:.4f}")
else:
accuracy = accuracy_score(y_test, predictions)
st.write(f"- **Accuracy:** {accuracy:.4f}")
# ================== FOOTER ==================
# Three-column footer: dataset stats, global reset action, app info.
st.markdown("---")
col1, col2, col3 = st.columns(3)
with col1:
    st.markdown("### ๐ Quick Stats")
    if st.session_state.df is not None:
        st.write(f"Dataset: {st.session_state.df.shape[0]} rows ร {st.session_state.df.shape[1]} cols")
    # Model count is shown regardless of data being loaded — assumed intent;
    # original indentation was stripped so nesting could not be verified.
    st.write(f"Models Trained: {len(st.session_state.trained_models)}")
with col2:
    st.markdown("### ๐ Quick Actions")
    if st.button("๐ Reset All Data", key="footer_reset"):
        # Wipe session state but preserve auth/demo flags so the user stays
        # logged in. Iterate a copied key list: deleting while iterating the
        # live view would raise.
        for key in list(st.session_state.keys()):
            if key not in ['authenticated', 'demo_mode']:
                del st.session_state[key]
        # BUGFIX: this success string was broken across two lines (mojibake).
        st.success("✅ All data reset!")
        st.rerun()
with col3:
    st.markdown("### โน๏ธ App Info")
    st.write("Super Data Science App v2.0")
    st.write(f"Session: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
# ================== SIDEBAR STATUS ==================
# Live status badges: data, trained models, PyCaret setup, DL models,
# and the availability of each optional library.
st.sidebar.markdown("---")
st.sidebar.subheader("๐ Current Status")
# Data status
if st.session_state.df is not None:
    # BUGFIX: the '✅ ...' strings below were each broken across two physical
    # lines by an encoding mangle — rejoined into single valid literals.
    st.sidebar.success(f"✅ Data Loaded ({st.session_state.df.shape[0]} rows)")
else:
    st.sidebar.warning("โ ๏ธ No Data Loaded")
# Models status
if st.session_state.trained_models:
    st.sidebar.success(f"✅ {len(st.session_state.trained_models)} Classical Models")
else:
    st.sidebar.info("โน๏ธ No Classical Models")
if st.session_state.pycaret_setup_done:
    st.sidebar.success("✅ PyCaret Setup Complete")
else:
    st.sidebar.info("โน๏ธ PyCaret Not Setup")
if st.session_state.dl_models:
    st.sidebar.success(f"✅ {len(st.session_state.dl_models)} Deep Learning Models")
else:
    st.sidebar.info("โน๏ธ No Deep Learning Models")
# Available libraries status
st.sidebar.markdown("---")
st.sidebar.subheader("๐ Libraries Status")
st.sidebar.write(f"PyCaret: {'✅' if PYCARET_AVAILABLE else '❌'}")
st.sidebar.write(f"PyTorch: {'✅' if TORCH_AVAILABLE else '❌'}")
st.sidebar.write(f"MLflow: {'✅' if MLFLOW_AVAILABLE else '❌'}")
st.sidebar.write(f"SHAP: {'✅' if SHAP_AVAILABLE else '❌'}")
# BUGFIX: this disabled line was only half-commented (its continuation was live
# code), which was a syntax error — now fully commented out.
# st.sidebar.write(f"Profiling: {'✅' if PROFILING_AVAILABLE else '❌'}")
# Help section
# Static quick-start guide shown at the bottom of the sidebar.
st.sidebar.markdown("---")
st.sidebar.subheader("โ Need Help?")
# Markdown content kept flush-left inside the string so it is not rendered
# as a markdown code block.
st.sidebar.markdown("""
**Quick Start:**
1. ๐ Load data (sample or upload)
2. ๐ Explore with EDA
3. ๐ค Train models (Classical or AutoML)
4. ๐ฏ Evaluate performance
5. ๐ Deploy your model

**Tips:**
- Use sample data for quick testing
- PyCaret AutoML for best results
- Export models for production use
""")
# Advanced features hint — shown on demand to keep the sidebar compact.
if st.sidebar.button("๐ฏ Show Advanced Tips"):
    st.sidebar.info("""
**Advanced Features:**
- Feature engineering in EDA
- Hyperparameter tuning in Classical ML
- Cross-validation in PyCaret
- SHAP explanations for interpretability
- MLflow for experiment tracking
- Docker deployment ready
""")
# Debug mode for development
# Lists non-internal session-state keys so developers can inspect app state.
if st.sidebar.checkbox("๐ Debug Mode", key="debug_mode"):
    st.sidebar.subheader("๐ง Debug Info")
    st.sidebar.write("Session State Keys:")
    for key in st.session_state.keys():
        # Skip Streamlit-internal keys (leading underscore).
        if not key.startswith('_'):
            st.sidebar.write(f"- {key}")
# Performance optimization note
st.sidebar.markdown("---")
st.sidebar.caption("๐ก For large datasets, consider using data sampling for faster processing")
st.sidebar.caption(f"โฐ Last updated: {datetime.now().strftime('%H:%M:%S')}")
# Auto-refresh data (for development)
if st.sidebar.button("๐ Auto Refresh", key="auto_refresh"):
    st.rerun()
# Export session state
# Downloads a small text summary of the current session (counts + flags),
# not the actual models/data.
if st.sidebar.button("๐พ Export Session", key="export_session"):
    session_data = {
        'trained_models_count': len(st.session_state.trained_models),
        'data_loaded': st.session_state.df is not None,
        'pycaret_setup': st.session_state.pycaret_setup_done,
        'timestamp': datetime.now().isoformat()
    }
    # str(dict) is intentional here — the file is informational, not re-parsed.
    st.sidebar.download_button(
        label="๐ฅ Download Session Info",
        data=str(session_data),
        file_name=f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
        mime="text/plain"
    )
# Success message for completion: data loaded + classical models trained +
# PyCaret setup all done.
if (st.session_state.df is not None and
        st.session_state.trained_models and
        st.session_state.pycaret_setup_done):
    st.sidebar.success("๐ Full Pipeline Complete!")
    st.sidebar.balloons()
# Warning for missing dependencies
# Collect optional libraries that failed to import and offer the pip command.
missing_deps = []
if not PYCARET_AVAILABLE:
    missing_deps.append("pycaret")
if not MLFLOW_AVAILABLE:
    missing_deps.append("mlflow")
if not SHAP_AVAILABLE:
    missing_deps.append("shap")
# if not PROFILING_AVAILABLE:
#     missing_deps.append("ydata-profiling")
if missing_deps:
    st.sidebar.warning(f"โ ๏ธ Missing: {', '.join(missing_deps)}")
    st.sidebar.code(f"pip install {' '.join(missing_deps)}")
# Fun facts
fun_facts = [
    "๐ง Machine Learning can predict with 95%+ accuracy in many domains",
    "๐ AutoML can save 80% of model development time",
    "๐ Feature engineering often provides the biggest performance boost",
    "๐ฌ Model explainability is crucial for production deployment",
    "โก Ensemble methods usually outperform single models",
    "๐ Cross-validation prevents overfitting better than simple train/test split"
]
# NOTE(review): mid-file import kept to avoid touching the file header;
# ideally move to the top-of-file import block.
import random
if st.sidebar.button("๐ก Random ML Tip", key="random_tip"):
    st.sidebar.info(random.choice(fun_facts))
# Resource links
# External documentation links shown at the bottom of the sidebar.
st.sidebar.markdown("---")
st.sidebar.subheader("๐ Resources")
st.sidebar.markdown("""
- [PyCaret Documentation](https://pycaret.org/)
- [MLflow Documentation](https://mlflow.org/)
- [SHAP Tutorials](https://shap.readthedocs.io/)
- [Scikit-learn Guide](https://scikit-learn.org/)
""")
# Version info and credits
st.sidebar.markdown("---")
st.sidebar.caption("๐ Super Data Science App")
st.sidebar.caption("Version 2.0 - Full Pipeline")
st.sidebar.caption("Built with Streamlit โค๏ธ")