# Streamlit "Super Data Science App" — single-file application
# (deployed as a Hugging Face Space)
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)

warnings.filterwarnings('ignore')

# Optional dependencies: each guarded import sets a *_AVAILABLE flag so the
# rest of the app can degrade gracefully when a package is missing.

# MLflow and experiment tracking
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False
    # NOTE(review): this st.warning() runs before st.set_page_config() below;
    # Streamlit requires set_page_config to be the first Streamlit command, so
    # this call would error out on a fresh session if MLflow is absent — confirm
    # and consider deferring the warning until after page config.
    st.warning("MLflow not installed. Some features may be limited.")

# PyCaret imports
try:
    from pycaret.classification import setup as cls_setup, compare_models as cls_compare, create_model as cls_create
    from pycaret.classification import tune_model as cls_tune, finalize_model as cls_finalize, predict_model as cls_predict
    from pycaret.classification import pull as cls_pull, plot_model as cls_plot, evaluate_model as cls_evaluate
    from pycaret.regression import setup as reg_setup, compare_models as reg_compare, create_model as reg_create
    from pycaret.regression import tune_model as reg_tune, finalize_model as reg_finalize, predict_model as reg_predict
    from pycaret.regression import pull as reg_pull, plot_model as reg_plot, evaluate_model as reg_evaluate
    PYCARET_AVAILABLE = True
except ImportError:
    PYCARET_AVAILABLE = False
    # NOTE(review): same set_page_config ordering concern as above.
    st.warning("PyCaret not installed. AutoML features will be limited.")

# Data profiling (currently disabled)
#try:
#    from ydata_profiling import ProfileReport
#    from streamlit_pandas_profiling import st_profile_report
#    PROFILING_AVAILABLE = True
#except ImportError:
#    PROFILING_AVAILABLE = False

# PyTorch for deep learning
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# SHAP for explainability
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

# Scikit-learn imports
# NOTE(review): every name below is already imported at the top of the file;
# these re-imports are harmless but redundant.
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
| # ================== CUSTOM CSS & STYLING ================== | |
# ================== CUSTOM CSS & STYLING ==================
# Page-level configuration. Streamlit requires this to be the first Streamlit
# command executed in the script.
st.set_page_config(
    page_title="🚀 Super Data Science App",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🚀"
)

# Inject global CSS (gradient background, glass-effect cards, rounded buttons).
st.markdown("""
<style>
/* Main styling */
.main {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    font-family: 'Arial', sans-serif;
}
/* Sidebar styling */
.sidebar .sidebar-content {
    background: linear-gradient(180deg, #2C3E50, #3498DB);
    color: white;
}
/* Button styling */
.stButton > button {
    background: linear-gradient(45deg, #FF6B6B, #4ECDC4);
    color: white;
    border: none;
    border-radius: 25px;
    padding: 0.6rem 1.5rem;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}
.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 25px 0 rgba(31, 38, 135, 0.37);
}
/* Metric styling */
.metric-container {
    background: rgba(255, 255, 255, 0.1);
    backdrop-filter: blur(10px);
    border-radius: 15px;
    padding: 1rem;
    margin: 0.5rem 0;
    border: 1px solid rgba(255, 255, 255, 0.2);
}
/* Header styling */
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: rgba(255, 255, 255, 0.1);
    backdrop-filter: blur(10px);
    border-radius: 20px;
    margin-bottom: 2rem;
    border: 1px solid rgba(255, 255, 255, 0.2);
}
/* Success/Error messages */
.stSuccess, .stError, .stWarning {
    border-radius: 10px;
    border: none;
}
</style>
""", unsafe_allow_html=True)
| # ================== HEADER ================== | |
# ================== HEADER ==================
# Hero banner rendered as raw HTML (styled by the .main-header CSS rule above).
st.markdown("""
<div class="main-header">
<h1 style="color: white; font-size: 3rem; margin-bottom: 0;">🚀 Super Data Science App</h1>
<p style="color: rgba(255,255,255,0.8); font-size: 1.2rem;">
Complete ML Pipeline: EDA → Modeling → AutoML → Explainability → Deployment
</p>
</div>
""", unsafe_allow_html=True)
| # ================== AUTHENTICATION ================== | |
# ================== AUTHENTICATION ==================
def check_authentication():
    """Gate the whole app behind a simple password / demo-mode prompt.

    Renders a login form in the sidebar and calls st.stop() so nothing below
    runs until st.session_state.authenticated becomes True. Demo mode also
    sets st.session_state.demo_mode = True.

    NOTE(review): the password is hardcoded in source and compared with a
    plain ``==`` — fine for a demo Space, not for anything sensitive.
    """
    if 'authenticated' not in st.session_state:
        st.session_state.authenticated = False
    if not st.session_state.authenticated:
        with st.sidebar:
            st.header("🔒 Authentication")
            password = st.text_input("Enter Password", type="password", key="auth_password")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🔑 Login", key="login_btn"):
                    if password == "ds4everyone":
                        st.session_state.authenticated = True
                        st.success("✅ Access Granted!")
                        # Rerun so the main app renders immediately after login.
                        st.rerun()
                    else:
                        st.error("❌ Incorrect Password")
            with col2:
                if st.button("👤 Demo Mode", key="demo_btn"):
                    st.session_state.authenticated = True
                    st.session_state.demo_mode = True
                    st.info("📊 Demo Mode Activated")
                    st.rerun()
        st.info("🔐 Please authenticate to access the application")
        # Halt script execution for unauthenticated sessions.
        st.stop()

check_authentication()
| # ================== SESSION STATE INITIALIZATION ================== | |
# ================== SESSION STATE INITIALIZATION ==================
# All cross-page state is seeded here, once per session, so every page can
# read these keys without guarding against AttributeError.
_SESSION_DEFAULTS = {
    'df': None,                    # currently loaded DataFrame
    'trained_models': {},          # name -> {model, X_test, y_test, ...}
    'pycaret_setup_done': False,   # PyCaret environment ready?
    'best_model': None,            # finalized PyCaret model
    'dl_models': {},               # trained deep-learning models
    'training_history': {},        # per-model training curves
    # Fix: previously this key was only initialized inside the "Upload CSV"
    # branch of the Data Loading page, so visiting the PyCaret page first
    # raised AttributeError on st.session_state.model_comparison.
    'model_comparison': None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
| # ================== SIDEBAR NAVIGATION ================== | |
# ================== SIDEBAR NAVIGATION ==================
# Single selectbox drives which page branch below gets rendered.
st.sidebar.title("🧭 Navigation")
pages = [
    "🏠 Home",
    "📊 Data Loading",
    "🔍 EDA & Profiling",
    "📈 Visualization",
    "🤖 Classical ML",
    "⚡ PyCaret AutoML",
    "🧠 Deep Learning",
    "🎯 Model Evaluation",
    "🔬 Explainability",
    "📋 MLflow Tracking",
    "🚀 Model Deployment"
]
selected_page = st.sidebar.selectbox("Select Page", pages, key="page_selector")
| # ================== UTILITY FUNCTIONS ================== | |
# ================== UTILITY FUNCTIONS ==================
def load_sample_data(dataset_name):
    """Load one of the bundled sample datasets.

    Parameters
    ----------
    dataset_name : str
        One of "California Housing", "Iris", "Wine Quality", "Titanic".

    Returns
    -------
    pandas.DataFrame or None
        The dataset (subsampled where large, to keep the UI responsive),
        or None when the name is unknown or a remote download failed
        (an st.error is shown in that case).
    """
    if dataset_name == "California Housing":
        from sklearn.datasets import fetch_california_housing
        data = fetch_california_housing(as_frame=True)
        df = pd.concat([data.data, data.target.rename('MedHouseVal')], axis=1)
        return df.sample(n=min(2000, len(df)))  # Limit for performance
    if dataset_name == "Iris":
        from sklearn.datasets import load_iris
        data = load_iris(as_frame=True)
        return pd.concat([data.data, data.target.rename('species')], axis=1)
    if dataset_name == "Wine Quality":
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
        try:
            df = pd.read_csv(url, sep=';')
            return df.sample(n=min(1000, len(df)))
        # Fix: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; network/parse failures are Exception subclasses.
        except Exception:
            st.error("Could not load Wine Quality dataset")
            return None
    if dataset_name == "Titanic":
        url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
        try:
            return pd.read_csv(url)
        except Exception:
            st.error("Could not load Titanic dataset")
            return None
    # Unknown dataset name: make the implicit None explicit.
    return None
def get_dataset_info(df):
    """Summarize a DataFrame for display: shape, columns, dtypes, missing
    counts, memory footprint, and numeric vs. categorical column split."""
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2
    return {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'dtypes': df.dtypes.to_dict(),
        'missing_values': df.isnull().sum().to_dict(),
        'memory_usage': f"{mem_mb:.2f} MB",
        'numeric_columns': numeric_columns,
        'categorical_columns': categorical_columns,
    }
| # ================== PAGE CONTENT ================== | |
# ================== PAGE CONTENT ==================
# ---- Home page: feature overview plus quick stats for any loaded data ----
if selected_page == "🏠 Home":
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.markdown("""
## Welcome to the Super Data Science App! 🎉
This comprehensive application provides a complete machine learning pipeline:
""")
        features = [
            "📊 **Data Loading**: Upload CSV or use sample datasets",
            "🔍 **EDA & Profiling**: Automated data profiling and exploration",
            "📈 **Visualization**: Interactive charts with Plotly and Seaborn",
            "🤖 **Classical ML**: Scikit-learn models with hyperparameter tuning",
            "⚡ **PyCaret AutoML**: Automated machine learning with model comparison",
            "🎯 **Model Evaluation**: Comprehensive model performance analysis",
            "🔬 **Explainability**: SHAP values and feature importance",
            "📋 **MLflow Tracking**: Experiment tracking and model versioning",
            "🚀 **Model Deployment**: Model export and deployment preparation"
        ]
        for feature in features:
            st.markdown(feature)
        st.markdown("---")
        # Quick stats (only once a dataset has been loaded)
        if st.session_state.df is not None:
            col_a, col_b, col_c, col_d = st.columns(4)
            with col_a:
                st.metric("📊 Rows", f"{st.session_state.df.shape[0]:,}")
            with col_b:
                st.metric("📋 Columns", f"{st.session_state.df.shape[1]:,}")
            with col_c:
                st.metric("🤖 Models Trained", len(st.session_state.trained_models))
            with col_d:
                st.metric("✅ Setup Complete", "Ready" if st.session_state.pycaret_setup_done else "Pending")
# ---- Data Loading page: CSV upload, sample datasets, dataset info, preview ----
elif selected_page == "📊 Data Loading":
    st.header("📊 Data Loading & Management")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Data Source")
        data_source = st.radio(
            "Choose data source:",
            ["📁 Upload CSV", "🎲 Sample Datasets", "📋 Current Data Info"]
        )
    with col2:
        if data_source == "📁 Upload CSV":
            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
            if uploaded_file is not None:
                try:
                    df = pd.read_csv(uploaded_file)
                    st.session_state.df = df
                    st.success(f"✅ Successfully loaded {df.shape[0]} rows and {df.shape[1]} columns")
                except Exception as e:
                    st.error(f"❌ Error loading file: {str(e)}")
            # NOTE(review): this session-state seed looks misplaced — it only
            # runs when the upload branch is rendered, yet the PyCaret page
            # reads st.session_state.model_comparison unconditionally. It
            # belongs with the other session-state defaults near the top.
            if 'model_comparison' not in st.session_state:
                st.session_state.model_comparison = None
        elif data_source == "🎲 Sample Datasets":
            sample_options = ["California Housing", "Iris", "Wine Quality", "Titanic"]
            selected_sample = st.selectbox("Choose sample dataset:", sample_options)
            if st.button(f"🔄 Load {selected_sample} Dataset"):
                with st.spinner(f"Loading {selected_sample}..."):
                    df = load_sample_data(selected_sample)
                    if df is not None:
                        st.session_state.df = df
                        st.success(f"✅ Loaded {selected_sample} dataset!")
        elif data_source == "📋 Current Data Info":
            if st.session_state.df is not None:
                info = get_dataset_info(st.session_state.df)
                col_a, col_b = st.columns(2)
                with col_a:
                    st.metric("📊 Rows", f"{info['shape'][0]:,}")
                    st.metric("📋 Columns", f"{info['shape'][1]:,}")
                    st.metric("💾 Memory Usage", info['memory_usage'])
                with col_b:
                    st.metric("🔢 Numeric Columns", len(info['numeric_columns']))
                    st.metric("📝 Categorical Columns", len(info['categorical_columns']))
                    st.metric("❌ Missing Values", sum(info['missing_values'].values()))
            else:
                st.info("🔍 No data loaded yet")
    # Data Preview (shown for any source once data exists)
    if st.session_state.df is not None:
        st.subheader("📋 Data Preview")
        col1, col2, col3 = st.columns(3)
        with col1:
            show_rows = st.slider("Rows to display", 5, 50, 10)
        with col2:
            show_info = st.checkbox("Show column info", value=True)
        with col3:
            if st.button("💾 Download Current Data"):
                csv = st.session_state.df.to_csv(index=False)
                st.download_button(
                    label="📥 Download CSV",
                    data=csv,
                    file_name=f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime='text/csv'
                )
        # Display data
        st.dataframe(st.session_state.df.head(show_rows), use_container_width=True)
        if show_info:
            st.subheader("📊 Column Information")
            # Per-column dtype / completeness table.
            info_df = pd.DataFrame({
                'Column': st.session_state.df.columns,
                'Data Type': st.session_state.df.dtypes,
                'Non-Null Count': st.session_state.df.count(),
                'Missing Values': st.session_state.df.isnull().sum(),
                'Missing %': (st.session_state.df.isnull().sum() / len(st.session_state.df) * 100).round(2)
            })
            st.dataframe(info_df, use_container_width=True)
# ---- EDA page: quick stats, missing-value analysis, distributions, correlations ----
elif selected_page == "🔍 EDA & Profiling":
    st.header("🔍 Exploratory Data Analysis & Profiling")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Quick EDA
    st.subheader("📊 Quick Statistics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📏 Dataset Shape", f"{df.shape[0]} × {df.shape[1]}")
    with col2:
        st.metric("🔢 Numeric Columns", len(df.select_dtypes(include=[np.number]).columns))
    with col3:
        st.metric("📝 Text Columns", len(df.select_dtypes(exclude=[np.number]).columns))
    with col4:
        st.metric("❌ Missing Values", df.isnull().sum().sum())
    # Missing Values Analysis
    st.subheader("❌ Missing Values Analysis")
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Missing Count': df.isnull().sum(),
        'Missing Percentage': (df.isnull().sum() / len(df) * 100).round(2)
    }).sort_values('Missing Count', ascending=False)
    # Keep only columns that actually have gaps.
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    if len(missing_df) > 0:
        st.dataframe(missing_df, use_container_width=True)
        # Missing values heatmap
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis')
        plt.title('Missing Values Heatmap')
        st.pyplot(fig)
    else:
        st.success("✅ No missing values found in the dataset!")
    # Statistical Summary
    st.subheader("📈 Statistical Summary")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        st.dataframe(df[numeric_cols].describe(), use_container_width=True)
        # Distribution plots
        st.subheader("📊 Distribution Analysis")
        selected_cols = st.multiselect("Select columns for distribution analysis:", numeric_cols, default=numeric_cols[:3])
        if selected_cols:
            cols_per_row = 2
            # Ceiling division: enough rows for all selected columns.
            n_rows = (len(selected_cols) + cols_per_row - 1) // cols_per_row
            fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 5*n_rows))
            # Normalize `axes` to a flat indexable sequence regardless of grid shape.
            if n_rows == 1:
                axes = [axes] if cols_per_row == 1 else axes
            else:
                axes = axes.flatten()
            for i, col in enumerate(selected_cols):
                sns.histplot(data=df, x=col, kde=True, ax=axes[i])
                axes[i].set_title(f'Distribution of {col}')
            # Hide empty subplots
            for i in range(len(selected_cols), len(axes)):
                axes[i].set_visible(False)
            plt.tight_layout()
            st.pyplot(fig)
    # Correlation Analysis (needs at least two numeric columns)
    if len(numeric_cols) > 1:
        st.subheader("🔗 Correlation Analysis")
        corr_matrix = df[numeric_cols].corr()
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', ax=ax)
        plt.title('Correlation Matrix')
        st.pyplot(fig)
    # Automated Profiling Report (disabled together with the ydata imports above)
    # if PROFILING_AVAILABLE:
    #     st.subheader("📋 Automated Profiling Report")
    #     if st.button("🔄 Generate Comprehensive Profile Report"):
    #         with st.spinner("Generating detailed profiling report..."):
    #             profile = ProfileReport(df, title="Dataset Profiling Report", explorative=True)
    #             st_profile_report(profile)
# ---- Visualization page: user-driven Plotly charts over the loaded data ----
elif selected_page == "📈 Visualization":
    st.header("📈 Interactive Data Visualization")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    # Visualization controls
    st.subheader("🎛️ Visualization Controls")
    col1, col2, col3 = st.columns(3)
    with col1:
        viz_type = st.selectbox("Select visualization type:", [
            "📊 Histogram", "📈 Scatter Plot", "📦 Box Plot",
            "🔥 Heatmap", "📉 Line Plot", "🎯 Pair Plot"
        ])
    with col2:
        # Two-axis charts pick x and y; single-column charts pick one column.
        if viz_type in ["📈 Scatter Plot", "📉 Line Plot"]:
            x_col = st.selectbox("X-axis:", numeric_cols + categorical_cols)
            y_col = st.selectbox("Y-axis:", numeric_cols)
        else:
            selected_col = st.selectbox("Select column:", numeric_cols if viz_type != "📦 Box Plot" else df.columns)
    with col3:
        # Optional categorical hue for chart types that support it.
        if categorical_cols and viz_type in ["📊 Histogram", "📈 Scatter Plot", "📦 Box Plot"]:
            color_col = st.selectbox("Color by (optional):", ["None"] + categorical_cols)
            color_col = None if color_col == "None" else color_col
        else:
            color_col = None
    # Generate visualizations
    st.subheader("📊 Visualization Output")
    try:
        if viz_type == "📊 Histogram":
            fig = px.histogram(df, x=selected_col, color=color_col,
                               title=f'Distribution of {selected_col}',
                               marginal="box")
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "📈 Scatter Plot":
            # OLS trendline only when not split by color (one fit per group otherwise).
            fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                             title=f'{y_col} vs {x_col}',
                             trendline="ols" if color_col is None else None)
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "📦 Box Plot":
            if color_col:
                fig = px.box(df, y=selected_col, x=color_col,
                             title=f'Box Plot of {selected_col} by {color_col}')
            else:
                fig = px.box(df, y=selected_col,
                             title=f'Box Plot of {selected_col}')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "🔥 Heatmap":
            if len(numeric_cols) > 1:
                corr_matrix = df[numeric_cols].corr()
                fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                title="Correlation Heatmap")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need at least 2 numeric columns for correlation heatmap")
        elif viz_type == "📉 Line Plot":
            # Sort by x so the line is drawn left-to-right rather than in row order.
            fig = px.line(df.sort_values(x_col), x=x_col, y=y_col,
                          title=f'{y_col} vs {x_col} (Line Plot)')
            st.plotly_chart(fig, use_container_width=True)
        elif viz_type == "🎯 Pair Plot":
            if len(numeric_cols) >= 2:
                selected_numeric = st.multiselect("Select numeric columns for pair plot:",
                                                  numeric_cols, default=numeric_cols[:4])
                if len(selected_numeric) >= 2:
                    fig = px.scatter_matrix(df, dimensions=selected_numeric, color=color_col,
                                            title="Pair Plot Matrix")
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.warning("Please select at least 2 numeric columns")
            else:
                st.warning("Need at least 2 numeric columns for pair plot")
    except Exception as e:
        st.error(f"Error generating visualization: {str(e)}")
    # Additional visualizations
    st.subheader("📊 Additional Insights")
    # Value counts for categorical columns
    if categorical_cols:
        st.write("**Categorical Column Distributions:**")
        for col in categorical_cols[:3]:  # Limit to first 3
            if df[col].nunique() <= 20:  # Only show if not too many categories
                fig = px.bar(df[col].value_counts().head(10),
                             title=f'Top 10 values in {col}')
                st.plotly_chart(fig, use_container_width=True)
# ---- Classical ML page: configure, train and report a scikit-learn model ----
elif selected_page == "🤖 Classical ML":
    st.header("🤖 Classical Machine Learning")
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # Model configuration
    st.subheader("⚙️ Model Configuration")
    col1, col2 = st.columns(2)
    with col1:
        # Target selection
        target_col = st.selectbox("🎯 Select target variable:", df.columns)
        # Feature selection
        available_features = [col for col in df.columns if col != target_col]
        selected_features = st.multiselect("📊 Select features:", available_features,
                                           default=available_features[:5])
    with col2:
        # Problem type heuristic: non-numeric target, or fewer than 10 distinct
        # values, is treated as classification.
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "Classification"
            st.info("🎯 Detected: Classification Problem")
            model_options = ["Logistic Regression", "Decision Tree", "Random Forest"]
        else:
            problem_type = "Regression"
            st.info("📈 Detected: Regression Problem")
            model_options = ["Linear Regression", "Decision Tree", "Random Forest"]
        selected_model = st.selectbox("🤖 Select model:", model_options)
        test_size = st.slider("🔄 Test set size:", 0.1, 0.5, 0.2, 0.05)
    if not selected_features:
        st.warning("⚠️ Please select at least one feature")
        st.stop()
    # Train on demand
    if st.button("🚀 Train Model"):
        with st.spinner("Training model..."):
            try:
                # Prepare data
                X = df[selected_features].copy()
                y = df[target_col].copy()
                # Handle missing values.
                # NOTE(review): when any numeric column exists this fills ALL
                # columns with X.mean(); on mixed frames X.mean() may raise or
                # skip object columns in newer pandas, leaving their NaNs
                # unfilled — confirm against the pandas version in use.
                X = X.fillna(X.mean() if X.select_dtypes(include=[np.number]).shape[1] > 0 else X.mode().iloc[0])
                # Encode categorical variables (one LabelEncoder per column,
                # kept in le_dict for potential inverse transforms).
                le_dict = {}
                for col in X.select_dtypes(include=['object']).columns:
                    le = LabelEncoder()
                    X[col] = le.fit_transform(X[col].astype(str))
                    le_dict[col] = le
                # Encode target if classification
                if problem_type == "Classification" and y.dtype == 'object':
                    target_le = LabelEncoder()
                    y = target_le.fit_transform(y)
                # Split data
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=42
                )
                # Instantiate the chosen estimator.
                if selected_model == "Linear Regression":
                    model = LinearRegression()
                elif selected_model == "Logistic Regression":
                    model = LogisticRegression(random_state=42, max_iter=1000)
                elif selected_model == "Decision Tree":
                    if problem_type == "Classification":
                        model = DecisionTreeClassifier(random_state=42)
                    else:
                        model = DecisionTreeRegressor(random_state=42)
                elif selected_model == "Random Forest":
                    if problem_type == "Classification":
                        model = RandomForestClassifier(random_state=42, n_estimators=100)
                    else:
                        model = RandomForestRegressor(random_state=42, n_estimators=100)
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                # Store model and artifacts for the evaluation/explainability pages.
                st.session_state.trained_models[selected_model] = {
                    'model': model,
                    'X_test': X_test,
                    'y_test': y_test,
                    'predictions': predictions,
                    'features': selected_features,
                    'target': target_col,
                    'problem_type': problem_type
                }
                st.success("✅ Model trained successfully!")
                # Display results
                st.subheader("📊 Model Performance")
                if problem_type == "Regression":
                    mse = mean_squared_error(y_test, predictions)
                    mae = mean_absolute_error(y_test, predictions)
                    r2 = r2_score(y_test, predictions)
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("MSE", f"{mse:.4f}")
                    with col2:
                        st.metric("MAE", f"{mae:.4f}")
                    with col3:
                        st.metric("R² Score", f"{r2:.4f}")
                    # Actual vs Predicted plot with an identity reference line.
                    fig = px.scatter(x=y_test, y=predictions,
                                     labels={'x': 'Actual', 'y': 'Predicted'},
                                     title='Actual vs Predicted Values')
                    fig.add_shape(type="line", x0=y_test.min(), y0=y_test.min(),
                                  x1=y_test.max(), y1=y_test.max(),
                                  line=dict(color="red", dash="dash"))
                    st.plotly_chart(fig, use_container_width=True)
                else:  # Classification
                    accuracy = accuracy_score(y_test, predictions)
                    precision = precision_score(y_test, predictions, average='weighted')
                    recall = recall_score(y_test, predictions, average='weighted')
                    f1 = f1_score(y_test, predictions, average='weighted')
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Accuracy", f"{accuracy:.4f}")
                    with col2:
                        st.metric("Precision", f"{precision:.4f}")
                    with col3:
                        st.metric("Recall", f"{recall:.4f}")
                    with col4:
                        st.metric("F1-Score", f"{f1:.4f}")
                    # Confusion Matrix
                    cm = confusion_matrix(y_test, predictions)
                    fig, ax = plt.subplots(figsize=(8, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                    ax.set_title('Confusion Matrix')
                    ax.set_xlabel('Predicted')
                    ax.set_ylabel('Actual')
                    st.pyplot(fig)
                # Feature importance (for tree-based models)
                if hasattr(model, 'feature_importances_'):
                    st.subheader("📊 Feature Importance")
                    importance_df = pd.DataFrame({
                        'Feature': selected_features,
                        'Importance': model.feature_importances_
                    }).sort_values('Importance', ascending=False)
                    fig = px.bar(importance_df, x='Importance', y='Feature',
                                 orientation='h', title='Feature Importance')
                    st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"❌ Error training model: {str(e)}")
# ---- PyCaret AutoML page: setup, compare, create/tune and finalize models ----
elif selected_page == "⚡ PyCaret AutoML":
    st.header("⚡ PyCaret AutoML")
    if not PYCARET_AVAILABLE:
        st.error("❌ PyCaret is not installed. Please install it to use AutoML features.")
        st.stop()
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first from the Data Loading page")
        st.stop()
    df = st.session_state.df
    # AutoML Configuration
    st.subheader("⚙️ AutoML Configuration")
    col1, col2 = st.columns(2)
    with col1:
        target_col = st.selectbox("🎯 Select target variable:", df.columns, key="pycaret_target")
        # Auto-detect problem type (same heuristic as the Classical ML page).
        if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10:
            problem_type = "classification"
            st.info("🎯 Detected: Classification Problem")
        else:
            problem_type = "regression"
            st.info("📈 Detected: Regression Problem")
    with col2:
        train_size = st.slider("🔄 Training set size:", 0.5, 0.9, 0.8, 0.05)
        # NOTE(review): if len(df) < 500 the slider max falls below its min
        # of 500 — confirm behavior with very small datasets.
        sample_size = st.slider("📊 Sample size (for performance):", 500, min(5000, len(df)), min(2000, len(df)))
    # Subsample large datasets to keep AutoML runs fast.
    if len(df) > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
        st.info(f"📊 Using {sample_size} samples for faster processing")
    else:
        df_sample = df.copy()
    # Advanced settings
    with st.expander("🔧 Advanced Settings"):
        col1, col2 = st.columns(2)
        with col1:
            cross_validation = st.checkbox("🔄 Cross Validation", value=True)
            normalize = st.checkbox("📏 Normalize Features", value=True)
        with col2:
            remove_outliers = st.checkbox("🚫 Remove Outliers", value=False)
            feature_selection = st.checkbox("🎯 Feature Selection", value=False)
    # Setup PyCaret Environment
    if st.button("🚀 Setup PyCaret Environment"):
        with st.spinner("Setting up PyCaret environment..."):
            try:
                # NOTE(review): `silent=True` was removed in PyCaret 3.x —
                # confirm the pinned PyCaret version, this kwarg may raise.
                if problem_type == "classification":
                    st.session_state.pycaret_exp = cls_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                else:
                    st.session_state.pycaret_exp = reg_setup(
                        data=df_sample,
                        target=target_col,
                        train_size=train_size,
                        session_id=42,
                        normalize=normalize,
                        remove_outliers=remove_outliers,
                        feature_selection=feature_selection,
                        silent=True
                    )
                st.session_state.pycaret_setup_done = True
                st.session_state.pycaret_problem_type = problem_type
                st.success("✅ PyCaret environment setup complete!")
            except Exception as e:
                st.error(f"❌ Error setting up PyCaret: {str(e)}")
    # Model Comparison (only after setup has succeeded)
    if st.session_state.pycaret_setup_done:
        st.subheader("📊 Model Comparison")
        if st.button("🔄 Compare Models"):
            with st.spinner("Comparing multiple models..."):
                try:
                    if st.session_state.pycaret_problem_type == "classification":
                        comparison_df = cls_compare(
                            include=['lr', 'rf', 'et', 'nb', 'dt', 'svm'],
                            sort='Accuracy',
                            n_select=5
                        )
                        # pull() returns the scoring grid of the last command.
                        st.session_state.model_comparison = cls_pull()
                    else:
                        comparison_df = reg_compare(
                            include=['lr', 'rf', 'et', 'dt', 'huber'],
                            sort='R2',
                            n_select=5
                        )
                        st.session_state.model_comparison = reg_pull()
                    st.success("✅ Model comparison complete!")
                except Exception as e:
                    st.error(f"❌ Error comparing models: {str(e)}")
        # Display comparison results.
        # NOTE(review): this attribute access assumes model_comparison was
        # seeded in session state (it is only initialized on the Data Loading
        # page) — visiting this page first can raise AttributeError.
        if st.session_state.model_comparison is not None:
            st.subheader("📈 Model Comparison Results")
            st.dataframe(st.session_state.model_comparison, use_container_width=True)
            # Select best model (PyCaret estimator IDs)
            best_model_name = st.selectbox(
                "🏆 Select model for tuning:",
                ['lr', 'rf', 'et', 'dt', 'nb', 'svm'] if st.session_state.pycaret_problem_type == "classification"
                else ['lr', 'rf', 'et', 'dt', 'huber']
            )
            # Create and tune model
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🎯 Create Model"):
                    with st.spinner("Creating model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                model = cls_create(best_model_name)
                            else:
                                model = reg_create(best_model_name)
                            st.session_state.pycaret_model = model
                            st.success("✅ Model created successfully!")
                        except Exception as e:
                            st.error(f"❌ Error creating model: {str(e)}")
            with col2:
                if st.button("⚡ Tune Hyperparameters"):
                    if 'pycaret_model' in st.session_state:
                        with st.spinner("Tuning hyperparameters..."):
                            try:
                                if st.session_state.pycaret_problem_type == "classification":
                                    tuned_model = cls_tune(st.session_state.pycaret_model,
                                                           optimize='Accuracy', n_iter=10)
                                else:
                                    tuned_model = reg_tune(st.session_state.pycaret_model,
                                                           optimize='R2', n_iter=10)
                                st.session_state.tuned_model = tuned_model
                                st.success("✅ Hyperparameter tuning complete!")
                            except Exception as e:
                                st.error(f"❌ Error tuning model: {str(e)}")
                    else:
                        st.warning("⚠️ Please create a model first")
            # Finalize model: prefer the tuned model, fall back to the created one.
            if st.button("🏁 Finalize Best Model"):
                if 'tuned_model' in st.session_state:
                    model_to_finalize = st.session_state.tuned_model
                elif 'pycaret_model' in st.session_state:
                    model_to_finalize = st.session_state.pycaret_model
                else:
                    st.warning("⚠️ Please create a model first")
                    model_to_finalize = None
                if model_to_finalize is not None:
                    with st.spinner("Finalizing model..."):
                        try:
                            if st.session_state.pycaret_problem_type == "classification":
                                final_model = cls_finalize(model_to_finalize)
                            else:
                                final_model = reg_finalize(model_to_finalize)
                            st.session_state.best_model = final_model
                            st.success("✅ Model finalized successfully!")
                        except Exception as e:
                            st.error(f"❌ Error finalizing model: {str(e)}")
elif selected_page == "🎯 Model Evaluation":
    st.header("🎯 Advanced Model Evaluation")
    # Guard: evaluation needs data loaded first.
    if st.session_state.df is None:
        st.warning("⚠️ Please load data first")
        st.stop()
    # Check for available models: classical models trained on this session
    # plus the finalized PyCaret model, if any.
    available_models = []
    if st.session_state.trained_models:
        available_models.extend(list(st.session_state.trained_models.keys()))
    if 'best_model' in st.session_state and st.session_state.best_model is not None:
        available_models.append("PyCaret Best Model")
    if not available_models:
        st.warning("⚠️ No trained models available. Please train a model first.")
        st.stop()
    selected_model_name = st.selectbox("📊 Select model to evaluate:", available_models)
    if selected_model_name == "PyCaret Best Model":
        if 'best_model' not in st.session_state:
            st.error("❌ PyCaret model not available")
            st.stop()
        model_info = st.session_state.best_model
        # Problem type recorded during PyCaret setup; default to regression
        # if the key is missing.
        problem_type = st.session_state.get('pycaret_problem_type', 'regression')
        st.subheader("📈 PyCaret Model Evaluation")
        # PyCaret built-in plots (only when the library is importable).
        if PYCARET_AVAILABLE:
            col1, col2 = st.columns(2)
            with col1:
                # Plot menus differ by problem type (PyCaret plot IDs).
                plot_types_cls = ['auc', 'confusion_matrix', 'class_report', 'pr', 'feature']
                plot_types_reg = ['residuals', 'feature', 'rfe', 'learning', 'vc']
                plot_types = plot_types_cls if problem_type == "classification" else plot_types_reg
                selected_plot = st.selectbox("📊 Select evaluation plot:", plot_types)
            with col2:
                if st.button("📊 Generate Plot"):
                    try:
                        with st.spinner("Generating plot..."):
                            # display_format='streamlit' renders inline in the app.
                            if problem_type == "classification":
                                cls_plot(model_info, plot=selected_plot, display_format='streamlit')
                            else:
                                reg_plot(model_info, plot=selected_plot, display_format='streamlit')
                    except Exception as e:
                        st.error(f"❌ Error generating plot: {str(e)}")
        # Model predictions on PyCaret's held-out set.
        if st.button("🔮 Generate Predictions"):
            try:
                with st.spinner("Generating predictions..."):
                    if problem_type == "classification":
                        predictions_df = cls_predict(model_info)
                    else:
                        predictions_df = reg_predict(model_info)
                st.subheader("🔮 Model Predictions")
                st.dataframe(predictions_df.head(20), use_container_width=True)
                # Download predictions as timestamped CSV.
                csv = predictions_df.to_csv(index=False)
                st.download_button(
                    label="📥 Download Predictions",
                    data=csv,
                    file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime='text/csv'
                )
            except Exception as e:
                st.error(f"❌ Error generating predictions: {str(e)}")
    else:
        # Classical ML model evaluation: model, split and predictions were
        # stored by the training page.
        model_data = st.session_state.trained_models[selected_model_name]
        model = model_data['model']
        X_test = model_data['X_test']
        y_test = model_data['y_test']
        predictions = model_data['predictions']
        problem_type = model_data['problem_type']
        st.subheader(f"📊 {selected_model_name} Evaluation")
        if problem_type == "Regression":
            # Regression metrics
            mse = mean_squared_error(y_test, predictions)
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)
            rmse = np.sqrt(mse)
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("RMSE", f"{rmse:.4f}")
            with col2:
                st.metric("MAE", f"{mae:.4f}")
            with col3:
                st.metric("R² Score", f"{r2:.4f}")
            with col4:
                st.metric("MSE", f"{mse:.4f}")
            # Residual analysis: residual-vs-predicted scatter and histogram.
            residuals = y_test - predictions
            col1, col2 = st.columns(2)
            with col1:
                # Residual plot; the dashed zero line marks unbiased predictions.
                fig = px.scatter(x=predictions, y=residuals,
                                 labels={'x': 'Predicted', 'y': 'Residuals'},
                                 title='Residual Plot')
                fig.add_hline(y=0, line_dash="dash", line_color="red")
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                # Residual distribution
                fig = px.histogram(residuals, title='Residual Distribution',
                                   labels={'value': 'Residuals', 'count': 'Frequency'})
                st.plotly_chart(fig, use_container_width=True)
        else:
            # Classification metrics (weighted averages to handle multiclass).
            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions, average='weighted')
            recall = recall_score(y_test, predictions, average='weighted')
            f1 = f1_score(y_test, predictions, average='weighted')
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Accuracy", f"{accuracy:.4f}")
            with col2:
                st.metric("Precision", f"{precision:.4f}")
            with col3:
                st.metric("Recall", f"{recall:.4f}")
            with col4:
                st.metric("F1-Score", f"{f1:.4f}")
| elif selected_page == "🔬 Explainability": | |
| st.header("🔬 Model Explainability with SHAP") | |
| if not SHAP_AVAILABLE: | |
| st.warning("⚠️ SHAP is not installed. Explainability features are limited.") | |
| st.stop() | |
| if st.session_state.df is None: | |
| st.warning("⚠️ Please load data first") | |
| st.stop() | |
| # Check for available models | |
| if not st.session_state.trained_models and 'best_model' not in st.session_state: | |
| st.warning("⚠️ No trained models available. Please train a model first.") | |
| st.stop() | |
| # Select model for explanation | |
| available_models = list(st.session_state.trained_models.keys()) | |
| if 'best_model' in st.session_state: | |
| available_models.append("PyCaret Best Model") | |
| selected_model = st.selectbox("🤖 Select model to explain:", available_models) | |
| if selected_model != "PyCaret Best Model": | |
| model_data = st.session_state.trained_models[selected_model] | |
| model = model_data['model'] | |
| features = model_data['features'] | |
| X_test = model_data['X_test'] | |
| # SHAP Explanation | |
| st.subheader("🔬 SHAP Analysis") | |
| try: | |
| # Create SHAP explainer | |
| with st.spinner("Creating SHAP explainer..."): | |
| explainer = shap.Explainer(model, X_test.iloc[:100]) # Use subset for performance | |
| shap_values = explainer(X_test.iloc[:100]) | |
| # Global feature importance | |
| st.subheader("🌍 Global Feature Importance") | |
| fig, ax = plt.subplots() | |
| shap.plots.bar(shap_values, ax=ax, show=False) | |
| st.pyplot(fig) | |
| # Summary plot | |
| st.subheader("📊 Feature Impact Summary") | |
| fig, ax = plt.subplots() | |
| shap.plots.beeswarm(shap_values, ax=ax, show=False) | |
| st.pyplot(fig) | |
| # Individual prediction explanation | |
| st.subheader("🔍 Individual Prediction Explanation") | |
| instance_idx = st.slider("Select instance:", 0, len(X_test)-1, 0) | |
| fig, ax = plt.subplots() | |
| shap.plots.waterfall(shap_values[instance_idx], ax=ax, show=False) | |
| st.pyplot(fig) | |
| # Feature dependence | |
| if len(features) > 1: | |
| st.subheader("📈 Feature Dependence") | |
| feature_for_dependence = st.selectbox("Select feature:", features) | |
| if feature_for_dependence in X_test.columns: | |
| fig, ax = plt.subplots() | |
| shap.plots.scatter(shap_values[:, feature_for_dependence], ax=ax, show=False) | |
| st.pyplot(fig) | |
| except Exception as e: | |
| st.error(f"❌ Error generating SHAP explanations: {str(e)}") | |
| st.info("💡 SHAP works best with tree-based models (Random Forest, XGBoost, etc.)") | |
| elif selected_page == "📋 MLflow Tracking": | |
| st.header("📋 MLflow Experiment Tracking") | |
| if not MLFLOW_AVAILABLE: | |
| st.warning("⚠️ MLflow is not installed. Install it to use experiment tracking.") | |
| st.stop() | |
| # MLflow Configuration | |
| st.subheader("⚙️ MLflow Configuration") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| tracking_uri = st.text_input("🔗 Tracking URI:", "http://localhost:5000") | |
| experiment_name = st.text_input("🧪 Experiment Name:", "super_app_experiments") | |
| with col2: | |
| if st.button("🔧 Set MLflow Configuration"): | |
| try: | |
| mlflow.set_tracking_uri(tracking_uri) | |
| mlflow.set_experiment(experiment_name) | |
| st.success("✅ MLflow configuration set!") | |
| except Exception as e: | |
| st.error(f"❌ Error setting MLflow: {str(e)}") | |
| # Log current models | |
| st.subheader("📊 Log Models to MLflow") | |
| if st.session_state.trained_models: | |
| model_to_log = st.selectbox("Select model to log:", list(st.session_state.trained_models.keys())) | |
| if st.button("📤 Log Model"): | |
| try: | |
| with mlflow.start_run(run_name=f"{model_to_log}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"): | |
| model_data = st.session_state.trained_models[model_to_log] | |
| model = model_data['model'] | |
| # Log model | |
| mlflow.sklearn.log_model(model, "model") | |
| # Log parameters | |
| mlflow.log_param("model_type", model_to_log) | |
| mlflow.log_param("features", model_data['features']) | |
| mlflow.log_param("target", model_data['target']) | |
| # Log metrics (if available) | |
| if 'predictions' in model_data: | |
| y_test = model_data['y_test'] | |
| predictions = model_data['predictions'] | |
| if model_data['problem_type'] == "Regression": | |
| mlflow.log_metric("mse", mean_squared_error(y_test, predictions)) | |
| mlflow.log_metric("mae", mean_absolute_error(y_test, predictions)) | |
| mlflow.log_metric("r2", r2_score(y_test, predictions)) | |
| else: | |
| mlflow.log_metric("accuracy", accuracy_score(y_test, predictions)) | |
| st.success("✅ Model logged to MLflow!") | |
| except Exception as e: | |
| st.error(f"❌ Error logging model: {str(e)}") | |
| # Display recent runs | |
| st.subheader("📈 Recent Experiment Runs") | |
| if st.button("🔄 Refresh Runs"): | |
| try: | |
| runs = mlflow.search_runs(order_by=["start_time desc"]) | |
| if not runs.empty: | |
| st.dataframe(runs[['run_id', 'status', 'start_time', 'params.model_type', | |
| 'metrics.mse', 'metrics.r2', 'metrics.accuracy']], | |
| use_container_width=True) | |
| else: | |
| st.info("📊 No runs found. Start logging some models!") | |
| except Exception as e: | |
| st.error(f"❌ Error fetching runs: {str(e)}") | |
elif selected_page == "🚀 Model Deployment":
    st.header("🚀 Model Deployment & Export")
    # Guard: need at least one model (classical or finalized PyCaret).
    if not st.session_state.trained_models and 'best_model' not in st.session_state:
        st.warning("⚠️ No trained models available for deployment.")
        st.stop()
    # Model selection for deployment
    available_models = list(st.session_state.trained_models.keys())
    if 'best_model' in st.session_state:
        available_models.append("PyCaret Best Model")
    selected_model = st.selectbox("🤖 Select model for deployment:", available_models)
    # Model export options: pickle dump, standalone script, Docker files.
    st.subheader("💾 Export Options")
    col1, col2, col3 = st.columns(3)
    with col1:
        if st.button("📦 Export Model (Pickle)"):
            try:
                # Imported lazily: only needed when the user actually exports.
                import pickle
                if selected_model == "PyCaret Best Model":
                    model_to_export = st.session_state.best_model
                else:
                    model_to_export = st.session_state.trained_models[selected_model]['model']
                # Serialize model to bytes so it can be streamed to the browser.
                model_bytes = pickle.dumps(model_to_export)
                st.download_button(
                    label="📥 Download Model",
                    data=model_bytes,
                    file_name=f"{selected_model.replace(' ', '_')}_model.pkl",
                    mime="application/octet-stream"
                )
                st.success("✅ Model ready for download!")
            except Exception as e:
                st.error(f"❌ Error exporting model: {str(e)}")
    with col2:
        if st.button("📄 Generate Prediction Script"):
            # Generate Python script for predictions. Template is an f-string,
            # so literal braces inside it are doubled ({{ }}).
            script_content = f'''
import pandas as pd
import pickle
import numpy as np
# Load the trained model
def load_model(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model
# Make predictions
def predict(model, input_data):
    """
    Make predictions using the trained model
    Parameters:
    model: Trained model object
    input_data: pandas DataFrame with features
    Returns:
    predictions: numpy array of predictions
    """
    predictions = model.predict(input_data)
    return predictions
# Example usage
if __name__ == "__main__":
    # Load your model
    model = load_model("path_to_your_model.pkl")
    # Create sample input data (replace with your actual data)
    sample_data = pd.DataFrame({{
        # Add your feature columns here
        # 'feature1': [value1],
        # 'feature2': [value2],
    }})
    # Make predictions
    predictions = predict(model, sample_data)
    print("Predictions:", predictions)
'''
            st.download_button(
                label="📥 Download Script",
                data=script_content,
                file_name=f"{selected_model.replace(' ', '_')}_prediction_script.py",
                mime="text/plain"
            )
            st.success("✅ Prediction script ready!")
    with col3:
        if st.button("🐳 Generate Dockerfile"):
            # Static templates (plain strings, no interpolation needed).
            dockerfile_content = '''
FROM python:3.9-slim
WORKDIR /app
# Copy requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy model and script
COPY model.pkl .
COPY app.py .
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "app.py"]
'''
            requirements_content = '''
pandas==1.5.3
scikit-learn==1.3.0
numpy==1.24.3
flask==2.3.2
'''
            col_a, col_b = st.columns(2)
            with col_a:
                st.download_button(
                    label="📥 Download Dockerfile",
                    data=dockerfile_content,
                    file_name="Dockerfile",
                    mime="text/plain"
                )
            with col_b:
                st.download_button(
                    label="📥 Download Requirements",
                    data=requirements_content,
                    file_name="requirements.txt",
                    mime="text/plain"
                )
            st.success("✅ Docker files ready!")
    # Model API endpoint generator: minimal Flask service wrapping the model.
    st.subheader("🌐 API Endpoint Generator")
    if st.button("🔧 Generate Flask API"):
        # f-string template; literal braces are doubled ({{ }}).
        api_code = f'''
from flask import Flask, request, jsonify
import pandas as pd
import pickle
import numpy as np
app = Flask(__name__)
# Load model at startup
model = None
def load_model():
    global model
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from request
        data = request.get_json()
        # Convert to DataFrame
        df = pd.DataFrame([data])
        # Make prediction
        prediction = model.predict(df)
        # Return result
        return jsonify({{
            'prediction': prediction.tolist(),
            'status': 'success'
        }})
    except Exception as e:
        return jsonify({{
            'error': str(e),
            'status': 'error'
        }}), 400
@app.route('/health', methods=['GET'])
def health():
    return jsonify({{'status': 'healthy'}})
if __name__ == '__main__':
    load_model()
    app.run(host='0.0.0.0', port=8000, debug=False)
'''
        st.download_button(
            label="📥 Download Flask API",
            data=api_code,
            file_name="app.py",
            mime="text/plain"
        )
        st.success("✅ Flask API code ready!")
    # Deployment instructions (static markdown).
    st.subheader("📋 Deployment Instructions")
    st.markdown("""
### 🚀 Deployment Steps:
1. **Local Deployment:**
   - Download the model pickle file
   - Download the prediction script or Flask API
   - Install required dependencies: `pip install -r requirements.txt`
   - Run the application: `python app.py`
2. **Docker Deployment:**
   - Download all generated files (Dockerfile, requirements.txt, app.py, model.pkl)
   - Build image: `docker build -t my-ml-app .`
   - Run container: `docker run -p 8000:8000 my-ml-app`
3. **Cloud Deployment:**
   - **AWS**: Upload to EC2 or use ECS with the Docker image
   - **GCP**: Deploy to Google Cloud Run or App Engine
   - **Azure**: Use Azure Container Instances or App Service
   - **Heroku**: Push Docker image to Heroku Container Registry
4. **API Usage Example:**
```bash
curl -X POST http://localhost:8000/predict \\
-H "Content-Type: application/json" \\
-d '{"feature1": 1.0, "feature2": 2.0}'
```
""")
    # Model performance summary (classical models only — PyCaret models keep
    # their metrics on the AutoML page).
    if selected_model != "PyCaret Best Model" and selected_model in st.session_state.trained_models:
        st.subheader("📊 Model Summary for Deployment")
        model_data = st.session_state.trained_models[selected_model]
        col1, col2 = st.columns(2)
        with col1:
            st.write("**Model Details:**")
            st.write(f"- **Type:** {selected_model}")
            st.write(f"- **Problem Type:** {model_data['problem_type']}")
            st.write(f"- **Features:** {len(model_data['features'])}")
            st.write(f"- **Target:** {model_data['target']}")
        with col2:
            if 'predictions' in model_data:
                y_test = model_data['y_test']
                predictions = model_data['predictions']
                st.write("**Performance Metrics:**")
                if model_data['problem_type'] == "Regression":
                    r2 = r2_score(y_test, predictions)
                    mae = mean_absolute_error(y_test, predictions)
                    st.write(f"- **R² Score:** {r2:.4f}")
                    st.write(f"- **MAE:** {mae:.4f}")
                else:
                    accuracy = accuracy_score(y_test, predictions)
                    st.write(f"- **Accuracy:** {accuracy:.4f}")
# ================== FOOTER ==================
st.markdown("---")
stats_col, actions_col, info_col = st.columns(3)
with stats_col:
    # Session counters: dataset shape and number of classical models.
    st.markdown("### 📊 Quick Stats")
    if st.session_state.df is not None:
        st.write(f"Dataset: {st.session_state.df.shape[0]} rows × {st.session_state.df.shape[1]} cols")
        st.write(f"Models Trained: {len(st.session_state.trained_models)}")
with actions_col:
    # One-click reset: drop every session key except auth/demo flags.
    st.markdown("### 🔗 Quick Actions")
    if st.button("🔄 Reset All Data", key="footer_reset"):
        preserved = ['authenticated', 'demo_mode']
        for state_key in list(st.session_state.keys()):
            if state_key not in preserved:
                del st.session_state[state_key]
        st.success("✅ All data reset!")
        st.rerun()
with info_col:
    # Static app identity plus the current session timestamp.
    st.markdown("### ℹ️ App Info")
    st.write("Super Data Science App v2.0")
    st.write(f"Session: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
# ================== SIDEBAR STATUS ==================
# Always-visible summary of session progress and installed libraries.
st.sidebar.markdown("---")
st.sidebar.subheader("📊 Current Status")
# Data status
if st.session_state.df is not None:
    st.sidebar.success(f"✅ Data Loaded ({st.session_state.df.shape[0]} rows)")
else:
    st.sidebar.warning("⚠️ No Data Loaded")
# Models status: classical, PyCaret setup, deep-learning models.
if st.session_state.trained_models:
    st.sidebar.success(f"✅ {len(st.session_state.trained_models)} Classical Models")
else:
    st.sidebar.info("ℹ️ No Classical Models")
if st.session_state.pycaret_setup_done:
    st.sidebar.success("✅ PyCaret Setup Complete")
else:
    st.sidebar.info("ℹ️ PyCaret Not Setup")
if st.session_state.dl_models:
    st.sidebar.success(f"✅ {len(st.session_state.dl_models)} Deep Learning Models")
else:
    st.sidebar.info("ℹ️ No Deep Learning Models")
# Available libraries status (flags set by the import guards at file top).
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Libraries Status")
st.sidebar.write(f"PyCaret: {'✅' if PYCARET_AVAILABLE else '❌'}")
st.sidebar.write(f"PyTorch: {'✅' if TORCH_AVAILABLE else '❌'}")
st.sidebar.write(f"MLflow: {'✅' if MLFLOW_AVAILABLE else '❌'}")
st.sidebar.write(f"SHAP: {'✅' if SHAP_AVAILABLE else '❌'}")
#st.sidebar.write(f"Profiling: {'✅' if PROFILING_AVAILABLE else '❌'}")
# Help section: static quick-start guide in the sidebar.
st.sidebar.markdown("---")
st.sidebar.subheader("❓ Need Help?")
st.sidebar.markdown("""
**Quick Start:**
1. 📊 Load data (sample or upload)
2. 🔍 Explore with EDA
3. 🤖 Train models (Classical or AutoML)
4. 🎯 Evaluate performance
5. 🚀 Deploy your model
**Tips:**
- Use sample data for quick testing
- PyCaret AutoML for best results
- Export models for production use
""")
# Advanced features hint (shown on demand to keep the sidebar compact).
if st.sidebar.button("🎯 Show Advanced Tips"):
    st.sidebar.info("""
    **Advanced Features:**
    - Feature engineering in EDA
    - Hyperparameter tuning in Classical ML
    - Cross-validation in PyCaret
    - SHAP explanations for interpretability
    - MLflow for experiment tracking
    - Docker deployment ready
    """)
# Debug mode for development: list session-state keys when enabled.
if st.sidebar.checkbox("🐛 Debug Mode", key="debug_mode"):
    st.sidebar.subheader("🔧 Debug Info")
    st.sidebar.write("Session State Keys:")
    # Skip Streamlit-internal keys (underscore-prefixed).
    for key in st.session_state.keys():
        if not key.startswith('_'):
            st.sidebar.write(f"- {key}")
# Performance optimization note
st.sidebar.markdown("---")
st.sidebar.caption("💡 For large datasets, consider using data sampling for faster processing")
st.sidebar.caption(f"⏰ Last updated: {datetime.now().strftime('%H:%M:%S')}")
# Auto-refresh data (for development)
if st.sidebar.button("🔄 Auto Refresh", key="auto_refresh"):
    st.rerun()
# Export session state: a lightweight text summary only (no models or
# dataframes), so the download stays tiny.
if st.sidebar.button("💾 Export Session", key="export_session"):
    session_data = {
        'trained_models_count': len(st.session_state.trained_models),
        'data_loaded': st.session_state.df is not None,
        'pycaret_setup': st.session_state.pycaret_setup_done,
        'timestamp': datetime.now().isoformat()
    }
    st.sidebar.download_button(
        label="📥 Download Session Info",
        data=str(session_data),
        file_name=f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
        mime="text/plain"
    )
# Success message for completion: shown once data, classical models, and
# PyCaret setup are all in place.
# NOTE(review): balloons() fires on every rerun while the condition holds.
if (st.session_state.df is not None and
    st.session_state.trained_models and
    st.session_state.pycaret_setup_done):
    st.sidebar.success("🎉 Full Pipeline Complete!")
    st.sidebar.balloons()
# Warning for missing dependencies, with a ready-to-copy install command.
missing_deps = []
if not PYCARET_AVAILABLE:
    missing_deps.append("pycaret")
if not MLFLOW_AVAILABLE:
    missing_deps.append("mlflow")
if not SHAP_AVAILABLE:
    missing_deps.append("shap")
#if not PROFILING_AVAILABLE:
#    missing_deps.append("ydata-profiling")
if missing_deps:
    st.sidebar.warning(f"⚠️ Missing: {', '.join(missing_deps)}")
    st.sidebar.code(f"pip install {' '.join(missing_deps)}")
# Fun facts
fun_facts = [
    "🧠 Machine Learning can predict with 95%+ accuracy in many domains",
    "🚀 AutoML can save 80% of model development time",
    "📊 Feature engineering often provides the biggest performance boost",
    "🔬 Model explainability is crucial for production deployment",
    "⚡ Ensemble methods usually outperform single models",
    "📈 Cross-validation prevents overfitting better than simple train/test split"
]
# NOTE(review): mid-module import — conventionally belongs at the top of the file.
import random
if st.sidebar.button("💡 Random ML Tip", key="random_tip"):
    st.sidebar.info(random.choice(fun_facts))
# Resource links
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Resources")
st.sidebar.markdown("""
- [PyCaret Documentation](https://pycaret.org/)
- [MLflow Documentation](https://mlflow.org/)
- [SHAP Tutorials](https://shap.readthedocs.io/)
- [Scikit-learn Guide](https://scikit-learn.org/)
""")
# Version info and credits
st.sidebar.markdown("---")
st.sidebar.caption("🚀 Super Data Science App")
st.sidebar.caption("Version 2.0 - Full Pipeline")
st.sidebar.caption("Built with Streamlit ❤️")