"""Super Data Science App — module setup.

Imports, optional-dependency detection, Streamlit page configuration, CSS and
the page header.  ``st.set_page_config`` must be the FIRST Streamlit command
executed in a script run, so it is moved ahead of the optional-dependency
probes (which may call ``st.warning``).  The duplicated scikit-learn imports
have been consolidated into a single block.
"""

import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st

# Scikit-learn (consolidated — these imports previously appeared twice)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
)

warnings.filterwarnings('ignore')

# ================== PAGE CONFIG ==================
# Must precede every other st.* call (the optional-dependency st.warning calls
# below used to run before this, which Streamlit rejects at runtime).
st.set_page_config(
    page_title="๐Ÿš€ Super Data Science App",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="๐Ÿš€",
)

# MLflow and experiment tracking (optional)
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False
    st.warning("MLflow not installed. Some features may be limited.")

# PyCaret AutoML (optional)
try:
    from pycaret.classification import setup as cls_setup, compare_models as cls_compare, create_model as cls_create
    from pycaret.classification import tune_model as cls_tune, finalize_model as cls_finalize, predict_model as cls_predict
    from pycaret.classification import pull as cls_pull, plot_model as cls_plot, evaluate_model as cls_evaluate
    from pycaret.regression import setup as reg_setup, compare_models as reg_compare, create_model as reg_create
    from pycaret.regression import tune_model as reg_tune, finalize_model as reg_finalize, predict_model as reg_predict
    from pycaret.regression import pull as reg_pull, plot_model as reg_plot, evaluate_model as reg_evaluate
    PYCARET_AVAILABLE = True
except ImportError:
    PYCARET_AVAILABLE = False
    st.warning("PyCaret not installed. AutoML features will be limited.")

# Data profiling (kept disabled, as in the original)
# try:
#     from ydata_profiling import ProfileReport
#     from streamlit_pandas_profiling import st_profile_report
#     PROFILING_AVAILABLE = True
# except ImportError:
#     PROFILING_AVAILABLE = False

# PyTorch for deep learning (optional)
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# SHAP for explainability (optional)
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

# ================== CUSTOM CSS & STYLING ==================
st.markdown(""" """, unsafe_allow_html=True)

# ================== HEADER ==================
st.markdown("""
๐Ÿš€ Super Data Science App

Complete ML Pipeline: EDA โ†’ Modeling โ†’ AutoML โ†’ Explainability โ†’ Deployment

""", unsafe_allow_html=True)
""", unsafe_allow_html=True) # ================== AUTHENTICATION ================== def check_authentication(): if 'authenticated' not in st.session_state: st.session_state.authenticated = False if not st.session_state.authenticated: with st.sidebar: st.header("๐Ÿ”’ Authentication") password = st.text_input("Enter Password", type="password", key="auth_password") col1, col2 = st.columns(2) with col1: if st.button("๐Ÿ”‘ Login", key="login_btn"): if password == "ds4everyone": st.session_state.authenticated = True st.success("โœ… Access Granted!") st.rerun() else: st.error("โŒ Incorrect Password") with col2: if st.button("๐Ÿ‘ค Demo Mode", key="demo_btn"): st.session_state.authenticated = True st.session_state.demo_mode = True st.info("๐Ÿ“Š Demo Mode Activated") st.rerun() st.info("๐Ÿ” Please authenticate to access the application") st.stop() check_authentication() # ================== SESSION STATE INITIALIZATION ================== if 'df' not in st.session_state: st.session_state.df = None if 'trained_models' not in st.session_state: st.session_state.trained_models = {} if 'pycaret_setup_done' not in st.session_state: st.session_state.pycaret_setup_done = False if 'best_model' not in st.session_state: st.session_state.best_model = None if 'dl_models' not in st.session_state: st.session_state.dl_models = {} if 'training_history' not in st.session_state: st.session_state.training_history = {} # ================== SIDEBAR NAVIGATION ================== st.sidebar.title("๐Ÿงญ Navigation") pages = [ "๐Ÿ  Home", "๐Ÿ“Š Data Loading", "๐Ÿ” EDA & Profiling", "๐Ÿ“ˆ Visualization", "๐Ÿค– Classical ML", "โšก PyCaret AutoML", "๐Ÿง  Deep Learning", "๐ŸŽฏ Model Evaluation", "๐Ÿ”ฌ Explainability", "๐Ÿ“‹ MLflow Tracking", "๐Ÿš€ Model Deployment" ] selected_page = st.sidebar.selectbox("Select Page", pages, key="page_selector") # ================== UTILITY FUNCTIONS ================== def load_sample_data(dataset_name): """Load sample datasets""" if dataset_name == "California 
Housing": from sklearn.datasets import fetch_california_housing data = fetch_california_housing(as_frame=True) df = pd.concat([data.data, data.target.rename('MedHouseVal')], axis=1) return df.sample(n=min(2000, len(df))) # Limit for performance elif dataset_name == "Iris": from sklearn.datasets import load_iris data = load_iris(as_frame=True) df = pd.concat([data.data, data.target.rename('species')], axis=1) return df elif dataset_name == "Wine Quality": url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" try: df = pd.read_csv(url, sep=';') return df.sample(n=min(1000, len(df))) except: st.error("Could not load Wine Quality dataset") return None elif dataset_name == "Titanic": url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv" try: df = pd.read_csv(url) return df except: st.error("Could not load Titanic dataset") return None def get_dataset_info(df): """Get comprehensive dataset information""" info = { 'shape': df.shape, 'columns': df.columns.tolist(), 'dtypes': df.dtypes.to_dict(), 'missing_values': df.isnull().sum().to_dict(), 'memory_usage': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB", 'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(), 'categorical_columns': df.select_dtypes(exclude=[np.number]).columns.tolist() } return info # ================== PAGE CONTENT ================== if selected_page == "๐Ÿ  Home": col1, col2, col3 = st.columns([1, 2, 1]) with col2: st.markdown(""" ## Welcome to the Super Data Science App! 
๐ŸŽ‰ This comprehensive application provides a complete machine learning pipeline: """) features = [ "๐Ÿ“Š **Data Loading**: Upload CSV or use sample datasets", "๐Ÿ” **EDA & Profiling**: Automated data profiling and exploration", "๐Ÿ“ˆ **Visualization**: Interactive charts with Plotly and Seaborn", "๐Ÿค– **Classical ML**: Scikit-learn models with hyperparameter tuning", "โšก **PyCaret AutoML**: Automated machine learning with model comparison", "๐ŸŽฏ **Model Evaluation**: Comprehensive model performance analysis", "๐Ÿ”ฌ **Explainability**: SHAP values and feature importance", "๐Ÿ“‹ **MLflow Tracking**: Experiment tracking and model versioning", "๐Ÿš€ **Model Deployment**: Model export and deployment preparation" ] for feature in features: st.markdown(feature) st.markdown("---") # Quick stats if st.session_state.df is not None: col_a, col_b, col_c, col_d = st.columns(4) with col_a: st.metric("๐Ÿ“Š Rows", f"{st.session_state.df.shape[0]:,}") with col_b: st.metric("๐Ÿ“‹ Columns", f"{st.session_state.df.shape[1]:,}") with col_c: st.metric("๐Ÿค– Models Trained", len(st.session_state.trained_models)) with col_d: st.metric("โœ… Setup Complete", "Ready" if st.session_state.pycaret_setup_done else "Pending") elif selected_page == "๐Ÿ“Š Data Loading": st.header("๐Ÿ“Š Data Loading & Management") col1, col2 = st.columns([1, 2]) with col1: st.subheader("Data Source") data_source = st.radio( "Choose data source:", ["๐Ÿ“ Upload CSV", "๐ŸŽฒ Sample Datasets", "๐Ÿ“‹ Current Data Info"] ) with col2: if data_source == "๐Ÿ“ Upload CSV": uploaded_file = st.file_uploader("Choose a CSV file", type="csv") if uploaded_file is not None: try: df = pd.read_csv(uploaded_file) st.session_state.df = df st.success(f"โœ… Successfully loaded {df.shape[0]} rows and {df.shape[1]} columns") except Exception as e: st.error(f"โŒ Error loading file: {str(e)}") if 'model_comparison' not in st.session_state: st.session_state.model_comparison = None #st.error(f"โŒ Error loading file: {str(e)}") elif 
data_source == "๐ŸŽฒ Sample Datasets": sample_options = ["California Housing", "Iris", "Wine Quality", "Titanic"] selected_sample = st.selectbox("Choose sample dataset:", sample_options) if st.button(f"๐Ÿ”„ Load {selected_sample} Dataset"): with st.spinner(f"Loading {selected_sample}..."): df = load_sample_data(selected_sample) if df is not None: st.session_state.df = df st.success(f"โœ… Loaded {selected_sample} dataset!") elif data_source == "๐Ÿ“‹ Current Data Info": if st.session_state.df is not None: info = get_dataset_info(st.session_state.df) col_a, col_b = st.columns(2) with col_a: st.metric("๐Ÿ“Š Rows", f"{info['shape'][0]:,}") st.metric("๐Ÿ“‹ Columns", f"{info['shape'][1]:,}") st.metric("๐Ÿ’พ Memory Usage", info['memory_usage']) with col_b: st.metric("๐Ÿ”ข Numeric Columns", len(info['numeric_columns'])) st.metric("๐Ÿ“ Categorical Columns", len(info['categorical_columns'])) st.metric("โŒ Missing Values", sum(info['missing_values'].values())) else: st.info("๐Ÿ” No data loaded yet") # Data Preview if st.session_state.df is not None: st.subheader("๐Ÿ“‹ Data Preview") col1, col2, col3 = st.columns(3) with col1: show_rows = st.slider("Rows to display", 5, 50, 10) with col2: show_info = st.checkbox("Show column info", value=True) with col3: if st.button("๐Ÿ’พ Download Current Data"): csv = st.session_state.df.to_csv(index=False) st.download_button( label="๐Ÿ“ฅ Download CSV", data=csv, file_name=f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", mime='text/csv' ) # Display data st.dataframe(st.session_state.df.head(show_rows), use_container_width=True) if show_info: st.subheader("๐Ÿ“Š Column Information") info_df = pd.DataFrame({ 'Column': st.session_state.df.columns, 'Data Type': st.session_state.df.dtypes, 'Non-Null Count': st.session_state.df.count(), 'Missing Values': st.session_state.df.isnull().sum(), 'Missing %': (st.session_state.df.isnull().sum() / len(st.session_state.df) * 100).round(2) }) st.dataframe(info_df, use_container_width=True) 
elif selected_page == "๐Ÿ” EDA & Profiling": st.header("๐Ÿ” Exploratory Data Analysis & Profiling") if st.session_state.df is None: st.warning("โš ๏ธ Please load data first from the Data Loading page") st.stop() df = st.session_state.df # Quick EDA st.subheader("๐Ÿ“Š Quick Statistics") col1, col2, col3, col4 = st.columns(4) with col1: st.metric("๐Ÿ“ Dataset Shape", f"{df.shape[0]} ร— {df.shape[1]}") with col2: st.metric("๐Ÿ”ข Numeric Columns", len(df.select_dtypes(include=[np.number]).columns)) with col3: st.metric("๐Ÿ“ Text Columns", len(df.select_dtypes(exclude=[np.number]).columns)) with col4: st.metric("โŒ Missing Values", df.isnull().sum().sum()) # Missing Values Analysis st.subheader("โŒ Missing Values Analysis") missing_df = pd.DataFrame({ 'Column': df.columns, 'Missing Count': df.isnull().sum(), 'Missing Percentage': (df.isnull().sum() / len(df) * 100).round(2) }).sort_values('Missing Count', ascending=False) missing_df = missing_df[missing_df['Missing Count'] > 0] if len(missing_df) > 0: st.dataframe(missing_df, use_container_width=True) # Missing values heatmap fig, ax = plt.subplots(figsize=(12, 8)) sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis') plt.title('Missing Values Heatmap') st.pyplot(fig) else: st.success("โœ… No missing values found in the dataset!") # Statistical Summary st.subheader("๐Ÿ“ˆ Statistical Summary") numeric_cols = df.select_dtypes(include=[np.number]).columns if len(numeric_cols) > 0: st.dataframe(df[numeric_cols].describe(), use_container_width=True) # Distribution plots st.subheader("๐Ÿ“Š Distribution Analysis") selected_cols = st.multiselect("Select columns for distribution analysis:", numeric_cols, default=numeric_cols[:3]) if selected_cols: cols_per_row = 2 n_rows = (len(selected_cols) + cols_per_row - 1) // cols_per_row fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 5*n_rows)) if n_rows == 1: axes = [axes] if cols_per_row == 1 else axes else: axes = axes.flatten() for i, col in 
enumerate(selected_cols): sns.histplot(data=df, x=col, kde=True, ax=axes[i]) axes[i].set_title(f'Distribution of {col}') # Hide empty subplots for i in range(len(selected_cols), len(axes)): axes[i].set_visible(False) plt.tight_layout() st.pyplot(fig) # Correlation Analysis if len(numeric_cols) > 1: st.subheader("๐Ÿ”— Correlation Analysis") corr_matrix = df[numeric_cols].corr() fig, ax = plt.subplots(figsize=(12, 10)) sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f', ax=ax) plt.title('Correlation Matrix') st.pyplot(fig) # Automated Profiling Report # if PROFILING_AVAILABLE: # st.subheader("๐Ÿ“‹ Automated Profiling Report") # if st.button("๐Ÿ”„ Generate Comprehensive Profile Report"): # with st.spinner("Generating detailed profiling report..."): # profile = ProfileReport(df, title="Dataset Profiling Report", explorative=True) # st_profile_report(profile) elif selected_page == "๐Ÿ“ˆ Visualization": st.header("๐Ÿ“ˆ Interactive Data Visualization") if st.session_state.df is None: st.warning("โš ๏ธ Please load data first from the Data Loading page") st.stop() df = st.session_state.df numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist() # Visualization controls st.subheader("๐ŸŽ›๏ธ Visualization Controls") col1, col2, col3 = st.columns(3) with col1: viz_type = st.selectbox("Select visualization type:", [ "๐Ÿ“Š Histogram", "๐Ÿ“ˆ Scatter Plot", "๐Ÿ“ฆ Box Plot", "๐Ÿ”ฅ Heatmap", "๐Ÿ“‰ Line Plot", "๐ŸŽฏ Pair Plot" ]) with col2: if viz_type in ["๐Ÿ“ˆ Scatter Plot", "๐Ÿ“‰ Line Plot"]: x_col = st.selectbox("X-axis:", numeric_cols + categorical_cols) y_col = st.selectbox("Y-axis:", numeric_cols) else: selected_col = st.selectbox("Select column:", numeric_cols if viz_type != "๐Ÿ“ฆ Box Plot" else df.columns) with col3: if categorical_cols and viz_type in ["๐Ÿ“Š Histogram", "๐Ÿ“ˆ Scatter Plot", "๐Ÿ“ฆ Box Plot"]: color_col = st.selectbox("Color by 
(optional):", ["None"] + categorical_cols) color_col = None if color_col == "None" else color_col else: color_col = None # Generate visualizations st.subheader("๐Ÿ“Š Visualization Output") try: if viz_type == "๐Ÿ“Š Histogram": fig = px.histogram(df, x=selected_col, color=color_col, title=f'Distribution of {selected_col}', marginal="box") st.plotly_chart(fig, use_container_width=True) elif viz_type == "๐Ÿ“ˆ Scatter Plot": fig = px.scatter(df, x=x_col, y=y_col, color=color_col, title=f'{y_col} vs {x_col}', trendline="ols" if color_col is None else None) st.plotly_chart(fig, use_container_width=True) elif viz_type == "๐Ÿ“ฆ Box Plot": if color_col: fig = px.box(df, y=selected_col, x=color_col, title=f'Box Plot of {selected_col} by {color_col}') else: fig = px.box(df, y=selected_col, title=f'Box Plot of {selected_col}') st.plotly_chart(fig, use_container_width=True) elif viz_type == "๐Ÿ”ฅ Heatmap": if len(numeric_cols) > 1: corr_matrix = df[numeric_cols].corr() fig = px.imshow(corr_matrix, text_auto=True, aspect="auto", title="Correlation Heatmap") st.plotly_chart(fig, use_container_width=True) else: st.warning("Need at least 2 numeric columns for correlation heatmap") elif viz_type == "๐Ÿ“‰ Line Plot": fig = px.line(df.sort_values(x_col), x=x_col, y=y_col, title=f'{y_col} vs {x_col} (Line Plot)') st.plotly_chart(fig, use_container_width=True) elif viz_type == "๐ŸŽฏ Pair Plot": if len(numeric_cols) >= 2: selected_numeric = st.multiselect("Select numeric columns for pair plot:", numeric_cols, default=numeric_cols[:4]) if len(selected_numeric) >= 2: fig = px.scatter_matrix(df, dimensions=selected_numeric, color=color_col, title="Pair Plot Matrix") st.plotly_chart(fig, use_container_width=True) else: st.warning("Please select at least 2 numeric columns") else: st.warning("Need at least 2 numeric columns for pair plot") except Exception as e: st.error(f"Error generating visualization: {str(e)}") # Additional visualizations st.subheader("๐Ÿ“Š Additional Insights") # Value 
counts for categorical columns if categorical_cols: st.write("**Categorical Column Distributions:**") for col in categorical_cols[:3]: # Limit to first 3 if df[col].nunique() <= 20: # Only show if not too many categories fig = px.bar(df[col].value_counts().head(10), title=f'Top 10 values in {col}') st.plotly_chart(fig, use_container_width=True) elif selected_page == "๐Ÿค– Classical ML": st.header("๐Ÿค– Classical Machine Learning") if st.session_state.df is None: st.warning("โš ๏ธ Please load data first from the Data Loading page") st.stop() df = st.session_state.df # Model configuration st.subheader("โš™๏ธ Model Configuration") col1, col2 = st.columns(2) with col1: # Target selection target_col = st.selectbox("๐ŸŽฏ Select target variable:", df.columns) # Feature selection available_features = [col for col in df.columns if col != target_col] selected_features = st.multiselect("๐Ÿ“Š Select features:", available_features, default=available_features[:5]) with col2: # Problem type detection if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10: problem_type = "Classification" st.info("๐ŸŽฏ Detected: Classification Problem") model_options = ["Logistic Regression", "Decision Tree", "Random Forest"] else: problem_type = "Regression" st.info("๐Ÿ“ˆ Detected: Regression Problem") model_options = ["Linear Regression", "Decision Tree", "Random Forest"] selected_model = st.selectbox("๐Ÿค– Select model:", model_options) test_size = st.slider("๐Ÿ”„ Test set size:", 0.1, 0.5, 0.2, 0.05) if not selected_features: st.warning("โš ๏ธ Please select at least one feature") st.stop() # Data preprocessing if st.button("๐Ÿš€ Train Model"): with st.spinner("Training model..."): try: # Prepare data X = df[selected_features].copy() y = df[target_col].copy() # Handle missing values X = X.fillna(X.mean() if X.select_dtypes(include=[np.number]).shape[1] > 0 else X.mode().iloc[0]) # Encode categorical variables le_dict = {} for col in 
X.select_dtypes(include=['object']).columns: le = LabelEncoder() X[col] = le.fit_transform(X[col].astype(str)) le_dict[col] = le # Encode target if classification if problem_type == "Classification" and y.dtype == 'object': target_le = LabelEncoder() y = target_le.fit_transform(y) # Split data X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_size, random_state=42 ) # Train model if selected_model == "Linear Regression": model = LinearRegression() elif selected_model == "Logistic Regression": model = LogisticRegression(random_state=42, max_iter=1000) elif selected_model == "Decision Tree": if problem_type == "Classification": model = DecisionTreeClassifier(random_state=42) else: model = DecisionTreeRegressor(random_state=42) elif selected_model == "Random Forest": if problem_type == "Classification": model = RandomForestClassifier(random_state=42, n_estimators=100) else: model = RandomForestRegressor(random_state=42, n_estimators=100) model.fit(X_train, y_train) predictions = model.predict(X_test) # Store model st.session_state.trained_models[selected_model] = { 'model': model, 'X_test': X_test, 'y_test': y_test, 'predictions': predictions, 'features': selected_features, 'target': target_col, 'problem_type': problem_type } st.success("โœ… Model trained successfully!") # Display results st.subheader("๐Ÿ“Š Model Performance") if problem_type == "Regression": mse = mean_squared_error(y_test, predictions) mae = mean_absolute_error(y_test, predictions) r2 = r2_score(y_test, predictions) col1, col2, col3 = st.columns(3) with col1: st.metric("MSE", f"{mse:.4f}") with col2: st.metric("MAE", f"{mae:.4f}") with col3: st.metric("Rยฒ Score", f"{r2:.4f}") # Actual vs Predicted plot fig = px.scatter(x=y_test, y=predictions, labels={'x': 'Actual', 'y': 'Predicted'}, title='Actual vs Predicted Values') fig.add_shape(type="line", x0=y_test.min(), y0=y_test.min(), x1=y_test.max(), y1=y_test.max(), line=dict(color="red", dash="dash")) st.plotly_chart(fig, 
use_container_width=True) else: # Classification accuracy = accuracy_score(y_test, predictions) precision = precision_score(y_test, predictions, average='weighted') recall = recall_score(y_test, predictions, average='weighted') f1 = f1_score(y_test, predictions, average='weighted') col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Accuracy", f"{accuracy:.4f}") with col2: st.metric("Precision", f"{precision:.4f}") with col3: st.metric("Recall", f"{recall:.4f}") with col4: st.metric("F1-Score", f"{f1:.4f}") # Confusion Matrix cm = confusion_matrix(y_test, predictions) fig, ax = plt.subplots(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax) ax.set_title('Confusion Matrix') ax.set_xlabel('Predicted') ax.set_ylabel('Actual') st.pyplot(fig) # Feature importance (for tree-based models) if hasattr(model, 'feature_importances_'): st.subheader("๐Ÿ“Š Feature Importance") importance_df = pd.DataFrame({ 'Feature': selected_features, 'Importance': model.feature_importances_ }).sort_values('Importance', ascending=False) fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h', title='Feature Importance') st.plotly_chart(fig, use_container_width=True) except Exception as e: st.error(f"โŒ Error training model: {str(e)}") elif selected_page == "โšก PyCaret AutoML": st.header("โšก PyCaret AutoML") if not PYCARET_AVAILABLE: st.error("โŒ PyCaret is not installed. 
Please install it to use AutoML features.") st.stop() if st.session_state.df is None: st.warning("โš ๏ธ Please load data first from the Data Loading page") st.stop() df = st.session_state.df # AutoML Configuration st.subheader("โš™๏ธ AutoML Configuration") col1, col2 = st.columns(2) with col1: target_col = st.selectbox("๐ŸŽฏ Select target variable:", df.columns, key="pycaret_target") # Auto-detect problem type if df[target_col].dtype in ['object', 'bool'] or df[target_col].nunique() < 10: problem_type = "classification" st.info("๐ŸŽฏ Detected: Classification Problem") else: problem_type = "regression" st.info("๐Ÿ“ˆ Detected: Regression Problem") with col2: train_size = st.slider("๐Ÿ”„ Training set size:", 0.5, 0.9, 0.8, 0.05) sample_size = st.slider("๐Ÿ“Š Sample size (for performance):", 500, min(5000, len(df)), min(2000, len(df))) if len(df) > sample_size: df_sample = df.sample(n=sample_size, random_state=42) st.info(f"๐Ÿ“Š Using {sample_size} samples for faster processing") else: df_sample = df.copy() # Advanced settings with st.expander("๐Ÿ”ง Advanced Settings"): col1, col2 = st.columns(2) with col1: cross_validation = st.checkbox("๐Ÿ”„ Cross Validation", value=True) normalize = st.checkbox("๐Ÿ“ Normalize Features", value=True) with col2: remove_outliers = st.checkbox("๐Ÿšซ Remove Outliers", value=False) feature_selection = st.checkbox("๐ŸŽฏ Feature Selection", value=False) # Setup PyCaret Environment if st.button("๐Ÿš€ Setup PyCaret Environment"): with st.spinner("Setting up PyCaret environment..."): try: if problem_type == "classification": st.session_state.pycaret_exp = cls_setup( data=df_sample, target=target_col, train_size=train_size, session_id=42, normalize=normalize, remove_outliers=remove_outliers, feature_selection=feature_selection, silent=True ) else: st.session_state.pycaret_exp = reg_setup( data=df_sample, target=target_col, train_size=train_size, session_id=42, normalize=normalize, remove_outliers=remove_outliers, 
feature_selection=feature_selection, silent=True ) st.session_state.pycaret_setup_done = True st.session_state.pycaret_problem_type = problem_type st.success("โœ… PyCaret environment setup complete!") except Exception as e: st.error(f"โŒ Error setting up PyCaret: {str(e)}") # Model Comparison if st.session_state.pycaret_setup_done: st.subheader("๐Ÿ“Š Model Comparison") if st.button("๐Ÿ”„ Compare Models"): with st.spinner("Comparing multiple models..."): try: if st.session_state.pycaret_problem_type == "classification": comparison_df = cls_compare( include=['lr', 'rf', 'et', 'nb', 'dt', 'svm'], sort='Accuracy', n_select=5 ) st.session_state.model_comparison = cls_pull() else: comparison_df = reg_compare( include=['lr', 'rf', 'et', 'dt', 'huber'], sort='R2', n_select=5 ) st.session_state.model_comparison = reg_pull() st.success("โœ… Model comparison complete!") except Exception as e: st.error(f"โŒ Error comparing models: {str(e)}") # Display comparison results if st.session_state.model_comparison is not None: st.subheader("๐Ÿ“ˆ Model Comparison Results") st.dataframe(st.session_state.model_comparison, use_container_width=True) # Select best model best_model_name = st.selectbox( "๐Ÿ† Select model for tuning:", ['lr', 'rf', 'et', 'dt', 'nb', 'svm'] if st.session_state.pycaret_problem_type == "classification" else ['lr', 'rf', 'et', 'dt', 'huber'] ) # Create and tune model col1, col2 = st.columns(2) with col1: if st.button("๐ŸŽฏ Create Model"): with st.spinner("Creating model..."): try: if st.session_state.pycaret_problem_type == "classification": model = cls_create(best_model_name) else: model = reg_create(best_model_name) st.session_state.pycaret_model = model st.success("โœ… Model created successfully!") except Exception as e: st.error(f"โŒ Error creating model: {str(e)}") with col2: if st.button("โšก Tune Hyperparameters"): if 'pycaret_model' in st.session_state: with st.spinner("Tuning hyperparameters..."): try: if st.session_state.pycaret_problem_type == 
"classification": tuned_model = cls_tune(st.session_state.pycaret_model, optimize='Accuracy', n_iter=10) else: tuned_model = reg_tune(st.session_state.pycaret_model, optimize='R2', n_iter=10) st.session_state.tuned_model = tuned_model st.success("โœ… Hyperparameter tuning complete!") except Exception as e: st.error(f"โŒ Error tuning model: {str(e)}") else: st.warning("โš ๏ธ Please create a model first") # Finalize model if st.button("๐Ÿ Finalize Best Model"): if 'tuned_model' in st.session_state: model_to_finalize = st.session_state.tuned_model elif 'pycaret_model' in st.session_state: model_to_finalize = st.session_state.pycaret_model else: st.warning("โš ๏ธ Please create a model first") model_to_finalize = None if model_to_finalize is not None: with st.spinner("Finalizing model..."): try: if st.session_state.pycaret_problem_type == "classification": final_model = cls_finalize(model_to_finalize) else: final_model = reg_finalize(model_to_finalize) st.session_state.best_model = final_model st.success("โœ… Model finalized successfully!") except Exception as e: st.error(f"โŒ Error finalizing model: {str(e)}") elif selected_page == "๐ŸŽฏ Model Evaluation": st.header("๐ŸŽฏ Advanced Model Evaluation") if st.session_state.df is None: st.warning("โš ๏ธ Please load data first") st.stop() # Check for available models available_models = [] if st.session_state.trained_models: available_models.extend(list(st.session_state.trained_models.keys())) if 'best_model' in st.session_state and st.session_state.best_model is not None: available_models.append("PyCaret Best Model") if not available_models: st.warning("โš ๏ธ No trained models available. 
Please train a model first.") st.stop() selected_model_name = st.selectbox("๐Ÿ“Š Select model to evaluate:", available_models) if selected_model_name == "PyCaret Best Model": if 'best_model' not in st.session_state: st.error("โŒ PyCaret model not available") st.stop() model_info = st.session_state.best_model problem_type = st.session_state.get('pycaret_problem_type', 'regression') st.subheader("๐Ÿ“ˆ PyCaret Model Evaluation") # PyCaret built-in plots if PYCARET_AVAILABLE: col1, col2 = st.columns(2) with col1: plot_types_cls = ['auc', 'confusion_matrix', 'class_report', 'pr', 'feature'] plot_types_reg = ['residuals', 'feature', 'rfe', 'learning', 'vc'] plot_types = plot_types_cls if problem_type == "classification" else plot_types_reg selected_plot = st.selectbox("๐Ÿ“Š Select evaluation plot:", plot_types) with col2: if st.button("๐Ÿ“Š Generate Plot"): try: with st.spinner("Generating plot..."): if problem_type == "classification": cls_plot(model_info, plot=selected_plot, display_format='streamlit') else: reg_plot(model_info, plot=selected_plot, display_format='streamlit') except Exception as e: st.error(f"โŒ Error generating plot: {str(e)}") # Model predictions if st.button("๐Ÿ”ฎ Generate Predictions"): try: with st.spinner("Generating predictions..."): if problem_type == "classification": predictions_df = cls_predict(model_info) else: predictions_df = reg_predict(model_info) st.subheader("๐Ÿ”ฎ Model Predictions") st.dataframe(predictions_df.head(20), use_container_width=True) # Download predictions csv = predictions_df.to_csv(index=False) st.download_button( label="๐Ÿ“ฅ Download Predictions", data=csv, file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", mime='text/csv' ) except Exception as e: st.error(f"โŒ Error generating predictions: {str(e)}") else: # Classical ML model evaluation model_data = st.session_state.trained_models[selected_model_name] model = model_data['model'] X_test = model_data['X_test'] y_test = model_data['y_test'] 
# NOTE(review): this chunk begins inside the "Model Evaluation" page branch —
# the enclosing `elif selected_page == ...:` header and the definitions of
# `model_data`, `selected_model_name`, and `y_test` are above this window.
# Indentation below is reconstructed at one level under that branch; confirm
# against the full file.
predictions = model_data['predictions']
problem_type = model_data['problem_type']

st.subheader(f"📊 {selected_model_name} Evaluation")

if problem_type == "Regression":
    # Regression metrics: standard error measures on the held-out test split.
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mse)

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("RMSE", f"{rmse:.4f}")
    with col2:
        st.metric("MAE", f"{mae:.4f}")
    with col3:
        st.metric("R² Score", f"{r2:.4f}")
    with col4:
        st.metric("MSE", f"{mse:.4f}")

    # Residual analysis: scatter of residuals vs. predictions plus a histogram
    # of the residual distribution, side by side.
    residuals = y_test - predictions
    col1, col2 = st.columns(2)
    with col1:
        # Residual plot — a horizontal zero line marks the "perfect fit" level.
        fig = px.scatter(x=predictions, y=residuals,
                         labels={'x': 'Predicted', 'y': 'Residuals'},
                         title='Residual Plot')
        fig.add_hline(y=0, line_dash="dash", line_color="red")
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        # Residual distribution
        fig = px.histogram(residuals, title='Residual Distribution',
                           labels={'value': 'Residuals', 'count': 'Frequency'})
        st.plotly_chart(fig, use_container_width=True)
else:
    # Classification metrics — weighted averaging so multi-class targets work.
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Accuracy", f"{accuracy:.4f}")
    with col2:
        st.metric("Precision", f"{precision:.4f}")
    with col3:
        st.metric("Recall", f"{recall:.4f}")
    with col4:
        st.metric("F1-Score", f"{f1:.4f}")

# ---------- Page: SHAP-based model explainability ----------
elif selected_page == "🔬 Explainability":
    st.header("🔬 Model Explainability with SHAP")

    # NOTE(review): the message says features are "limited" but st.stop()
    # halts the page entirely — wording and behavior disagree; confirm intent.
    if not SHAP_AVAILABLE:
        st.warning("⚠️ SHAP is not installed. Explainability features are limited.")
        st.stop()

    if st.session_state.df is None:
        st.warning("⚠️ Please load data first")
        st.stop()

    # Check for available models (classical registry or a PyCaret best model).
    if not st.session_state.trained_models and 'best_model' not in st.session_state:
        st.warning("⚠️ No trained models available. Please train a model first.")
        st.stop()

    # Select model for explanation
    available_models = list(st.session_state.trained_models.keys())
    if 'best_model' in st.session_state:
        available_models.append("PyCaret Best Model")

    selected_model = st.selectbox("🤖 Select model to explain:", available_models)

    # SHAP analysis only runs for classical models; the PyCaret pipeline object
    # is not handled by this branch.
    if selected_model != "PyCaret Best Model":
        model_data = st.session_state.trained_models[selected_model]
        model = model_data['model']
        features = model_data['features']
        X_test = model_data['X_test']

        # SHAP Explanation
        st.subheader("🔬 SHAP Analysis")
        try:
            # Create SHAP explainer on a 100-row subset to keep it responsive.
            with st.spinner("Creating SHAP explainer..."):
                explainer = shap.Explainer(model, X_test.iloc[:100])  # Use subset for performance
                shap_values = explainer(X_test.iloc[:100])

            # Global feature importance (mean |SHAP| per feature).
            # NOTE(review): shap.plots.bar/beeswarm/waterfall may not accept an
            # ax= keyword in all SHAP versions — confirm against the pinned
            # shap release; plots draw on the current pyplot figure otherwise.
            st.subheader("🌍 Global Feature Importance")
            fig, ax = plt.subplots()
            shap.plots.bar(shap_values, ax=ax, show=False)
            st.pyplot(fig)

            # Summary (beeswarm) plot of per-sample feature impacts.
            st.subheader("📊 Feature Impact Summary")
            fig, ax = plt.subplots()
            shap.plots.beeswarm(shap_values, ax=ax, show=False)
            st.pyplot(fig)

            # Individual prediction explanation (waterfall for one instance).
            # NOTE(review): slider allows indices up to len(X_test)-1 but
            # shap_values only covers the first 100 rows — out-of-range picks
            # will raise; confirm whether the slider should cap at 99.
            st.subheader("🔍 Individual Prediction Explanation")
            instance_idx = st.slider("Select instance:", 0, len(X_test)-1, 0)
            fig, ax = plt.subplots()
            shap.plots.waterfall(shap_values[instance_idx], ax=ax, show=False)
            st.pyplot(fig)

            # Feature dependence scatter for a user-chosen feature.
            if len(features) > 1:
                st.subheader("📈 Feature Dependence")
                feature_for_dependence = st.selectbox("Select feature:", features)
                if feature_for_dependence in X_test.columns:
                    fig, ax = plt.subplots()
                    shap.plots.scatter(shap_values[:, feature_for_dependence], ax=ax, show=False)
                    st.pyplot(fig)
        except Exception as e:
            st.error(f"❌ Error generating SHAP explanations: {str(e)}")
            st.info("💡 SHAP works best with tree-based models (Random Forest, XGBoost, etc.)")

# ---------- Page: MLflow experiment tracking ----------
elif selected_page == "📋 MLflow Tracking":
    st.header("📋 MLflow Experiment Tracking")

    if not MLFLOW_AVAILABLE:
        st.warning("⚠️ MLflow is not installed. Install it to use experiment tracking.")
        st.stop()

    # MLflow Configuration: tracking server URI + experiment name.
    st.subheader("⚙️ MLflow Configuration")
    col1, col2 = st.columns(2)
    with col1:
        tracking_uri = st.text_input("🔗 Tracking URI:", "http://localhost:5000")
        experiment_name = st.text_input("🧪 Experiment Name:", "super_app_experiments")
    with col2:
        if st.button("🔧 Set MLflow Configuration"):
            try:
                mlflow.set_tracking_uri(tracking_uri)
                mlflow.set_experiment(experiment_name)
                st.success("✅ MLflow configuration set!")
            except Exception as e:
                st.error(f"❌ Error setting MLflow: {str(e)}")

    # Log a trained classical model (with its params/metrics) as an MLflow run.
    st.subheader("📊 Log Models to MLflow")
    if st.session_state.trained_models:
        model_to_log = st.selectbox("Select model to log:", list(st.session_state.trained_models.keys()))

        if st.button("📤 Log Model"):
            try:
                # Timestamped run name so repeated logs of the same model stay distinct.
                with mlflow.start_run(run_name=f"{model_to_log}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
                    model_data = st.session_state.trained_models[model_to_log]
                    model = model_data['model']

                    # Log model artifact under the "model" path.
                    mlflow.sklearn.log_model(model, "model")

                    # Log parameters
                    mlflow.log_param("model_type", model_to_log)
                    mlflow.log_param("features", model_data['features'])
                    mlflow.log_param("target", model_data['target'])

                    # Log metrics (if available) — regression gets mse/mae/r2,
                    # classification only accuracy.
                    if 'predictions' in model_data:
                        y_test = model_data['y_test']
                        predictions = model_data['predictions']
                        if model_data['problem_type'] == "Regression":
                            mlflow.log_metric("mse", mean_squared_error(y_test, predictions))
                            mlflow.log_metric("mae", mean_absolute_error(y_test, predictions))
                            mlflow.log_metric("r2", r2_score(y_test, predictions))
                        else:
                            mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))

                    st.success("✅ Model logged to MLflow!")
            except Exception as e:
                st.error(f"❌ Error logging model: {str(e)}")

    # Display recent runs from the configured tracking server, newest first.
    st.subheader("📈 Recent Experiment Runs")
    if st.button("🔄 Refresh Runs"):
        try:
            runs = mlflow.search_runs(order_by=["start_time desc"])
            if not runs.empty:
                # NOTE(review): column selection raises KeyError if a metric
                # column (e.g. metrics.accuracy) is absent from every run —
                # caught by the except below; confirm that is acceptable.
                st.dataframe(runs[['run_id', 'status', 'start_time',
                                   'params.model_type', 'metrics.mse',
                                   'metrics.r2', 'metrics.accuracy']],
                             use_container_width=True)
            else:
                st.info("📊 No runs found. Start logging some models!")
        except Exception as e:
            st.error(f"❌ Error fetching runs: {str(e)}")

# ---------- Page: model export / deployment artifacts ----------
elif selected_page == "🚀 Model Deployment":
    st.header("🚀 Model Deployment & Export")

    if not st.session_state.trained_models and 'best_model' not in st.session_state:
        st.warning("⚠️ No trained models available for deployment.")
        st.stop()

    # Model selection for deployment
    available_models = list(st.session_state.trained_models.keys())
    if 'best_model' in st.session_state:
        available_models.append("PyCaret Best Model")

    selected_model = st.selectbox("🤖 Select model for deployment:", available_models)

    # Model export options: pickle download, prediction script, Docker files.
    st.subheader("💾 Export Options")
    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("📦 Export Model (Pickle)"):
            try:
                import pickle
                if selected_model == "PyCaret Best Model":
                    model_to_export = st.session_state.best_model
                else:
                    model_to_export = st.session_state.trained_models[selected_model]['model']

                # Serialize model in-memory and hand the bytes to the browser.
                model_bytes = pickle.dumps(model_to_export)
                st.download_button(
                    label="📥 Download Model",
                    data=model_bytes,
                    file_name=f"{selected_model.replace(' ', '_')}_model.pkl",
                    mime="application/octet-stream"
                )
                st.success("✅ Model ready for download!")
            except Exception as e:
                st.error(f"❌ Error exporting model: {str(e)}")

    with col2:
        if st.button("📄 Generate Prediction Script"):
            # Generate a standalone Python script for offline predictions.
            # The doubled {{ }} braces are f-string escapes producing literal
            # braces in the emitted code.
            script_content = f'''
import pandas as pd
import pickle
import numpy as np

# Load the trained model
def load_model(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model

# Make predictions
def predict(model, input_data):
    """
    Make predictions using the trained model

    Parameters:
    model: Trained model object
    input_data: pandas DataFrame with features

    Returns:
    predictions: numpy array of predictions
    """
    predictions = model.predict(input_data)
    return predictions

# Example usage
if __name__ == "__main__":
    # Load your model
    model = load_model("path_to_your_model.pkl")

    # Create sample input data (replace with your actual data)
    sample_data = pd.DataFrame({{
        # Add your feature columns here
        # 'feature1': [value1],
        # 'feature2': [value2],
    }})

    # Make predictions
    predictions = predict(model, sample_data)
    print("Predictions:", predictions)
'''
            st.download_button(
                label="📥 Download Script",
                data=script_content,
                file_name=f"{selected_model.replace(' ', '_')}_prediction_script.py",
                mime="text/plain"
            )
            st.success("✅ Prediction script ready!")

    with col3:
        if st.button("🐳 Generate Dockerfile"):
            # Minimal container recipe matching the generated Flask API below.
            dockerfile_content = '''
FROM python:3.9-slim

WORKDIR /app

# Copy requirements
COPY requirements.txt .
RUN pip install -r requirements.txt

# Copy model and script
COPY model.pkl .
COPY app.py .

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "app.py"]
'''

            requirements_content = '''
pandas==1.5.3
scikit-learn==1.3.0
numpy==1.24.3
flask==2.3.2
'''

            col_a, col_b = st.columns(2)
            with col_a:
                st.download_button(
                    label="📥 Download Dockerfile",
                    data=dockerfile_content,
                    file_name="Dockerfile",
                    mime="text/plain"
                )
            with col_b:
                st.download_button(
                    label="📥 Download Requirements",
                    data=requirements_content,
                    file_name="requirements.txt",
                    mime="text/plain"
                )
            st.success("✅ Docker files ready!")

    # Model API endpoint generator — emits a Flask app.py for the pickle above.
    st.subheader("🌐 API Endpoint Generator")
    if st.button("🔧 Generate Flask API"):
        # NOTE(review): the f-prefix here has no interpolations; the {{ }}
        # pairs only escape literal braces for the emitted code.
        api_code = f'''
from flask import Flask, request, jsonify
import pandas as pd
import pickle
import numpy as np

app = Flask(__name__)

# Load model at startup
model = None

def load_model():
    global model
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from request
        data = request.get_json()

        # Convert to DataFrame
        df = pd.DataFrame([data])

        # Make prediction
        prediction = model.predict(df)

        # Return result
        return jsonify({{
            'prediction': prediction.tolist(),
            'status': 'success'
        }})
    except Exception as e:
        return jsonify({{
            'error': str(e),
            'status': 'error'
        }}), 400

@app.route('/health', methods=['GET'])
def health():
    return jsonify({{'status': 'healthy'}})

if __name__ == '__main__':
    load_model()
    app.run(host='0.0.0.0', port=8000, debug=False)
'''
        st.download_button(
            label="📥 Download Flask API",
            data=api_code,
            file_name="app.py",
            mime="text/plain"
        )
        st.success("✅ Flask API code ready!")

    # Deployment instructions (static markdown).
    st.subheader("📋 Deployment Instructions")
    st.markdown("""
    ### 🚀 Deployment Steps:

    1. **Local Deployment:**
       - Download the model pickle file
       - Download the prediction script or Flask API
       - Install required dependencies: `pip install -r requirements.txt`
       - Run the application: `python app.py`

    2. **Docker Deployment:**
       - Download all generated files (Dockerfile, requirements.txt, app.py, model.pkl)
       - Build image: `docker build -t my-ml-app .`
       - Run container: `docker run -p 8000:8000 my-ml-app`

    3. **Cloud Deployment:**
       - **AWS**: Upload to EC2 or use ECS with the Docker image
       - **GCP**: Deploy to Google Cloud Run or App Engine
       - **Azure**: Use Azure Container Instances or App Service
       - **Heroku**: Push Docker image to Heroku Container Registry

    4. **API Usage Example:**
    ```bash
    curl -X POST http://localhost:8000/predict \
         -H "Content-Type: application/json" \
         -d '{"feature1": 1.0, "feature2": 2.0}'
    ```
    """)

    # Model performance summary for the chosen classical model.
    if selected_model != "PyCaret Best Model" and selected_model in st.session_state.trained_models:
        st.subheader("📊 Model Summary for Deployment")
        model_data = st.session_state.trained_models[selected_model]

        col1, col2 = st.columns(2)
        with col1:
            st.write("**Model Details:**")
            st.write(f"- **Type:** {selected_model}")
            st.write(f"- **Problem Type:** {model_data['problem_type']}")
            st.write(f"- **Features:** {len(model_data['features'])}")
            st.write(f"- **Target:** {model_data['target']}")
        with col2:
            if 'predictions' in model_data:
                y_test = model_data['y_test']
                predictions = model_data['predictions']
                st.write("**Performance Metrics:**")
                if model_data['problem_type'] == "Regression":
                    r2 = r2_score(y_test, predictions)
                    mae = mean_absolute_error(y_test, predictions)
                    st.write(f"- **R² Score:** {r2:.4f}")
                    st.write(f"- **MAE:** {mae:.4f}")
                else:
                    accuracy = accuracy_score(y_test, predictions)
                    st.write(f"- **Accuracy:** {accuracy:.4f}")

# ================== FOOTER ==================
# Three-column footer: quick stats, a reset action, and app/session info.
st.markdown("---")
col1, col2, col3 = st.columns(3)

with col1:
    st.markdown("### 📊 Quick Stats")
    if st.session_state.df is not None:
        st.write(f"Dataset: {st.session_state.df.shape[0]} rows × {st.session_state.df.shape[1]} cols")
        st.write(f"Models Trained: {len(st.session_state.trained_models)}")

with col2:
    st.markdown("### 🔗 Quick Actions")
    if st.button("🔄 Reset All Data", key="footer_reset"):
        # Wipe session state except auth/demo flags, then force a rerun.
        for key in list(st.session_state.keys()):
            if key not in ['authenticated', 'demo_mode']:
                del st.session_state[key]
        st.success("✅ All data reset!")
        st.rerun()

with col3:
    st.markdown("### ℹ️ App Info")
    st.write("Super Data Science App v2.0")
    st.write(f"Session: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

# ================== SIDEBAR STATUS ==================
st.sidebar.markdown("---")
st.sidebar.subheader("📊 Current Status")

# Data status
if st.session_state.df is not None:
    st.sidebar.success(f"✅ Data Loaded ({st.session_state.df.shape[0]} rows)")
else:
    st.sidebar.warning("⚠️ No Data Loaded")

# Models status
if st.session_state.trained_models:
    st.sidebar.success(f"✅ {len(st.session_state.trained_models)} Classical Models")
else:
    st.sidebar.info("ℹ️ No Classical Models")

if st.session_state.pycaret_setup_done:
    st.sidebar.success("✅ PyCaret Setup Complete")
else:
    st.sidebar.info("ℹ️ PyCaret Not Setup")

if st.session_state.dl_models:
    st.sidebar.success(f"✅ {len(st.session_state.dl_models)} Deep Learning Models")
else:
    st.sidebar.info("ℹ️ No Deep Learning Models")

# Available libraries status — flags set by the import guards at file top.
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Libraries Status")
st.sidebar.write(f"PyCaret: {'✅' if PYCARET_AVAILABLE else '❌'}")
st.sidebar.write(f"PyTorch: {'✅' if TORCH_AVAILABLE else '❌'}")
st.sidebar.write(f"MLflow: {'✅' if MLFLOW_AVAILABLE else '❌'}")
st.sidebar.write(f"SHAP: {'✅' if SHAP_AVAILABLE else '❌'}")
#st.sidebar.write(f"Profiling: {'✅' if PROFILING_AVAILABLE else '❌'}")

# Help section
st.sidebar.markdown("---")
st.sidebar.subheader("❓ Need Help?")
st.sidebar.markdown("""
**Quick Start:**
1. 📊 Load data (sample or upload)
2. 🔍 Explore with EDA
3. 🤖 Train models (Classical or AutoML)
4. 🎯 Evaluate performance
5. 🚀 Deploy your model

**Tips:**
- Use sample data for quick testing
- PyCaret AutoML for best results
- Export models for production use
""")

# Advanced features hint
if st.sidebar.button("🎯 Show Advanced Tips"):
    st.sidebar.info("""
    **Advanced Features:**
    - Feature engineering in EDA
    - Hyperparameter tuning in Classical ML
    - Cross-validation in PyCaret
    - SHAP explanations for interpretability
    - MLflow for experiment tracking
    - Docker deployment ready
    """)

# Debug mode for development — lists non-private session-state keys.
if st.sidebar.checkbox("🐛 Debug Mode", key="debug_mode"):
    st.sidebar.subheader("🔧 Debug Info")
    st.sidebar.write("Session State Keys:")
    for key in st.session_state.keys():
        if not key.startswith('_'):
            st.sidebar.write(f"- {key}")

# Performance optimization note
st.sidebar.markdown("---")
st.sidebar.caption("💡 For large datasets, consider using data sampling for faster processing")
st.sidebar.caption(f"⏰ Last updated: {datetime.now().strftime('%H:%M:%S')}")

# Auto-refresh data (for development)
if st.sidebar.button("🔄 Auto Refresh", key="auto_refresh"):
    st.rerun()

# Export session state — a plain-text summary, not the models themselves.
if st.sidebar.button("💾 Export Session", key="export_session"):
    session_data = {
        'trained_models_count': len(st.session_state.trained_models),
        'data_loaded': st.session_state.df is not None,
        'pycaret_setup': st.session_state.pycaret_setup_done,
        'timestamp': datetime.now().isoformat()
    }
    st.sidebar.download_button(
        label="📥 Download Session Info",
        data=str(session_data),
        file_name=f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
        mime="text/plain"
    )

# Success message for completion of the full pipeline.
# NOTE(review): st.sidebar.balloons() may not exist — balloons is an app-level
# effect in Streamlit (st.balloons()); confirm against the pinned version.
if (st.session_state.df is not None and
        st.session_state.trained_models and
        st.session_state.pycaret_setup_done):
    st.sidebar.success("🎉 Full Pipeline Complete!")
    st.sidebar.balloons()

# Warning for missing dependencies, with a ready-made pip install command.
missing_deps = []
if not PYCARET_AVAILABLE:
    missing_deps.append("pycaret")
if not MLFLOW_AVAILABLE:
    missing_deps.append("mlflow")
if not SHAP_AVAILABLE:
    missing_deps.append("shap")
#if not PROFILING_AVAILABLE:
#    missing_deps.append("ydata-profiling")

if missing_deps:
    st.sidebar.warning(f"⚠️ Missing: {', '.join(missing_deps)}")
    st.sidebar.code(f"pip install {' '.join(missing_deps)}")

# Fun facts shown at random from the sidebar button below.
fun_facts = [
    "🧠 Machine Learning can predict with 95%+ accuracy in many domains",
    "🚀 AutoML can save 80% of model development time",
    "📊 Feature engineering often provides the biggest performance boost",
    "🔬 Model explainability is crucial for production deployment",
    "⚡ Ensemble methods usually outperform single models",
    "📈 Cross-validation prevents overfitting better than simple train/test split"
]

# NOTE(review): mid-file import — works, but convention puts it at file top.
import random
if st.sidebar.button("💡 Random ML Tip", key="random_tip"):
    st.sidebar.info(random.choice(fun_facts))

# Resource links
st.sidebar.markdown("---")
st.sidebar.subheader("📚 Resources")
st.sidebar.markdown("""
- [PyCaret Documentation](https://pycaret.org/)
- [MLflow Documentation](https://mlflow.org/)
- [SHAP Tutorials](https://shap.readthedocs.io/)
- [Scikit-learn Guide](https://scikit-learn.org/)
""")

# Version info and credits
st.sidebar.markdown("---")
st.sidebar.caption("🚀 Super Data Science App")
st.sidebar.caption("Version 2.0 - Full Pipeline")
st.sidebar.caption("Built with Streamlit ❤️")