Spaces:
Sleeping
Sleeping
Enhance README.md with detailed application overview, features, installation instructions, and usage guidelines for synthetic data generation and ML model training.
Browse files- .gitignore +4 -0
- App.py +1316 -0
- README.md +102 -13
- pages/02_Algorithm_Education.py +1250 -0
- pages/03_Model_implementation.py +227 -0
- requirements.txt +8 -0
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models/
|
| 2 |
+
temp_uploads/
|
| 3 |
+
__pycache__/
|
| 4 |
+
|
App.py
ADDED
|
@@ -0,0 +1,1316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
| 7 |
+
import plotly.express as px
|
| 8 |
+
from sklearn.linear_model import LogisticRegression, RidgeClassifier
|
| 9 |
+
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
|
| 10 |
+
from sklearn.svm import SVC, LinearSVC
|
| 11 |
+
from sklearn.naive_bayes import GaussianNB, MultinomialNB
|
| 12 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 13 |
+
from sklearn.neural_network import MLPClassifier
|
| 14 |
+
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
|
| 15 |
+
import time
|
| 16 |
+
import warnings
|
| 17 |
+
import joblib
|
| 18 |
+
import os
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
import seaborn as sns
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
from matplotlib.colors import LinearSegmentedColormap
|
| 23 |
+
from sklearn.model_selection import learning_curve
|
| 24 |
+
import pickle
|
| 25 |
+
warnings.filterwarnings('ignore')
|
| 26 |
+
|
| 27 |
+
class DataGenerator:
    """Generates synthetic, normally-distributed feature data for a set of classes."""

    def __init__(self):
        # Kept for API compatibility; populated by callers if needed.
        self.features = None
        self.feature_configs = None
        self.classes = None
        self.class_configs = None

    def generate_synthetic_data(self, n_samples, feature_configs, classes, class_configs=None):
        """Generate synthetic data based on configurations.

        Parameters
        ----------
        n_samples : int
            Total number of rows to generate, distributed across classes.
        feature_configs : dict[str, dict]
            Per-feature settings. Each value carries 'std' and either
            'type' == 'random' (random center drawn per class) or an
            explicit 'center'.
        classes : list[str]
            Class labels; one block of rows is generated per class.
        class_configs : dict[str, dict] | None
            Optional per-class {'mean': [...], 'std': [...]} overrides,
            indexed in the same order as feature_configs.

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            Feature matrix of shape (n_samples, n_features), values rounded
            to 2 decimals, and the matching label vector.
        """
        n_classes = len(classes)

        X = []
        y = []
        # Distribute samples as evenly as possible. The first
        # (n_samples % n_classes) classes absorb one extra row each so the
        # total is exactly n_samples; the previous integer division silently
        # dropped up to n_classes - 1 requested samples.
        base_count = n_samples // n_classes
        remainder = n_samples % n_classes

        for i in range(n_classes):
            class_name = classes[i]
            samples_per_class = base_count + (1 if i < remainder else 0)
            if samples_per_class == 0:
                # More classes than samples: nothing to emit for this class.
                continue
            class_samples = []

            for j, (feature_name, config) in enumerate(feature_configs.items()):
                if class_configs and class_name in class_configs:
                    # Explicit per-class override wins over feature defaults.
                    center = class_configs[class_name]['mean'][j]
                    std = class_configs[class_name]['std'][j]
                else:
                    if config['type'] == 'random':
                        center = np.random.randn() * 5
                    else:
                        center = config['center']
                    std = config['std']

                feature_samples = np.round(np.random.normal(
                    loc=center,
                    scale=std,
                    size=samples_per_class
                ), decimals=2)
                class_samples.append(feature_samples)

            X.append(np.column_stack(class_samples))
            y.extend([class_name] * samples_per_class)

        X = np.vstack(X)
        return X, np.array(y)
|
| 71 |
+
|
| 72 |
+
class ModelManager:
    """Builds, trains, evaluates and persists a fixed suite of scikit-learn classifiers."""

    @staticmethod
    def get_classifiers():
        """Return dictionary of classifiers with appropriate preprocessing.

        Each entry maps a display name to {'model': estimator, 'scaler': scaler}.
        MultinomialNB is the one exception to StandardScaler: it requires
        non-negative inputs, so it is paired with MaxAbsScaler instead.
        """
        return {
            'LogisticRegression': {
                'model': LogisticRegression(max_iter=1000),
                'scaler': StandardScaler()
            },
            'RidgeClassifier': {
                'model': RidgeClassifier(),
                'scaler': StandardScaler()
            },
            'RandomForestClassifier': {
                'model': RandomForestClassifier(random_state=42),
                'scaler': StandardScaler()
            },
            'AdaBoostClassifier': {
                'model': AdaBoostClassifier(),
                'scaler': StandardScaler()
            },
            'ExtraTreesClassifier': {
                'model': ExtraTreesClassifier(),
                'scaler': StandardScaler()
            },
            'SVC': {
                'model': SVC(),
                'scaler': StandardScaler()
            },
            'LinearSVC': {
                'model': LinearSVC(max_iter=2000),
                'scaler': StandardScaler()
            },
            'GaussianNB': {
                'model': GaussianNB(),
                'scaler': StandardScaler()
            },
            'KNeighborsClassifier': {
                'model': KNeighborsClassifier(),
                'scaler': StandardScaler()
            },
            'MLPClassifier': {
                'model': MLPClassifier(max_iter=1000),
                'scaler': StandardScaler()
            },
            'MultinomialNB': {
                'model': MultinomialNB(),
                'scaler': MaxAbsScaler()
            }
        }

    @staticmethod
    def ensure_non_negative(X):
        """Ensure data is non-negative by shifting.

        Used before MaxAbsScaler + MultinomialNB, which rejects negative
        values. Shifts the whole matrix up by |global min| when any value
        is negative; otherwise returns the input unchanged.
        """
        if isinstance(X, pd.DataFrame):
            min_val = X.values.min()
            if min_val < 0:
                return X + abs(min_val)
            return X
        else:
            min_val = X.min()
            if min_val < 0:
                # min_val is negative here, so subtracting it shifts upward.
                return X - min_val
            return X

    def save_model(self, model_dict, model_name):
        """Save model and its scaler to files.

        Creates a local 'models/' directory on first use and writes
        '<model_name>_model.joblib' and '<model_name>_scaler.joblib'.

        Returns
        -------
        tuple[str, str]
            (model_path, scaler_path) of the written files.
        """
        if not os.path.exists('models'):
            os.makedirs('models')

        base_filename = f"{model_name}"

        # NOTE(review): feature_names_in_ is stamped onto the scaler by hand
        # so downstream pages can recover column names after loading. This is
        # not a public sklearn API contract — confirm it survives sklearn
        # upgrades.
        if hasattr(model_dict['model'], 'feature_names_in_'):
            model_dict['scaler'].feature_names_in_ = model_dict['model'].feature_names_in_
        elif hasattr(st.session_state, 'features'):
            model_dict['scaler'].feature_names_in_ = np.array(st.session_state.features)

        model_path = os.path.join('models', f"{base_filename}_model.joblib")
        scaler_path = os.path.join('models', f"{base_filename}_scaler.joblib")

        joblib.dump(model_dict['model'], model_path)
        joblib.dump(model_dict['scaler'], scaler_path)

        return model_path, scaler_path

    def train_and_evaluate_model(self, clf_dict, X_train, X_test, y_train, y_test, model_name):
        """Train and evaluate a single model.

        Fits clf_dict['scaler'] on the training data, transforms both splits,
        fits the model, persists both artifacts via save_model(), and returns
        a result record.

        Returns
        -------
        dict
            Keys: model_name, accuracy, training_time, model, predictions,
            status ('success' or 'failed: <reason>'), scaler (path),
            model_path, confusion_matrix. On any exception a zeroed record
            with the failure reason is returned instead of raising, so one
            bad model cannot abort the whole training loop.
        """
        start_time = time.time()

        try:
            scaler = clf_dict['scaler']
            feature_names = st.session_state.features if hasattr(st.session_state, 'features') else None

            if model_name == 'MultinomialNB':
                # MultinomialNB cannot accept negatives: shift first, then
                # scale, then double-check the scaled output.
                X_train_positive = self.ensure_non_negative(X_train)
                X_test_positive = self.ensure_non_negative(X_test)
                X_train_scaled = scaler.fit_transform(X_train_positive)
                X_test_scaled = scaler.transform(X_test_positive)

                if np.any(X_train_scaled < 0) or np.any(X_test_scaled < 0):
                    raise ValueError("Negative values in scaled data")
            else:
                # Scaler is fit on the training split only to avoid leakage.
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

            # NOTE(review): manually overwriting feature_names_in_ on fitted
            # estimators — not a supported sklearn workflow; verify.
            if feature_names is not None:
                if hasattr(clf_dict['model'], 'feature_names_in_'):
                    clf_dict['model'].feature_names_in_ = np.array(feature_names)
                scaler.feature_names_in_ = np.array(feature_names)

            clf_dict['model'].fit(X_train_scaled, y_train)
            y_pred = clf_dict['model'].predict(X_test_scaled)

            accuracy = accuracy_score(y_test, y_pred)
            training_time = time.time() - start_time

            model_path, scaler_path = self.save_model(clf_dict, model_name)
            conf_matrix = confusion_matrix(y_test, y_pred)

            return {
                'model_name': model_name,
                'accuracy': accuracy,
                'training_time': training_time,
                'model': clf_dict['model'],
                'predictions': y_pred,
                'status': 'success',
                'scaler': scaler_path,
                'model_path': model_path,
                'confusion_matrix': conf_matrix
            }
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            return {
                'model_name': model_name,
                'accuracy': 0,
                'training_time': 0,
                'model': None,
                'predictions': None,
                'status': f'failed: {str(e)}',
                'scaler': None,
                'model_path': None,
                'confusion_matrix': None
            }
|
| 214 |
+
|
| 215 |
+
class Visualizer:
    """Builds matplotlib/seaborn and plotly figures that summarize model results."""

    @staticmethod
    def plot_learning_curve(estimator, X, y, title, ax):
        """Plot learning curves for a model.

        Computes 5-fold cross-validated accuracy at 10 training-set sizes
        (10%..100%) and draws mean curves with +/- 1 std bands on the
        supplied matplotlib axis.
        """
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=5,
            n_jobs=-1,
            scoring='accuracy'
        )

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        ax.plot(train_sizes, train_mean, label='Training score')
        ax.plot(train_sizes, test_mean, label='Cross-validation score')

        # Shaded +/- 1 std bands around each mean curve.
        ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
        ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

        ax.set_xlabel('Training Examples')
        ax.set_ylabel('Score')
        ax.set_title(title)
        ax.legend(loc='lower right')
        ax.grid(True)

    def create_confusion_matrices_plot(self, successful_results, y_test):
        """Create and display confusion matrices for successful models.

        Lays out one seaborn heatmap per result in a 2-column grid and
        returns the assembled matplotlib figure.
        """
        n_models = len(successful_results)
        n_cols = 2
        # Ceiling division: enough rows for all models at 2 per row.
        n_rows = (n_models + n_cols - 1) // n_cols

        fig = plt.figure(figsize=(15, 5 * n_rows))
        colors = ['white', '#4a90e2']
        n_bins = 100
        # cmap = LinearSegmentedColormap.from_list("custom_blues", colors, N=n_bins)

        for idx, result in enumerate(successful_results):
            ax = plt.subplot(n_rows, n_cols, idx + 1)

            sns.heatmap(
                result['confusion_matrix'],
                annot=True,
                fmt='d',
                # cmap=cmap,
                cmap='viridis',
                ax=ax,
                # Sorted unique labels keep axis order consistent with
                # sklearn's confusion_matrix row/column order.
                xticklabels=sorted(set(y_test)),
                yticklabels=sorted(set(y_test))
            )

            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            ax.set_title(f"{result['model_name']}\nAccuracy: {result['accuracy']:.4f}")

        plt.tight_layout()
        return fig

    def create_performance_summary_plot(self, successful_df, selected_models):
        """Create performance metrics summary plot.

        Returns a grouped plotly bar chart comparing the selected models on
        Accuracy / Precision / Recall / F1-Score, ordered by mean score.

        NOTE(review): assumes successful_df already carries 'Model',
        'Accuracy', 'Precision', 'Recall' and 'F1-Score' columns — produced
        elsewhere (not visible in this block); confirm against the caller.
        """
        metrics_to_compare = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        # Long-form reshape so plotly can group bars by metric.
        summary_df = successful_df[successful_df['Model'].isin(selected_models)].melt(
            id_vars=['Model'],
            value_vars=metrics_to_compare,
            var_name='Metric',
            value_name='Score'
        )

        fig_summary = px.bar(
            summary_df,
            x='Model',
            y='Score',
            color='Metric',
            barmode='group',
            title="Model Performance Metrics Comparison",
            text='Score'
        )

        fig_summary.update_layout(
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            yaxis=dict(
                range=[0, 1],
                title='Score'
            ),
            legend=dict(
                title='Metric',
                orientation='h',
                yanchor='bottom',
                y=1.02,
                xanchor='right',
                x=1
            )
        )

        fig_summary.update_traces(
            texttemplate='%{text:.4f}',
            textposition='outside',
            textangle=0
        )

        # Order models left-to-right by their average metric score (best first).
        summary_df['Avg_Score'] = summary_df.groupby('Model')['Score'].transform('mean')
        models_order = summary_df.drop_duplicates('Model').sort_values('Avg_Score', ascending=False)['Model']
        fig_summary.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': models_order})

        return fig_summary
|
| 325 |
+
|
| 326 |
+
class StreamlitUI:
|
| 327 |
+
def __init__(self):
    """Wire up collaborators, default vegetable configs, and session state."""
    self.data_generator = DataGenerator()
    self.model_manager = ModelManager()
    self.visualizer = Visualizer()

    # Add default configurations as class attribute
    self.default_configs = {
        # Features: [length (mm), width (mm), density (g/cm³), pH]

        # AMPALAYA: Medium length (150-180mm), thin width (40-50mm)
        # Medium density (95 g/cm³) due to hollow interior, slightly basic pH (6.8-7.0)
        "Ampalaya": {'mean': [165, 45, 95, 6.9], 'std': [15, 5, 10, 0.1]},

        # BANANA: Long length (180-220mm), medium width (30-40mm)
        # Low density (85 g/cm³), acidic pH (4.5-5.2)
        "Banana": {'mean': [200, 35, 85, 4.8], 'std': [20, 5, 8, 0.3]},

        # CABBAGE: Round shape - similar length/width (150-200mm x 150-200mm)
        # Very low density (65 g/cm³) due to layered leaves, neutral pH (6.5-7.0)
        "Cabbage": {'mean': [175, 175, 65, 6.8], 'std': [25, 25, 5, 0.2]},

        # CARROT: Medium length (140-180mm), narrow width (25-35mm)
        # High density (115 g/cm³) due to dense flesh, slightly acidic pH (6.0-6.5)
        "Carrot": {'mean': [160, 30, 115, 6.3], 'std': [20, 5, 10, 0.2]},

        # CASSAVA: Long length (200-300mm), thick width (50-80mm)
        # High density (125 g/cm³) due to starchy flesh, slightly acidic pH (6.0-6.5)
        "Cassava": {'mean': [250, 65, 125, 6.2], 'std': [50, 15, 12, 0.2]}
    }

    # Default feature names that match the measurements in default_configs
    self.default_features = [
        'length (mm)',
        'width (mm)',
        'density (g/cm³)',
        'pH'
    ]

    # Add new session state variables for static visualizations
    self.initialize_static_visualizations()

    # Add new session state variable for data source
    # ('synthetic' by default; other values presumably set elsewhere — verify)
    if 'data_source' not in st.session_state:
        st.session_state.data_source = 'synthetic'
|
| 371 |
+
|
| 372 |
+
def initialize_static_visualizations(self):
    """Ensure the cached static-figure slots exist in session state."""
    for fig_key in ('confusion_matrices_fig', 'learning_curves_fig'):
        if fig_key not in st.session_state:
            st.session_state[fig_key] = None
|
| 378 |
+
|
| 379 |
+
def initialize_session_state(self):
    """Seed every app-level session-state slot that has not been set yet."""
    defaults = {
        'data_generated': False,
        'df': None,
        'features': None,
        'feature_configs': None,
        'X_train': None,
        'X_test': None,
        'y_train': None,
        'y_test': None,
        'y_pred': None,
        'model_results': None,
        'best_model': None,
        'accuracy': None,
        'feature_importance': None,
        'split_info': None,
    }

    # Only fill slots streamlit has not already populated on a rerun.
    for name, default in defaults.items():
        if name not in st.session_state:
            st.session_state[name] = default
|
| 401 |
+
|
| 402 |
+
def setup_page_config(self):
    """Configure the Streamlit page (must be the first st.* call on the page)."""
    st.set_page_config(
        page_title="ML Model Generator & Implementation",
        page_icon="🤖",
        layout="wide",
        menu_items={
            'About': """
            ## Final project in Modeling and Simulation \n
            ### Juan Dela Cruz - BSCS 4A"""
        }
    )
|
| 414 |
+
|
| 415 |
+
def get_sidebar_inputs(self):
    """Read feature and class names from the sidebar and build default feature configs.

    Returns
    -------
    tuple[list[str], dict, list[str]]
        (features, feature_configs, classes) where feature_configs maps
        each feature to its default random-center generation settings.
    """
    st.sidebar.header("Data Generation Parameters")

    # Feature configuration
    st.sidebar.subheader("Feature Configuration")

    # Seed the text box once; afterwards streamlit owns the widget state.
    if 'features_input' not in st.session_state:
        st.session_state.features_input = ", ".join(self.default_features)

    raw_features = st.sidebar.text_input(
        "Enter feature names (comma-separated)",
        key='features_input'
    )
    features = [name.strip() for name in raw_features.split(",")]

    # Same one-time seeding for the class-name text box.
    if 'classes_input' not in st.session_state:
        st.session_state.classes_input = ", ".join(self.default_configs.keys())

    raw_classes = st.sidebar.text_input(
        "Enter class names (comma-separated)",
        key='classes_input'
    )
    classes = [name.strip() for name in raw_classes.split(",")]

    # Every feature starts from the same random-center config; per-class
    # overrides are collected separately by get_class_configs().
    feature_configs = {
        name: {'type': 'random', 'std': 20.0, 'center': None}
        for name in features
    }

    return features, feature_configs, classes
|
| 452 |
+
|
| 453 |
+
def get_class_configs(self, classes, features):
    """Get class-specific configurations from the sidebar.

    For each class, either read per-feature mean/std from number inputs
    (when the class's "specific values" checkbox is ticked) or fall back
    to defaults (the built-in table, or random well-separated means for
    unknown classes).

    Parameters
    ----------
    classes : list[str]
        Class names to configure.
    features : list[str]
        Feature names; determines how many mean/std pairs each class needs.

    Returns
    -------
    dict[str, dict]
        {class_name: {'mean': [...], 'std': [...]}} with one value per feature.
    """
    class_configs = {}
    st.sidebar.subheader("Class-Specific Settings")

    for class_name in classes:
        with st.sidebar.expander(f"{class_name} Settings", expanded=False):
            checkbox_key = f"use_specific_{class_name}"

            # Initialize checkbox state if not in session state
            if checkbox_key not in st.session_state:
                st.session_state[checkbox_key] = True

            use_specific = st.checkbox(
                f"Set specific values for {class_name}",
                key=checkbox_key
            )

            means = []
            stds = []

            if class_name not in self.default_configs:
                # Unknown class: draw random means in [0, 100), re-drawing
                # until each mean is at least 10 away from every
                # already-chosen class mean for the same feature so the
                # generated classes stay separable.
                random_means = []
                for feat_idx in range(len(features)):
                    mean = np.random.uniform(0, 100)
                    while any(
                        abs(mean - cfg['mean'][feat_idx]) < 10
                        for cfg in class_configs.values() if 'mean' in cfg
                    ):
                        mean = np.random.uniform(0, 100)
                    random_means.append(mean)
                default_values = {'mean': random_means, 'std': [20.0] * len(features)}
            else:
                # Copy the stored defaults before adjusting their length.
                # BUGFIX: the original aliased the lists inside
                # self.default_configs and then extend()ed them in place,
                # permanently corrupting the defaults whenever the user
                # added extra features.
                default_means = list(self.default_configs[class_name]['mean'])
                default_stds = list(self.default_configs[class_name]['std'])

                # If we have more features than default values, extend with random values
                if len(features) > len(default_means):
                    default_means.extend(
                        np.random.uniform(0, 100)
                        for _ in range(len(features) - len(default_means))
                    )
                    default_stds.extend(
                        20.0 for _ in range(len(features) - len(default_stds))
                    )
                # If we have fewer features than default values, truncate
                elif len(features) < len(default_means):
                    default_means = default_means[:len(features)]
                    default_stds = default_stds[:len(features)]

                default_values = {'mean': default_means, 'std': default_stds}

            if use_specific:
                for idx, feature in enumerate(features):
                    mean_key = f"mean_{class_name}_{feature}"
                    std_key = f"std_{class_name}_{feature}"

                    # Seed each widget once; streamlit keeps it afterwards.
                    if mean_key not in st.session_state:
                        st.session_state[mean_key] = float(default_values['mean'][idx])
                    if std_key not in st.session_state:
                        st.session_state[std_key] = float(default_values['std'][idx])

                    col1, col2 = st.columns(2)
                    with col1:
                        mean = st.number_input(
                            f"Mean for {feature}",
                            key=mean_key
                        )
                        means.append(mean)
                    with col2:
                        std = st.number_input(
                            f"Std Dev for {feature}",
                            min_value=0.1,
                            key=std_key
                        )
                        stds.append(std)
            else:
                # Use default values if specific values not requested
                means = default_values['mean']
                stds = default_values['std']

            class_configs[class_name] = {
                'mean': means,
                'std': stds
            }

    return class_configs
|
| 538 |
+
|
| 539 |
+
def get_training_params(self):
    """Get training parameters from the sidebar.

    Returns
    -------
    tuple[int, int]
        (n_samples, test_size) where test_size is a whole-number
        percentage (10-50) of the data reserved for testing.
    """
    st.sidebar.subheader("Sample Size & Train/Test Split Configuration")

    # Initialize default values if not in session state
    # (seeding via session state instead of a value= argument avoids the
    # streamlit warning about widgets with both a default and keyed state).
    if 'n_samples' not in st.session_state:
        st.session_state.n_samples = 10000

    col1, col2 = st.sidebar.columns(2)

    with col1:
        # Range 500-50000 in steps of 500; initial position comes from
        # the session-state seed above.
        n_samples = st.slider(
            "Number of samples",
            500,
            50000,
            step=500,
            key='n_samples'
        )

    with col2:
        test_size = st.slider(
            "Test Size",
            min_value=10,
            max_value=50,
            value=30,  # Default value directly in the widget
            step=5,
            key='test_size',
            format="%d%%",
            help="Percentage of data to use for testing"
        )
        st.write(f"Test: {test_size}% / Train: {100 - test_size}%")

    return n_samples, test_size
|
| 572 |
+
|
| 573 |
+
def generate_and_train(self, n_samples, feature_configs, classes, class_configs, test_size):
    """Generate synthetic data, split it, train all classifiers, and cache everything.

    Side effects: stores the dataframe, train/test split, per-model results,
    best model, split metadata, and pre-rendered figures in st.session_state.

    Fix: the list of successful results was computed twice (once from `results`,
    once from `st.session_state.model_results`); it is now built a single time
    and reused for best-model selection and figure generation.

    Args:
        n_samples: total number of rows to synthesize.
        feature_configs: per-feature distribution settings.
        classes: list of class labels.
        class_configs: per-class mean/std settings.
        test_size: test-set size as a percentage (e.g. 30 -> 30%).
    """
    X, y = self.data_generator.generate_synthetic_data(
        n_samples,
        feature_configs,
        classes,
        class_configs
    )

    st.session_state.df = pd.DataFrame(X, columns=st.session_state.features)
    st.session_state.df['target'] = y

    # Train/test split (percentage -> fraction); fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size / 100,
        random_state=42
    )
    st.session_state.X_train = X_train
    st.session_state.X_test = X_test
    st.session_state.y_train = y_train
    st.session_state.y_test = y_test

    # Train every registered classifier, updating a progress bar as we go.
    classifiers = self.model_manager.get_classifiers()
    results = []
    with st.spinner('Training models... Please wait.'):
        progress_bar = st.progress(0)
        for idx, (name, clf_dict) in enumerate(classifiers.items()):
            results.append(self.model_manager.train_and_evaluate_model(
                clf_dict,
                X_train,
                X_test,
                y_train,
                y_test,
                name
            ))
            progress_bar.progress((idx + 1) / len(classifiers))

    st.session_state.model_results = results
    st.session_state.data_generated = True

    # Successful subset, computed once and reused below.
    successful_results = [r for r in results if r['status'] == 'success']
    if successful_results:
        st.session_state.best_model = max(successful_results, key=lambda x: x['accuracy'])

    st.session_state.split_info = {
        'total_samples': len(X),
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'test_percentage': test_size
    }
    st.session_state.feature_configs = feature_configs

    if successful_results:
        # Pre-render static figures so later reruns don't retrain or redraw.
        st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
            successful_results,
            st.session_state.y_test
        )
        st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)
|
| 645 |
+
|
| 646 |
+
def generate_learning_curves_figure(self, successful_results):
    """Build a matplotlib figure with one learning-curve subplot per model.

    Fix: the original called `successful_results.sort(...)`, silently
    reordering the caller's list; we now sort a copy via `sorted()`.

    Args:
        successful_results: result dicts for models that trained successfully.

    Returns:
        matplotlib.figure.Figure: a grid (2 columns) of learning curves,
        ordered by descending final accuracy.
    """
    ordered = sorted(successful_results, key=lambda r: r['accuracy'], reverse=True)
    n_models = len(ordered)
    n_cols = 2
    n_rows = (n_models + n_cols - 1) // n_cols  # ceil division

    fig_learning = plt.figure(figsize=(15, 5 * n_rows))

    for idx, result in enumerate(ordered):
        ax = plt.subplot(n_rows, n_cols, idx + 1)

        model_name = result['model_name']
        model = result['model']
        scaler = joblib.load(result['scaler'])

        features = st.session_state.df.drop('target', axis=1)
        # MultinomialNB was fit on shifted non-negative inputs, so the same
        # transform must be applied before scaling.
        if model_name == 'MultinomialNB':
            X_scaled = scaler.transform(self.model_manager.ensure_non_negative(features))
        else:
            X_scaled = scaler.transform(features)

        y = st.session_state.df['target']

        self.visualizer.plot_learning_curve(
            model,
            X_scaled,
            y,
            f'Learning Curve - {model_name}\nFinal Accuracy: {result["accuracy"]:.4f}',
            ax
        )

    plt.tight_layout()
    return fig_learning
|
| 682 |
+
|
| 683 |
+
def display_model_comparison(self):
    """Render a table comparing every trained model and return it.

    Returns:
        pd.DataFrame: one row per model (successful or failed), sorted by
        descending accuracy; failed models carry zeroed metrics.
    """
    st.subheader("Model Comparison")

    rows = []
    for result in st.session_state.model_results:
        if result['status'] == 'success':
            report = classification_report(
                st.session_state.y_test,
                result['predictions'],
                output_dict=True
            )
            macro = report['macro avg']
            rows.append({
                'Model': result['model_name'],
                'Accuracy': float(f"{result['accuracy']:.4f}"),
                'Precision': float(f"{macro['precision']:.4f}"),
                'Recall': float(f"{macro['recall']:.4f}"),
                'F1-Score': float(f"{macro['f1-score']:.4f}"),
                'Training Time (s)': float(f"{result['training_time']:.3f}"),
                'Status': 'Success',
            })
        else:
            # Failed models still appear so the user can see what broke.
            rows.append({
                'Model': result['model_name'],
                'Accuracy': 0,
                'Precision': 0,
                'Recall': 0,
                'F1-Score': 0,
                'Training Time (s)': 0,
                'Status': result['status'],
            })

    comparison_df = pd.DataFrame(rows).sort_values('Accuracy', ascending=False)

    st.dataframe(comparison_df.style.format({
        'Accuracy': '{:.4f}',
        'Precision': '{:.4f}',
        'Recall': '{:.4f}',
        'F1-Score': '{:.4f}',
        'Training Time (s)': '{:.3f}'
    }))

    return comparison_df
|
| 730 |
+
|
| 731 |
+
def display_metric_visualization(self, comparison_df):
    """Bar-chart one user-selected metric across successfully trained models.

    Args:
        comparison_df: the table produced by display_model_comparison().

    Returns:
        pd.DataFrame: the successful-models subset, sorted for display.
    """
    metric = st.selectbox(
        "Select metric to visualize",
        ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Training Time (s)']
    )

    ok_df = comparison_df[comparison_df['Status'] == 'Success']

    # Faster training is better, so training time sorts ascending;
    # every other metric sorts descending.
    ok_df = ok_df.sort_values(metric, ascending=(metric == 'Training Time (s)'))

    fig = px.bar(
        ok_df,
        x='Model',
        y=metric,
        title=f"Model {metric} Comparison",
        color=metric,
        text=metric
    )

    fig.update_layout(
        xaxis_tickangle=-45,
        showlegend=True,
        height=500,
        # Scores live in [0, 1]; training time needs an auto-scaled axis.
        yaxis=dict(range=None if metric == 'Training Time (s)' else [0, 1])
    )

    fig.update_traces(
        texttemplate='%{text:.4f}',
        textposition='outside',
        textangle=0
    )

    st.plotly_chart(fig)
    return ok_df
|
| 771 |
+
|
| 772 |
+
def display_best_model_performance(self):
    """Show headline metrics and the classification report for the best model.

    No-op when no best model has been recorded in session state.
    """
    if not hasattr(st.session_state, 'best_model'):
        return

    best = st.session_state.best_model
    st.subheader("Best Model Performance")
    st.write(f"Best Model: **{best['model_name']}**")
    st.write(f"Accuracy: {best['accuracy']:.4f}")

    st.write("Classification Report (Best Model):")
    report = classification_report(
        st.session_state.y_test,
        best['predictions'],
        output_dict=True
    )
    st.dataframe(pd.DataFrame(report).transpose().style.format('{:.4f}'))
|
| 788 |
+
|
| 789 |
+
def display_dataset_info(self):
    """Show total / train / test sample counts from the stored split info.

    No-op when no split information has been recorded yet.
    """
    info = st.session_state.split_info
    if not info:
        return

    st.subheader("Dataset Split Information")
    total_col, train_col, test_col = st.columns(3)

    with total_col:
        st.metric(
            "Total Samples",
            info['total_samples']
        )

    with train_col:
        st.metric(
            "Training Samples",
            f"{info['train_samples']} ({100 - info['test_percentage']}%)"
        )

    with test_col:
        st.metric(
            "Testing Samples",
            f"{info['test_samples']} ({info['test_percentage']}%)"
        )
|
| 814 |
+
|
| 815 |
+
def display_feature_configs(self):
    """Tabulate how each synthetic feature was configured."""
    st.subheader("Feature Configurations")
    rows = [
        {
            'Feature': feature,
            'Type': cfg['type'],
            'Std Dev': cfg['std'],
            # Only user-defined features carry an explicit center.
            'Center': cfg['center'] if cfg['type'] == 'user-defined' else 'Random',
        }
        for feature, cfg in st.session_state.feature_configs.items()
    ]
    st.table(pd.DataFrame(rows))
|
| 827 |
+
|
| 828 |
+
def display_data_samples(self):
    """Show a few random rows per class, raw and scaled with the best model's scaler."""
    st.subheader("Generated Data Sample")

    samples_per_class = 2  # rows shown per class
    per_class = []
    for cls in st.session_state.df['target'].unique():
        subset = st.session_state.df[st.session_state.df['target'] == cls]
        per_class.append(subset.sample(n=min(samples_per_class, len(subset))))

    # Shuffle the combined sample so classes are interleaved.
    sampled_df = pd.concat(per_class).sample(frac=1).reset_index(drop=True)

    raw_col, scaled_col = st.columns(2)

    with raw_col:
        st.write("Original Data (Random samples from each class):")
        st.write(sampled_df)

    with scaled_col:
        st.write("Scaled Data (using best model's scaler):")
        best = st.session_state.best_model
        if best and best['status'] == 'success':
            scaler = joblib.load(best['scaler'])
            features_df = sampled_df.drop('target', axis=1)

            # MultinomialNB was trained on shifted non-negative inputs.
            if best['model_name'] == 'MultinomialNB':
                scaled = scaler.transform(self.model_manager.ensure_non_negative(features_df))
            else:
                scaled = scaler.transform(features_df)

            scaled_df = pd.DataFrame(
                scaled,
                columns=features_df.columns,
                index=features_df.index
            )
            scaled_df['target'] = sampled_df['target']
            st.write(scaled_df)
        else:
            st.write("No scaled data available (best model not found)")
|
| 873 |
+
|
| 874 |
+
def display_confusion_matrices(self):
    """Explain and render the cached confusion-matrix figure, if one exists."""
    st.subheader("Confusion Matrices")
    st.write("""
    Confusion matrices show the model's prediction performance across different classes.
    - Each row represents the actual class
    - Each column represents the predicted class
    - Diagonal elements represent correct predictions (True Positives for each class)
    - Off-diagonal elements represent incorrect predictions
    - Numbers show how many samples were classified for each combination
    - Colors range from yellow (high values) to green-blue (low values) using the viridis colormap
    """)
    fig = st.session_state.confusion_matrices_fig
    if fig is not None:
        st.pyplot(fig)
        plt.close()  # free the matplotlib figure after rendering
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
def display_performance_summary(self, successful_df):
    """Summary plot of metrics for a user-selected subset of models.

    Fix: the caller (run) passes the *unfiltered* comparison table here, so
    failed models could previously appear in the multiselect. We now filter
    to Status == 'Success' before listing choices.

    Args:
        successful_df: model-comparison DataFrame (may contain failed rows).
    """
    st.subheader("Performance Metrics Summary")

    # Defensive filter: only offer models that actually trained.
    successful_df = successful_df[successful_df['Status'] == 'Success']

    all_models = successful_df['Model'].unique().tolist()

    col1, _ = st.columns([3, 1])
    with col1:
        selected_models = st.multiselect(
            "Select models to compare",
            all_models,
            default=all_models
        )

    if not selected_models:
        st.warning("Please select at least one model to display the comparison.")
        return

    fig_summary = self.visualizer.create_performance_summary_plot(
        successful_df,
        selected_models
    )
    st.plotly_chart(fig_summary, use_container_width=True)
|
| 916 |
+
|
| 917 |
+
def display_saved_models(self):
    """List successfully saved models and offer pickle downloads for each."""
    st.subheader("Saved Models")

    saved = []
    for result in st.session_state.model_results:
        if result['status'] == 'success' and result['model_path']:
            # Re-load the persisted artifacts and serialize them for download.
            model = joblib.load(result['model_path'])
            scaler = joblib.load(result['scaler'])
            saved.append({
                'Model': result['model_name'],
                'Accuracy': result['accuracy'],
                'Model_Binary': pickle.dumps(model),
                'Scaler_Binary': pickle.dumps(scaler),
            })

    if not saved:
        st.info("No models were saved. Models are saved automatically when accuracy exceeds 0.5")
        return

    # Summary table of what is downloadable.
    summary = pd.DataFrame(
        [{'Model': m['Model'], 'Accuracy': m['Accuracy']} for m in saved]
    )
    st.dataframe(summary.style.format({'Accuracy': '{:.4f}'}))

    st.write("Download Models:")
    for entry in saved:
        model_col, scaler_col = st.columns(2)

        with model_col:
            st.download_button(
                label=f"Download {entry['Model']} Model",
                data=entry['Model_Binary'],
                file_name=f"{entry['Model']}_model.pkl",
                mime="application/octet-stream"
            )

        with scaler_col:
            st.download_button(
                label=f"Download {entry['Model']} Scaler",
                data=entry['Scaler_Binary'],
                file_name=f"{entry['Model']}_scaler.pkl",
                mime="application/octet-stream"
            )
|
| 972 |
+
|
| 973 |
+
def display_download_section(self):
    """Offer CSV downloads of the raw dataset and the best-scaler-transformed one."""
    st.subheader("Download Dataset")
    raw_col, scaled_col = st.columns(2)

    with raw_col:
        if st.session_state.df is not None:
            csv = st.session_state.df.to_csv(index=False)
            st.download_button(
                label="Download Original Dataset (CSV)",
                data=csv,
                file_name=f"synthetic_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime='text/csv',
                help="Download the original unscaled dataset"
            )

    with scaled_col:
        best = st.session_state.best_model
        if best and best['status'] == 'success':
            scaler = joblib.load(best['scaler'])
            features_df = st.session_state.df.drop('target', axis=1)

            # MultinomialNB was trained on shifted non-negative inputs.
            if best['model_name'] == 'MultinomialNB':
                features_scaled = scaler.transform(
                    self.model_manager.ensure_non_negative(features_df)
                )
            else:
                features_scaled = scaler.transform(features_df)

            scaled_df = pd.DataFrame(
                features_scaled,
                columns=features_df.columns,
                index=features_df.index
            )
            scaled_df['target'] = st.session_state.df['target']

            st.download_button(
                label="Download Scaled Dataset (CSV)",
                data=scaled_df.to_csv(index=False),
                file_name=f"synthetic_data_scaled_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime='text/csv',
                help="Download the scaled dataset (using best model's scaler)"
            )
|
| 1016 |
+
|
| 1017 |
+
def display_dataset_statistics(self):
    """Show describe() tables for the raw and (when available) scaled dataset."""
    with st.expander("Dataset Statistics"):
        raw_col, scaled_col = st.columns(2)

        with raw_col:
            st.write("Original Dataset Statistics:")
            st.write(st.session_state.df.describe())

        with scaled_col:
            best = st.session_state.best_model
            if best and best['status'] == 'success':
                st.write("Scaled Dataset Statistics:")
                scaler = joblib.load(best['scaler'])
                features_df = st.session_state.df.drop('target', axis=1)

                # MultinomialNB was trained on shifted non-negative inputs.
                if best['model_name'] == 'MultinomialNB':
                    features_scaled = scaler.transform(
                        self.model_manager.ensure_non_negative(features_df)
                    )
                else:
                    features_scaled = scaler.transform(features_df)

                scaled_df = pd.DataFrame(
                    features_scaled,
                    columns=features_df.columns,
                    index=features_df.index
                )
                scaled_df['target'] = st.session_state.df['target']
                st.write(scaled_df.describe())
|
| 1046 |
+
|
| 1047 |
+
def display_learning_curves(self):
    """Explain and render the cached learning-curves figure, if one exists."""
    st.subheader("Learning Curves")
    st.write("""
    Learning curves show how model performance changes with increasing training data.
    - Blue line: Training score
    - Orange line: Cross-validation score
    - Shaded areas represent standard deviation
    """)
    fig = st.session_state.learning_curves_fig
    if fig is not None:
        st.pyplot(fig)
        plt.close()  # free the matplotlib figure after rendering
|
| 1060 |
+
|
| 1061 |
+
def display_feature_visualization(self):
    """Interactive 2D/3D scatter of user-chosen features, colored by class.

    Fix: guard against datasets with too few features — previously a 2-feature
    dataset in 3D mode produced an empty selectbox (returning None) and a
    Plotly crash. Behavior is unchanged for datasets with enough features.
    """
    st.subheader("Feature Visualization")
    features = st.session_state.features
    plot_type = st.radio("Select plot type", ["2D Plot", "3D Plot"], index=1)

    if plot_type == "2D Plot":
        if len(features) < 2:
            st.warning("At least two features are required for a 2D plot.")
            return

        col1, col2 = st.columns(2)
        with col1:
            x_feature = st.selectbox(
                "Select X-axis feature",
                features,
                index=0,
                key='x_2d'
            )
        with col2:
            # Exclude the X choice so the axes are always distinct.
            y_options = [f for f in features if f != x_feature]
            y_feature = st.selectbox(
                "Select Y-axis feature",
                y_options,
                index=0,
                key='y_2d'
            )

        fig = px.scatter(
            st.session_state.df,
            x=x_feature,
            y=y_feature,
            color='target',
            title=f"2D Visualization of {x_feature} vs {y_feature}",
            labels={'target': 'Class'}
        )
        st.plotly_chart(fig, use_container_width=True)

    else:  # 3D Plot
        if len(features) < 3:
            st.warning("At least three features are required for a 3D plot.")
            return

        col1, col2, col3 = st.columns(3)
        with col1:
            x_feature = st.selectbox(
                "Select X-axis feature",
                features,
                index=0,
                key='x_3d'
            )
        with col2:
            y_options = [f for f in features if f != x_feature]
            y_feature = st.selectbox(
                "Select Y-axis feature",
                y_options,
                index=0,
                key='y_3d'
            )
        with col3:
            z_options = [f for f in features if f not in (x_feature, y_feature)]
            z_feature = st.selectbox(
                "Select Z-axis feature",
                z_options,
                index=0,
                key='z_3d'
            )

        fig = px.scatter_3d(
            st.session_state.df,
            x=x_feature,
            y=y_feature,
            z=z_feature,
            color='target',
            title=f"3D Visualization of {x_feature} vs {y_feature} vs {z_feature}",
            labels={'target': 'Class'}
        )
        fig.update_layout(
            scene=dict(
                xaxis_title=x_feature,
                yaxis_title=y_feature,
                zaxis_title=z_feature
            ),
            scene_camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        )
        st.plotly_chart(fig, use_container_width=True)
|
| 1150 |
+
|
| 1151 |
+
def get_data_source(self):
    """Ask whether to synthesize data or upload a CSV; cache and return the choice.

    Returns:
        str: 'synthetic' or 'upload'.
    """
    st.sidebar.header("Data Source")
    choice = st.sidebar.radio(
        "Choose data source",
        ['Generate Synthetic Data', 'Upload Dataset'],
        key='data_source_radio'
    )
    st.session_state.data_source = (
        'synthetic' if choice == 'Generate Synthetic Data' else 'upload'
    )
    return st.session_state.data_source
|
| 1161 |
+
|
| 1162 |
+
def upload_dataset(self):
    """Load a user-supplied CSV, pick a target column, and build a train/test split.

    Fix: reject empty files and files with fewer than two columns up front,
    with a clear sidebar message, instead of failing later inside
    train_test_split with an opaque error.

    Returns:
        bool: True when a dataset was successfully loaded and split.
    """
    st.sidebar.header("Upload Dataset")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload a CSV file with features and target column"
    )

    if uploaded_file is None:
        return False

    try:
        df = pd.read_csv(uploaded_file)

        # Basic validation before offering any configuration widgets.
        if df.empty:
            st.sidebar.error("The uploaded file contains no rows.")
            return False
        if len(df.columns) < 2:
            st.sidebar.error("The dataset needs at least one feature column and one target column.")
            return False

        # Let user select target column
        target_col = st.sidebar.selectbox(
            "Select target column",
            df.columns.tolist()
        )

        # Everything except the target is treated as a feature.
        features = [col for col in df.columns if col != target_col]
        X = df[features]
        y = df[target_col]

        st.session_state.df = df
        st.session_state.features = features

        test_size = st.sidebar.slider(
            "Test Size",
            min_value=10,
            max_value=50,
            value=30,
            step=5,
            format="%d%%",
            help="Percentage of data to use for testing"
        )

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size / 100,
            random_state=42
        )

        st.session_state.X_train = X_train
        st.session_state.X_test = X_test
        st.session_state.y_train = y_train
        st.session_state.y_test = y_test

        st.session_state.split_info = {
            'total_samples': len(X),
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'test_percentage': test_size
        }

        return True
    except Exception as e:
        # Surface parse/split failures to the user rather than crashing the app.
        st.sidebar.error(f"Error loading dataset: {str(e)}")
        return False
|
| 1226 |
+
|
| 1227 |
+
def run(self):
    """Main application logic"""
    # Page config and session-state defaults must be set before any widgets render.
    self.setup_page_config()
    self.initialize_session_state()

    st.title("ML Model Generator")

    # Get data source choice ('synthetic' or 'upload') from the sidebar.
    data_source = self.get_data_source()

    if data_source == 'synthetic':
        st.sidebar.header("Synthetic Data Generation")
        # Get inputs from sidebar for synthetic data
        features, feature_configs, classes = self.get_sidebar_inputs()
        class_configs = self.get_class_configs(classes, features)
        n_samples, test_size = self.get_training_params()

        # Store features in session state so display helpers can find them.
        st.session_state.features = features

        # Generate Data button: generation + training + caching happens in one call.
        if st.sidebar.button("Generate Data and Train Models"):
            self.generate_and_train(n_samples, feature_configs, classes, class_configs, test_size)

    else:  # upload
        # Handle dataset upload; upload_dataset() stores the split in session state.
        if self.upload_dataset():
            if st.sidebar.button("Train Models"):
                # Get classifiers and train models on the uploaded split.
                classifiers = self.model_manager.get_classifiers()
                results = []

                with st.spinner('Training models... Please wait.'):
                    progress_bar = st.progress(0)
                    for idx, (name, clf_dict) in enumerate(classifiers.items()):
                        result = self.model_manager.train_and_evaluate_model(
                            clf_dict,
                            st.session_state.X_train,
                            st.session_state.X_test,
                            st.session_state.y_train,
                            st.session_state.y_test,
                            name
                        )
                        results.append(result)
                        progress_bar.progress((idx + 1) / len(classifiers))

                st.session_state.model_results = results
                st.session_state.data_generated = True

                # Find best model among those that trained successfully.
                successful_results = [r for r in results if r['status'] == 'success']
                if successful_results:
                    best_model = max(successful_results, key=lambda x: x['accuracy'])
                    st.session_state.best_model = best_model

                    # Generate static visualizations once, cached for reruns.
                    st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
                        successful_results,
                        st.session_state.y_test
                    )
                    st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)

    # Display results if data has been generated/uploaded and trained
    if st.session_state.data_generated:
        self.display_dataset_info()
        self.display_data_samples()
        self.display_feature_visualization()
        self.display_download_section()
        self.display_dataset_statistics()
        self.display_best_model_performance()
        successful_df = self.display_model_comparison()

        # NOTE(review): successful_df is the full comparison table (including
        # failed rows) — confirm display_performance_summary tolerates that.
        if successful_df is not None and not successful_df.empty:
            self.display_performance_summary(successful_df)
            self.display_saved_models()
            self.display_learning_curves()
            self.display_confusion_matrices()
    else:
        if data_source == 'synthetic':
            st.info("Please generate data using the sidebar button to view visualizations and results.")
        else:
            st.info("Please upload a dataset and click 'Train Models' to view visualizations and results.")
|
| 1309 |
+
|
| 1310 |
+
def main():
    """Entry point: construct the Streamlit UI and hand control to it."""
    StreamlitUI().run()

if __name__ == "__main__":
    main()
|
| 1316 |
+
|
README.md
CHANGED
|
@@ -1,13 +1,102 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Synthetic Data Generation and ML Model Training
|
| 2 |
+
|
| 3 |
+
A comprehensive Streamlit application for generating synthetic data, training machine learning models, and educational visualization of algorithm performance.
|
| 4 |
+
|
| 5 |
+
## Live Demo
|
| 6 |
+
|
| 7 |
+
**[Try the application online!](https://projectsyntheticdatageneration.streamlit.app/)**
|
| 8 |
+
|
| 9 |
+
## Overview
|
| 10 |
+
|
| 11 |
+
This application provides an end-to-end platform for:
|
| 12 |
+
1. Generating customizable synthetic datasets
|
| 13 |
+
2. Training and evaluating multiple machine learning classifiers
|
| 14 |
+
3. Visualizing model performance and data characteristics
|
| 15 |
+
4. Learning about different ML algorithms through interactive education
|
| 16 |
+
5. Implementing and testing trained models
|
| 17 |
+
|
| 18 |
+
## Features
|
| 19 |
+
|
| 20 |
+
### Main App (`App.py`)
|
| 21 |
+
- Synthetic data generation with customizable feature distributions
|
| 22 |
+
- Support for multiple classifier algorithms with automatic preprocessing
|
| 23 |
+
- Real-time visualization of model performance metrics
|
| 24 |
+
- Model comparison and selection
|
| 25 |
+
- Dataset exploration and visualization tools
|
| 26 |
+
- Model saving and exporting functionality
|
| 27 |
+
|
| 28 |
+
### Algorithm Education (`pages/02_Algorithm_Education.py`)
|
| 29 |
+
- Detailed explanations of various ML classification algorithms
|
| 30 |
+
- Interactive demonstrations with customizable parameters
|
| 31 |
+
- Mathematical foundations and implementation details
|
| 32 |
+
- Algorithm strengths, limitations, and use cases
|
| 33 |
+
- Performance visualization across different data distributions
|
| 34 |
+
|
| 35 |
+
### Model Implementation (`pages/03_Model_implementation.py`)
|
| 36 |
+
- Upload and use previously trained models
|
| 37 |
+
- Real-time prediction with custom input values
|
| 38 |
+
- Model and scaler integration
|
| 39 |
+
|
| 40 |
+
## Installation
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
# Clone the repository
|
| 44 |
+
git clone https://github.com/yourusername/synthetic_data_generation.git
|
| 45 |
+
cd synthetic_data_generation
|
| 46 |
+
|
| 47 |
+
# Install dependencies
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
|
| 50 |
+
# Run the application
|
| 51 |
+
streamlit run App.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Requirements
|
| 55 |
+
|
| 56 |
+
- Python 3.7+
|
| 57 |
+
- streamlit>=1.28.0
|
| 58 |
+
- numpy>=1.24.0
|
| 59 |
+
- pandas>=2.0.0
|
| 60 |
+
- scikit-learn>=1.2.0
|
| 61 |
+
- plotly>=5.13.0
|
| 62 |
+
- seaborn>=0.12.0
|
| 63 |
+
- matplotlib>=3.7.0
|
| 64 |
+
- joblib>=1.2.0
|
| 65 |
+
|
| 66 |
+
## Usage
|
| 67 |
+
|
| 68 |
+
### Generating Synthetic Data
|
| 69 |
+
1. Define features and their distributions
|
| 70 |
+
2. Configure class characteristics
|
| 71 |
+
3. Set sample size and other generation parameters
|
| 72 |
+
4. Generate and explore your synthetic dataset
|
| 73 |
+
|
| 74 |
+
### Training Models
|
| 75 |
+
1. Select classifier algorithms to evaluate
|
| 76 |
+
2. Configure training parameters (test split, etc.)
|
| 77 |
+
3. Train models and view performance metrics
|
| 78 |
+
4. Compare model results through interactive visualizations
|
| 79 |
+
|
| 80 |
+
### Educational Resources
|
| 81 |
+
1. Navigate to the Algorithm Education page
|
| 82 |
+
2. Select an algorithm to learn about
|
| 83 |
+
3. Interact with the demo to see how parameters affect performance
|
| 84 |
+
4. Examine mathematical foundations and implementation details
|
| 85 |
+
|
| 86 |
+
### Model Implementation
|
| 87 |
+
1. Upload previously saved model and scaler files
|
| 88 |
+
2. Input feature values or generate random test values
|
| 89 |
+
3. Make predictions and view results
|
| 90 |
+
|
| 91 |
+
## Project Structure
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
synthetic_data_generation/
|
| 95 |
+
├── App.py # Main application
|
| 96 |
+
├── models/ # Directory for saved models
|
| 97 |
+
├── pages/ # Additional application pages
|
| 98 |
+
│ ├── 02_Algorithm_Education.py # Educational content about ML algorithms
|
| 99 |
+
│ └── 03_Model_implementation.py # Model deployment and usage interface
|
| 100 |
+
├── temp_uploads/ # Temporary directory for file uploads
|
| 101 |
+
└── requirements.txt # Project dependencies
|
| 102 |
+
```
|
pages/02_Algorithm_Education.py
ADDED
|
@@ -0,0 +1,1250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.naive_bayes import GaussianNB
|
| 4 |
+
from sklearn.svm import LinearSVC, SVC
|
| 5 |
+
from sklearn.neural_network import MLPClassifier
|
| 6 |
+
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
|
| 7 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 8 |
+
from sklearn.linear_model import RidgeClassifier
|
| 9 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 10 |
+
from sklearn.ensemble import AdaBoostClassifier
|
| 11 |
+
from sklearn.metrics import accuracy_score, confusion_matrix
|
| 12 |
+
from sklearn.model_selection import train_test_split, learning_curve
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import seaborn as sns
|
| 15 |
+
|
| 16 |
+
def setup_page_config():
    """Apply the Streamlit page metadata: title, icon, and wide layout.

    Must run before any other Streamlit call on the page, per the
    st.set_page_config contract.
    """
    # Gather the options in one mapping so the configuration reads as data.
    page_options = {
        "page_title": "Algorithm Education",
        "page_icon": "🤖",
        "layout": "wide",
    }
    st.set_page_config(**page_options)
|
| 23 |
+
|
| 24 |
+
def page_introduction():
    """Render the page header and the introductory guide text.

    Emits the page title followed by a single markdown body describing
    what the guide contains, how to use it, which algorithms are covered,
    and why algorithm selection matters.
    """
    st.title("Machine Learning Algorithm Education 🎓")

    # Keep the whole introduction in one literal so the rendered markdown
    # is a single block; st.markdown receives it unchanged.
    intro_text = """
    Welcome to the Algorithm Education page! This interactive guide helps you understand various machine learning
    algorithms used in classification tasks. Each algorithm is explained in detail with:

    - 📝 Clear descriptions and explanations
    - ✅ Advantages and limitations
    - 🎯 Practical use cases
    - 📊 Mathematical foundations
    - 💻 Implementation examples
    - 🔬 Interactive demonstrations
    - 📚 Academic references

    ### How to Use This Guide
    1. Select an algorithm from the dropdown menu below
    2. Explore its characteristics and implementation details
    3. Try the interactive demo with different datasets
    4. Compare performance metrics and visualizations

    ### Available Algorithms
    This guide covers popular classification algorithms including:
    - Naive Bayes variants
    - Support Vector Machines
    - Neural Networks
    - Tree-based methods
    - Nearest Neighbors
    - Linear Classifiers
    - Ensemble Methods

    ### Why Understanding Algorithms Matters
    Choosing the right algorithm for your machine learning task is crucial for:
    - Achieving optimal performance
    - Efficient resource utilization
    - Meeting specific problem constraints
    - Understanding model behavior and limitations
    """
    st.markdown(intro_text)
|
| 63 |
+
|
| 64 |
+
def algorithm_info():
|
| 65 |
+
"""Display detailed algorithm information"""
|
| 66 |
+
# First show the introduction
|
| 67 |
+
page_introduction()
|
| 68 |
+
|
| 69 |
+
algorithms = {
|
| 70 |
+
"Gaussian Naive Bayes (GaussianNB)": {
|
| 71 |
+
"description": """
|
| 72 |
+
A probabilistic classifier based on Bayes' theorem with strong independence assumptions between features.
|
| 73 |
+
Assumes features follow a Gaussian (normal) distribution.
|
| 74 |
+
""",
|
| 75 |
+
"pros": [
|
| 76 |
+
"Simple and fast",
|
| 77 |
+
"Works well with small datasets",
|
| 78 |
+
"Good for high-dimensional data",
|
| 79 |
+
"Performs well when features are normally distributed"
|
| 80 |
+
],
|
| 81 |
+
"cons": [
|
| 82 |
+
"Assumes feature independence (often unrealistic)",
|
| 83 |
+
"Limited by Gaussian distribution assumption",
|
| 84 |
+
"May underperform when features are highly correlated"
|
| 85 |
+
],
|
| 86 |
+
"use_cases": [
|
| 87 |
+
"Text classification",
|
| 88 |
+
"Spam detection",
|
| 89 |
+
"Medical diagnosis",
|
| 90 |
+
"Real-time prediction scenarios"
|
| 91 |
+
],
|
| 92 |
+
"math_details": {
|
| 93 |
+
"main_formula": r"""
|
| 94 |
+
P(y|x_1,...,x_n) = \frac{P(y)\prod_{i=1}^{n}P(x_i|y)}{P(x_1,...,x_n)}
|
| 95 |
+
""",
|
| 96 |
+
"component_formulas": [
|
| 97 |
+
{
|
| 98 |
+
"name": "Gaussian Probability Density",
|
| 99 |
+
"formula": r"""
|
| 100 |
+
P(x_i|y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\right)
|
| 101 |
+
"""
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"name": "Class Prior Probability",
|
| 105 |
+
"formula": r"""
|
| 106 |
+
P(y) = \frac{\text{number of samples in class y}}{\text{total number of samples}}
|
| 107 |
+
"""
|
| 108 |
+
}
|
| 109 |
+
],
|
| 110 |
+
"explanation": """
|
| 111 |
+
- P(y|x₁,...,xₙ) is the posterior probability of class y given features
|
| 112 |
+
- P(y) is the prior probability of class y
|
| 113 |
+
- P(xᵢ|y) is the likelihood of feature xᵢ given class y
|
| 114 |
+
- μy and σ²y are the mean and variance of features in class y
|
| 115 |
+
"""
|
| 116 |
+
},
|
| 117 |
+
"references": [
|
| 118 |
+
{
|
| 119 |
+
"title": "Naive Bayes and Text Classification",
|
| 120 |
+
"authors": "Sebastian Raschka",
|
| 121 |
+
"publication": "arXiv preprint",
|
| 122 |
+
"year": "2014",
|
| 123 |
+
"url": "https://arxiv.org/abs/1410.5329"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"title": "scikit-learn: Machine Learning in Python",
|
| 127 |
+
"authors": "Pedregosa et al.",
|
| 128 |
+
"publication": "Journal of Machine Learning Research",
|
| 129 |
+
"year": "2011",
|
| 130 |
+
"url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 134 |
+
"authors": "Showmik Setta",
|
| 135 |
+
"publication": "Medium",
|
| 136 |
+
"year": "2023",
|
| 137 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 138 |
+
}
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
"Linear Support Vector Classification (LinearSVC)": {
|
| 142 |
+
"description": """
|
| 143 |
+
A linear classifier that finds the hyperplane that best separates classes by maximizing the margin between them.
|
| 144 |
+
Optimized implementation of Support Vector Classification for linear classification.
|
| 145 |
+
""",
|
| 146 |
+
"pros": [
|
| 147 |
+
"Effective for high-dimensional spaces",
|
| 148 |
+
"Memory efficient",
|
| 149 |
+
"Faster than standard SVC with linear kernel",
|
| 150 |
+
"Works well when classes are linearly separable"
|
| 151 |
+
],
|
| 152 |
+
"cons": [
|
| 153 |
+
"Only suitable for linear classification",
|
| 154 |
+
"Sensitive to feature scaling",
|
| 155 |
+
"May struggle with overlapping classes",
|
| 156 |
+
"No probability estimates by default"
|
| 157 |
+
],
|
| 158 |
+
"use_cases": [
|
| 159 |
+
"Text classification",
|
| 160 |
+
"Image classification",
|
| 161 |
+
"Bioinformatics",
|
| 162 |
+
"High-dimensional data analysis"
|
| 163 |
+
],
|
| 164 |
+
"math_details": {
|
| 165 |
+
"main_formula": r"""
|
| 166 |
+
\min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \max(0, 1-y_i(w^Tx_i+b))
|
| 167 |
+
""",
|
| 168 |
+
"component_formulas": [
|
| 169 |
+
{
|
| 170 |
+
"name": "Decision Function",
|
| 171 |
+
"formula": r"""
|
| 172 |
+
f(x) = w^Tx + b
|
| 173 |
+
"""
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"name": "Margin Width",
|
| 177 |
+
"formula": r"""
|
| 178 |
+
\text{margin} = \frac{2}{||w||}
|
| 179 |
+
"""
|
| 180 |
+
}
|
| 181 |
+
],
|
| 182 |
+
"explanation": """
|
| 183 |
+
- w is the weight vector
|
| 184 |
+
- b is the bias term
|
| 185 |
+
- C is the regularization parameter
|
| 186 |
+
- yᵢ are the true labels (±1)
|
| 187 |
+
- xᵢ are the input features
|
| 188 |
+
"""
|
| 189 |
+
},
|
| 190 |
+
"references": [
|
| 191 |
+
{
|
| 192 |
+
"title": "A Tutorial on Support Vector Machines for Pattern Recognition",
|
| 193 |
+
"authors": "Christopher J.C. Burges",
|
| 194 |
+
"publication": "Data Mining and Knowledge Discovery",
|
| 195 |
+
"year": "1998",
|
| 196 |
+
"url": "https://link.springer.com/article/10.1023/A:1009715923555"
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"title": "Support Vector Machines",
|
| 200 |
+
"authors": "Andrew Ng",
|
| 201 |
+
"publication": "CS229 Lecture Notes, Stanford University",
|
| 202 |
+
"year": "2018",
|
| 203 |
+
"url": "http://cs229.stanford.edu/notes/cs229-notes3.pdf"
|
| 204 |
+
},
|
| 205 |
+
{
|
| 206 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 207 |
+
"authors": "Vidushi Meel",
|
| 208 |
+
"publication": "viso.ai",
|
| 209 |
+
"year": "2021",
|
| 210 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 211 |
+
}
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
"Support Vector Classification (SVC)": {
|
| 215 |
+
"description": """
|
| 216 |
+
A powerful classifier that can perform non-linear classification using different kernel functions to transform
|
| 217 |
+
the feature space. Creates an optimal hyperplane in a transformed feature space.
|
| 218 |
+
""",
|
| 219 |
+
"pros": [
|
| 220 |
+
"Effective for non-linear classification",
|
| 221 |
+
"Works well with high-dimensional data",
|
| 222 |
+
"Robust against overfitting",
|
| 223 |
+
"Versatile through different kernel functions"
|
| 224 |
+
],
|
| 225 |
+
"cons": [
|
| 226 |
+
"Computationally intensive for large datasets",
|
| 227 |
+
"Sensitive to feature scaling",
|
| 228 |
+
"Kernel selection can be challenging",
|
| 229 |
+
"Memory intensive for large datasets"
|
| 230 |
+
],
|
| 231 |
+
"use_cases": [
|
| 232 |
+
"Image classification",
|
| 233 |
+
"Handwriting recognition",
|
| 234 |
+
"Bioinformatics",
|
| 235 |
+
"Pattern recognition"
|
| 236 |
+
],
|
| 237 |
+
"math_details": {
|
| 238 |
+
"main_formula": r"""
|
| 239 |
+
\min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \xi_i
|
| 240 |
+
""",
|
| 241 |
+
"component_formulas": [
|
| 242 |
+
{
|
| 243 |
+
"name": "Kernel Function (RBF)",
|
| 244 |
+
"formula": r"""
|
| 245 |
+
K(x,x') = \exp\left(-\gamma ||x-x'||^2\right)
|
| 246 |
+
"""
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"name": "Decision Function",
|
| 250 |
+
"formula": r"""
|
| 251 |
+
f(x) = \sum_{i=1}^{n} \alpha_i y_i K(x_i,x) + b
|
| 252 |
+
"""
|
| 253 |
+
}
|
| 254 |
+
],
|
| 255 |
+
"explanation": """
|
| 256 |
+
- K(x,x') is the kernel function
|
| 257 |
+
- γ is the kernel coefficient
|
| 258 |
+
- αᵢ are the dual coefficients
|
| 259 |
+
- ξᵢ are the slack variables
|
| 260 |
+
"""
|
| 261 |
+
},
|
| 262 |
+
"references": [
|
| 263 |
+
{
|
| 264 |
+
"title": "Support Vector Networks",
|
| 265 |
+
"authors": "Cortes C., Vapnik V.",
|
| 266 |
+
"publication": "Machine Learning",
|
| 267 |
+
"year": "1995",
|
| 268 |
+
"url": "https://link.springer.com/article/10.1007/BF00994018"
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"title": "A Practical Guide to Support Vector Classification",
|
| 272 |
+
"authors": "Hsu, Chang, and Lin",
|
| 273 |
+
"publication": "BJU International",
|
| 274 |
+
"year": "2003",
|
| 275 |
+
"url": "https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf"
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 279 |
+
"authors": "Vidushi Meel",
|
| 280 |
+
"publication": "viso.ai",
|
| 281 |
+
"year": "2021",
|
| 282 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 283 |
+
}
|
| 284 |
+
]
|
| 285 |
+
},
|
| 286 |
+
"Multi-layer Perceptron (MLPClassifier)": {
|
| 287 |
+
"description": """
|
| 288 |
+
A neural network classifier that learns non-linear models by training multiple layers of nodes.
|
| 289 |
+
Each node uses a non-linear activation function to transform inputs.
|
| 290 |
+
""",
|
| 291 |
+
"pros": [
|
| 292 |
+
"Can learn highly non-linear patterns",
|
| 293 |
+
"Capable of learning complex relationships",
|
| 294 |
+
"Good generalization with proper regularization",
|
| 295 |
+
"Can handle multiple classes naturally"
|
| 296 |
+
],
|
| 297 |
+
"cons": [
|
| 298 |
+
"Requires careful hyperparameter tuning",
|
| 299 |
+
"Computationally intensive",
|
| 300 |
+
"Sensitive to feature scaling",
|
| 301 |
+
"May get stuck in local minima"
|
| 302 |
+
],
|
| 303 |
+
"use_cases": [
|
| 304 |
+
"Image recognition",
|
| 305 |
+
"Speech recognition",
|
| 306 |
+
"Complex pattern recognition",
|
| 307 |
+
"Financial prediction"
|
| 308 |
+
],
|
| 309 |
+
"math_details": {
|
| 310 |
+
"main_formula": r"""
|
| 311 |
+
h_l = \sigma(W_l h_{l-1} + b_l)
|
| 312 |
+
""",
|
| 313 |
+
"component_formulas": [
|
| 314 |
+
{
|
| 315 |
+
"name": "ReLU Activation",
|
| 316 |
+
"formula": r"""
|
| 317 |
+
\sigma(x) = \max(0,x)
|
| 318 |
+
"""
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"name": "Softmax Output",
|
| 322 |
+
"formula": r"""
|
| 323 |
+
P(y=j|x) = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}
|
| 324 |
+
"""
|
| 325 |
+
}
|
| 326 |
+
],
|
| 327 |
+
"explanation": """
|
| 328 |
+
- hₗ is the output of layer l
|
| 329 |
+
- Wₗ is the weight matrix for layer l
|
| 330 |
+
- bₗ is the bias vector for layer l
|
| 331 |
+
- σ is the activation function
|
| 332 |
+
"""
|
| 333 |
+
},
|
| 334 |
+
"references": [
|
| 335 |
+
{
|
| 336 |
+
"title": "Learning representations by back-propagating errors",
|
| 337 |
+
"authors": "Rumelhart, D. E., Hinton, G. E., & Williams, R. J.",
|
| 338 |
+
"publication": "Nature",
|
| 339 |
+
"year": "1986",
|
| 340 |
+
"url": "https://www.nature.com/articles/323533a0"
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"title": "Gradient-based learning applied to document recognition",
|
| 344 |
+
"authors": "LeCun Y., Bottou L., Bengio Y., & Haffner P.",
|
| 345 |
+
"publication": "Proceedings of the IEEE",
|
| 346 |
+
"year": "1998",
|
| 347 |
+
"url": "https://ieeexplore.ieee.org/document/726791"
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 351 |
+
"authors": "Showmik Setta",
|
| 352 |
+
"publication": "Medium",
|
| 353 |
+
"year": "2023",
|
| 354 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 355 |
+
}
|
| 356 |
+
]
|
| 357 |
+
},
|
| 358 |
+
"Extra Trees Classifier": {
|
| 359 |
+
"description": """
|
| 360 |
+
An ensemble method that builds multiple randomized decision trees and averages their predictions.
|
| 361 |
+
Similar to Random Forest but with additional randomization in the tree-building process.
|
| 362 |
+
""",
|
| 363 |
+
"pros": [
|
| 364 |
+
"Lower variance than Random Forest",
|
| 365 |
+
"Faster training than Random Forest",
|
| 366 |
+
"Good at handling high-dimensional data",
|
| 367 |
+
"Less prone to overfitting"
|
| 368 |
+
],
|
| 369 |
+
"cons": [
|
| 370 |
+
"May have slightly lower accuracy than Random Forest",
|
| 371 |
+
"Can be memory intensive",
|
| 372 |
+
"Less interpretable than single decision trees",
|
| 373 |
+
"May require more trees than Random Forest"
|
| 374 |
+
],
|
| 375 |
+
"use_cases": [
|
| 376 |
+
"Feature selection",
|
| 377 |
+
"Large dataset classification",
|
| 378 |
+
"Remote sensing",
|
| 379 |
+
"Biomedical classification"
|
| 380 |
+
],
|
| 381 |
+
"math_details": {
|
| 382 |
+
"main_formula": r"""
|
| 383 |
+
\hat{f}_{et}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
|
| 384 |
+
""",
|
| 385 |
+
"component_formulas": [
|
| 386 |
+
{
|
| 387 |
+
"name": "Random Split Selection",
|
| 388 |
+
"formula": r"""
|
| 389 |
+
\text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
|
| 390 |
+
"""
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"name": "Entropy",
|
| 394 |
+
"formula": r"""
|
| 395 |
+
H(D) = -\sum_{k=1}^K p_k\log(p_k)
|
| 396 |
+
"""
|
| 397 |
+
}
|
| 398 |
+
],
|
| 399 |
+
"explanation": """
|
| 400 |
+
- B is the number of trees
|
| 401 |
+
- fᵦ is the prediction of the b-th tree
|
| 402 |
+
- Dₗ and Dᵣ are left and right splits
|
| 403 |
+
- pₖ is the proportion of class k in the node
|
| 404 |
+
"""
|
| 405 |
+
},
|
| 406 |
+
"references": [
|
| 407 |
+
{
|
| 408 |
+
"title": "Extremely randomized trees",
|
| 409 |
+
"authors": "Geurts P., Ernst D., & Wehenkel L.",
|
| 410 |
+
"publication": "Machine Learning",
|
| 411 |
+
"year": "2006",
|
| 412 |
+
"url": "https://link.springer.com/article/10.1007/s10994-006-6226-1"
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"title": "scikit-learn: Machine Learning in Python",
|
| 416 |
+
"authors": "Pedregosa et al.",
|
| 417 |
+
"publication": "Journal of Machine Learning Research",
|
| 418 |
+
"year": "2011",
|
| 419 |
+
"url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 423 |
+
"authors": "Showmik Setta",
|
| 424 |
+
"publication": "Medium",
|
| 425 |
+
"year": "2023",
|
| 426 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 427 |
+
}
|
| 428 |
+
]
|
| 429 |
+
},
|
| 430 |
+
"Random Forest Classifier": {
|
| 431 |
+
"description": """
|
| 432 |
+
An ensemble learning method that constructs multiple decision trees and combines their predictions.
|
| 433 |
+
Each tree is built using a random subset of features and bootstrap samples of the data.
|
| 434 |
+
""",
|
| 435 |
+
"pros": [
|
| 436 |
+
"Robust against overfitting",
|
| 437 |
+
"Handles non-linear relationships well",
|
| 438 |
+
"Provides feature importance",
|
| 439 |
+
"Works well with high-dimensional data"
|
| 440 |
+
],
|
| 441 |
+
"cons": [
|
| 442 |
+
"Can be computationally intensive",
|
| 443 |
+
"Less interpretable than single decision trees",
|
| 444 |
+
"Memory intensive for large datasets",
|
| 445 |
+
"May overfit on noisy datasets"
|
| 446 |
+
],
|
| 447 |
+
"use_cases": [
|
| 448 |
+
"Credit risk assessment",
|
| 449 |
+
"Medical diagnosis",
|
| 450 |
+
"Market prediction",
|
| 451 |
+
"Image classification"
|
| 452 |
+
],
|
| 453 |
+
"math_details": {
|
| 454 |
+
"main_formula": r"""
|
| 455 |
+
\hat{f}_{rf}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
|
| 456 |
+
""",
|
| 457 |
+
"component_formulas": [
|
| 458 |
+
{
|
| 459 |
+
"name": "Random Split Selection",
|
| 460 |
+
"formula": r"""
|
| 461 |
+
\text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
|
| 462 |
+
"""
|
| 463 |
+
},
|
| 464 |
+
{
|
| 465 |
+
"name": "Entropy",
|
| 466 |
+
"formula": r"""
|
| 467 |
+
H(D) = -\sum_{k=1}^K p_k\log(p_k)
|
| 468 |
+
"""
|
| 469 |
+
}
|
| 470 |
+
],
|
| 471 |
+
"explanation": """
|
| 472 |
+
- B is the number of trees
|
| 473 |
+
- fᵦ is the prediction of the b-th tree
|
| 474 |
+
- Dₗ and Dᵣ are left and right splits
|
| 475 |
+
- pₖ is the proportion of class k in the node
|
| 476 |
+
"""
|
| 477 |
+
},
|
| 478 |
+
"references": [
|
| 479 |
+
{
|
| 480 |
+
"title": "Random Forests",
|
| 481 |
+
"authors": "Breiman L.",
|
| 482 |
+
"publication": "Machine Learning",
|
| 483 |
+
"year": "2001",
|
| 484 |
+
"url": "https://link.springer.com/article/10.1023/A:1010933404324"
|
| 485 |
+
},
|
| 486 |
+
{
|
| 487 |
+
"title": "An Introduction to Statistical Learning",
|
| 488 |
+
"authors": "James G., Witten D., Hastie T., & Tibshirani R.",
|
| 489 |
+
"publication": "Springer",
|
| 490 |
+
"year": "2013",
|
| 491 |
+
"url": "https://www.statlearning.com/"
|
| 492 |
+
},
|
| 493 |
+
{
|
| 494 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 495 |
+
"authors": "Vidushi Meel",
|
| 496 |
+
"publication": "viso.ai",
|
| 497 |
+
"year": "2021",
|
| 498 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 499 |
+
}
|
| 500 |
+
]
|
| 501 |
+
},
|
| 502 |
+
"K-Nearest Neighbors (KNeighborsClassifier)": {
|
| 503 |
+
"description": """
|
| 504 |
+
A non-parametric method that classifies a data point based on the majority class of its k nearest neighbors
|
| 505 |
+
in the feature space. Simple but effective algorithm.
|
| 506 |
+
""",
|
| 507 |
+
"pros": [
|
| 508 |
+
"Simple to understand and implement",
|
| 509 |
+
"No training phase",
|
| 510 |
+
"Naturally handles multi-class cases",
|
| 511 |
+
"Non-parametric (no assumptions about data)"
|
| 512 |
+
],
|
| 513 |
+
"cons": [
|
| 514 |
+
"Computationally intensive for large datasets",
|
| 515 |
+
"Sensitive to irrelevant features",
|
| 516 |
+
"Requires feature scaling",
|
| 517 |
+
"Memory intensive (stores all training data)"
|
| 518 |
+
],
|
| 519 |
+
"use_cases": [
|
| 520 |
+
"Recommendation systems",
|
| 521 |
+
"Pattern recognition",
|
| 522 |
+
"Data imputation",
|
| 523 |
+
"Anomaly detection"
|
| 524 |
+
],
|
| 525 |
+
"math_details": {
|
| 526 |
+
"main_formula": r"""
|
| 527 |
+
\hat{f}_{knn}(x) = \frac{1}{k}\sum_{i=1}^k y_i
|
| 528 |
+
""",
|
| 529 |
+
"component_formulas": [
|
| 530 |
+
{
|
| 531 |
+
"name": "Distance Function",
|
| 532 |
+
"formula": r"""
|
| 533 |
+
d(x,x') = \sum_{i=1}^p |x_i - x'_i|^2
|
| 534 |
+
"""
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"name": "Decision Function",
|
| 538 |
+
"formula": r"""
|
| 539 |
+
f(x) = \text{sign}\left(\sum_{i=1}^k y_i \cdot \text{weight}(d(x,x_i))\right)
|
| 540 |
+
"""
|
| 541 |
+
}
|
| 542 |
+
],
|
| 543 |
+
"explanation": """
|
| 544 |
+
- d(x,x') is the distance function
|
| 545 |
+
- xᵢ are the k nearest neighbors
|
| 546 |
+
- yᵢ are the labels of the k nearest neighbors
|
| 547 |
+
- weight(d(x,x')) is the weight function based on distance
|
| 548 |
+
"""
|
| 549 |
+
},
|
| 550 |
+
"references": [
|
| 551 |
+
{
|
| 552 |
+
"title": "Nearest Neighbor Pattern Classification",
|
| 553 |
+
"authors": "Cover T. & Hart P.",
|
| 554 |
+
"publication": "IEEE Transactions on Information Theory",
|
| 555 |
+
"year": "1967",
|
| 556 |
+
"url": "https://ieeexplore.ieee.org/document/1053964"
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"title": "A Survey of Nearest Neighbor Techniques",
|
| 560 |
+
"authors": "Bhatia N. & Vandana",
|
| 561 |
+
"publication": "International Journal of Computer Science and Information Security",
|
| 562 |
+
"year": "2010",
|
| 563 |
+
"url": "https://arxiv.org/abs/1007.0085"
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 567 |
+
"authors": "Vidushi Meel",
|
| 568 |
+
"publication": "viso.ai",
|
| 569 |
+
"year": "2021",
|
| 570 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 571 |
+
}
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
"Ridge Classifier": {
|
| 575 |
+
"description": """
|
| 576 |
+
A linear classifier that uses L2 regularization to prevent overfitting. Similar to logistic regression
|
| 577 |
+
but with different loss function and regularization.
|
| 578 |
+
""",
|
| 579 |
+
"pros": [
|
| 580 |
+
"Good for multicollinear data",
|
| 581 |
+
"Less prone to overfitting",
|
| 582 |
+
"Computationally efficient",
|
| 583 |
+
"Works well with many features"
|
| 584 |
+
],
|
| 585 |
+
"cons": [
|
| 586 |
+
"Only for linear classification",
|
| 587 |
+
"May underfit complex patterns",
|
| 588 |
+
"Sensitive to feature scaling",
|
| 589 |
+
"No probability estimates"
|
| 590 |
+
],
|
| 591 |
+
"use_cases": [
|
| 592 |
+
"High-dimensional data classification",
|
| 593 |
+
"Text classification",
|
| 594 |
+
"Gene expression analysis",
|
| 595 |
+
"Simple binary classification"
|
| 596 |
+
],
|
| 597 |
+
"math_details": {
|
| 598 |
+
"main_formula": r"""
|
| 599 |
+
\min_{w} ||Xw - y||^2_2 + \alpha ||w||^2_2
|
| 600 |
+
""",
|
| 601 |
+
"component_formulas": [
|
| 602 |
+
{
|
| 603 |
+
"name": "Decision Function",
|
| 604 |
+
"formula": r"""
|
| 605 |
+
f(x) = w^Tx
|
| 606 |
+
"""
|
| 607 |
+
},
|
| 608 |
+
{
|
| 609 |
+
"name": "L2 Penalty",
|
| 610 |
+
"formula": r"""
|
| 611 |
+
\text{penalty} = \alpha ||w||^2_2 = \alpha \sum_{j=1}^p w_j^2
|
| 612 |
+
"""
|
| 613 |
+
}
|
| 614 |
+
],
|
| 615 |
+
"explanation": """
|
| 616 |
+
- w is the weight vector
|
| 617 |
+
- α is the regularization strength
|
| 618 |
+
- X is the feature matrix
|
| 619 |
+
- y is the target vector
|
| 620 |
+
- p is the number of features
|
| 621 |
+
"""
|
| 622 |
+
},
|
| 623 |
+
"references": [
|
| 624 |
+
{
|
| 625 |
+
"title": "Ridge Regression: Biased Estimation for Nonorthogonal Problems",
|
| 626 |
+
"authors": "Hoerl A.E. & Kennard R.W.",
|
| 627 |
+
"publication": "Technometrics",
|
| 628 |
+
"year": "1970",
|
| 629 |
+
"url": "https://www.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634"
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"title": "The Elements of Statistical Learning",
|
| 633 |
+
"authors": "Hastie T., Tibshirani R., & Friedman J.",
|
| 634 |
+
"publication": "Springer",
|
| 635 |
+
"year": "2009",
|
| 636 |
+
"url": "https://web.stanford.edu/~hastie/ElemStatLearn/"
|
| 637 |
+
},
|
| 638 |
+
{
|
| 639 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 640 |
+
"authors": "Showmik Setta",
|
| 641 |
+
"publication": "Medium",
|
| 642 |
+
"year": "2023",
|
| 643 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 644 |
+
}
|
| 645 |
+
]
|
| 646 |
+
},
|
| 647 |
+
"Multinomial Naive Bayes": {
|
| 648 |
+
"description": """
|
| 649 |
+
A specialized version of Naive Bayes for multinomially distributed data. Commonly used for text
|
| 650 |
+
classification with word counts.
|
| 651 |
+
""",
|
| 652 |
+
"pros": [
|
| 653 |
+
"Fast training and prediction",
|
| 654 |
+
"Works well with high-dimensional data",
|
| 655 |
+
"Good for text classification",
|
| 656 |
+
"Handles multiple classes well"
|
| 657 |
+
],
|
| 658 |
+
"cons": [
|
| 659 |
+
"Assumes feature independence",
|
| 660 |
+
"Requires non-negative features",
|
| 661 |
+
"Sensitive to feature distribution",
|
| 662 |
+
"May underperform with continuous data"
|
| 663 |
+
],
|
| 664 |
+
"use_cases": [
|
| 665 |
+
"Document classification",
|
| 666 |
+
"Spam detection",
|
| 667 |
+
"Language detection",
|
| 668 |
+
"Topic modeling"
|
| 669 |
+
],
|
| 670 |
+
"math_details": {
|
| 671 |
+
"main_formula": r"""
|
| 672 |
+
P(y|x) = \frac{P(y)\prod_{i=1}^n P(x_i|y)}{\sum_{k} P(y_k)\prod_{i=1}^n P(x_i|y_k)}
|
| 673 |
+
""",
|
| 674 |
+
"component_formulas": [
|
| 675 |
+
{
|
| 676 |
+
"name": "Feature Probability",
|
| 677 |
+
"formula": r"""
|
| 678 |
+
P(x_i|y) = \frac{N_{yi} + \alpha}{N_y + \alpha n}
|
| 679 |
+
"""
|
| 680 |
+
},
|
| 681 |
+
{
|
| 682 |
+
"name": "Log Probability",
|
| 683 |
+
"formula": r"""
|
| 684 |
+
\log P(y|x) = \log P(y) + \sum_{i=1}^n \log P(x_i|y)
|
| 685 |
+
"""
|
| 686 |
+
}
|
| 687 |
+
],
|
| 688 |
+
"explanation": """
|
| 689 |
+
- Nyᵢ is the count of feature i in class y
|
| 690 |
+
- Ny is the total count of all features in class y
|
| 691 |
+
- α is the smoothing parameter
|
| 692 |
+
- n is the number of features
|
| 693 |
+
"""
|
| 694 |
+
},
|
| 695 |
+
"references": [
|
| 696 |
+
{
|
| 697 |
+
"title": "A comparison of event models for naive Bayes text classification",
|
| 698 |
+
"authors": "McCallum A. & Nigam K.",
|
| 699 |
+
"publication": "AAAI-98 Workshop on Learning for Text Categorization",
|
| 700 |
+
"year": "1998",
|
| 701 |
+
"url": "https://www.cs.cmu.edu/~knigam/papers/multinomial-aaaiws98.pdf"
|
| 702 |
+
},
|
| 703 |
+
{
|
| 704 |
+
"title": "An empirical study of the naive Bayes classifier",
|
| 705 |
+
"authors": "Rish I.",
|
| 706 |
+
"publication": "IJCAI 2001 Workshop on Empirical Methods in Artificial Intelligence",
|
| 707 |
+
"year": "2001",
|
| 708 |
+
"url": "https://www.researchgate.net/publication/228845263_An_Empirical_Study_of_the_Naive_Bayes_Classifier"
|
| 709 |
+
},
|
| 710 |
+
{
|
| 711 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 712 |
+
"authors": "Showmik Setta",
|
| 713 |
+
"publication": "Medium",
|
| 714 |
+
"year": "2023",
|
| 715 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 716 |
+
}
|
| 717 |
+
]
|
| 718 |
+
},
|
| 719 |
+
"AdaBoost Classifier": {
|
| 720 |
+
"description": """
|
| 721 |
+
An ensemble method that builds a strong classifier by iteratively adding weak learners, focusing on
|
| 722 |
+
previously misclassified examples.
|
| 723 |
+
""",
|
| 724 |
+
"pros": [
|
| 725 |
+
"Good generalization",
|
| 726 |
+
"Less prone to overfitting",
|
| 727 |
+
"Can identify hard-to-classify instances",
|
| 728 |
+
"Works well with weak learners"
|
| 729 |
+
],
|
| 730 |
+
"cons": [
|
| 731 |
+
"Sensitive to noisy data and outliers",
|
| 732 |
+
"Sequential nature (can't parallelize)",
|
| 733 |
+
"Can be computationally intensive",
|
| 734 |
+
"May require careful tuning"
|
| 735 |
+
],
|
| 736 |
+
"use_cases": [
|
| 737 |
+
"Face detection",
|
| 738 |
+
"Object recognition",
|
| 739 |
+
"Medical diagnosis",
|
| 740 |
+
"Fraud detection"
|
| 741 |
+
],
|
| 742 |
+
"math_details": {
|
| 743 |
+
"main_formula": r"""
|
| 744 |
+
F(x) = \text{sign}\left(\sum_{t=1}^T \alpha_t h_t(x)\right)
|
| 745 |
+
""",
|
| 746 |
+
"component_formulas": [
|
| 747 |
+
{
|
| 748 |
+
"name": "Weak Learner Weight",
|
| 749 |
+
"formula": r"""
|
| 750 |
+
\alpha_t = \frac{1}{2}\ln\left(\frac{1-\epsilon_t}{\epsilon_t}\right)
|
| 751 |
+
"""
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"name": "Sample Weight Update",
|
| 755 |
+
"formula": r"""
|
| 756 |
+
w_{i,t+1} = w_{i,t}\exp(-y_i\alpha_th_t(x_i))
|
| 757 |
+
"""
|
| 758 |
+
}
|
| 759 |
+
],
|
| 760 |
+
"explanation": """
|
| 761 |
+
- hₜ(x) is the weak learner prediction
|
| 762 |
+
- αₜ is the weight of weak learner t
|
| 763 |
+
- εₜ is the weighted error rate
|
| 764 |
+
- wᵢ,ₜ is the weight of sample i at iteration t
|
| 765 |
+
"""
|
| 766 |
+
},
|
| 767 |
+
"references": [
|
| 768 |
+
{
|
| 769 |
+
"title": "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting",
|
| 770 |
+
"authors": "Freund Y. & Schapire R.E.",
|
| 771 |
+
"publication": "Journal of Computer and System Sciences",
|
| 772 |
+
"year": "1997",
|
| 773 |
+
"url": "https://www.sciencedirect.com/science/article/pii/S002200009791504X"
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"title": "Experiments with a New Boosting Algorithm",
|
| 777 |
+
"authors": "Freund Y. & Schapire R.E.",
|
| 778 |
+
"publication": "International Conference on Machine Learning",
|
| 779 |
+
"year": "1996",
|
| 780 |
+
"url": "https://icml.cc/Conferences/1996/papers/boosting.pdf"
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 784 |
+
"authors": "Vidushi Meel",
|
| 785 |
+
"publication": "viso.ai",
|
| 786 |
+
"year": "2021",
|
| 787 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 788 |
+
}
|
| 789 |
+
]
|
| 790 |
+
}
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
# Add implementation details to each algorithm
|
| 794 |
+
for algo_name in algorithms:
|
| 795 |
+
algorithms[algo_name]["implementation"] = {
|
| 796 |
+
"Gaussian Naive Bayes (GaussianNB)": {
|
| 797 |
+
"code": """
|
| 798 |
+
from sklearn.naive_bayes import GaussianNB
|
| 799 |
+
from sklearn.datasets import make_classification
|
| 800 |
+
|
| 801 |
+
# Create sample dataset
|
| 802 |
+
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2)
|
| 803 |
+
|
| 804 |
+
# Initialize and train the model
|
| 805 |
+
gnb = GaussianNB()
|
| 806 |
+
gnb.fit(X, y)
|
| 807 |
+
|
| 808 |
+
# Make predictions
|
| 809 |
+
y_pred = gnb.predict(X)
|
| 810 |
+
""",
|
| 811 |
+
"key_parameters": {
|
| 812 |
+
"var_smoothing": "Portion of the largest variance of all features that is added to variances for calculation stability",
|
| 813 |
+
"priors": "Prior probabilities of the classes"
|
| 814 |
+
},
|
| 815 |
+
"tips": [
|
| 816 |
+
"Normalize features if they have very different scales",
|
| 817 |
+
"Good as a baseline model for comparison",
|
| 818 |
+
"Check feature distributions - should be roughly Gaussian"
|
| 819 |
+
]
|
| 820 |
+
},
|
| 821 |
+
"Linear Support Vector Classification (LinearSVC)": {
|
| 822 |
+
"code": """
|
| 823 |
+
from sklearn.svm import LinearSVC
|
| 824 |
+
from sklearn.preprocessing import StandardScaler
|
| 825 |
+
|
| 826 |
+
# Scale the features
|
| 827 |
+
scaler = StandardScaler()
|
| 828 |
+
X_scaled = scaler.fit_transform(X)
|
| 829 |
+
|
| 830 |
+
# Initialize and train the model
|
| 831 |
+
svc = LinearSVC(random_state=42, max_iter=1000)
|
| 832 |
+
svc.fit(X_scaled, y)
|
| 833 |
+
""",
|
| 834 |
+
"key_parameters": {
|
| 835 |
+
"C": "Regularization parameter (default=1.0)",
|
| 836 |
+
"max_iter": "Maximum iterations for convergence",
|
| 837 |
+
"dual": "Dual or primal formulation"
|
| 838 |
+
},
|
| 839 |
+
"tips": [
|
| 840 |
+
"Always scale your features",
|
| 841 |
+
"Increase max_iter if model doesn't converge",
|
| 842 |
+
"Try different C values using cross-validation"
|
| 843 |
+
]
|
| 844 |
+
},
|
| 845 |
+
"Support Vector Classification (SVC)": {
|
| 846 |
+
"code": """
|
| 847 |
+
from sklearn.svm import SVC
|
| 848 |
+
from sklearn.preprocessing import StandardScaler
|
| 849 |
+
|
| 850 |
+
# Scale the features
|
| 851 |
+
scaler = StandardScaler()
|
| 852 |
+
X_scaled = scaler.fit_transform(X)
|
| 853 |
+
|
| 854 |
+
# Initialize and train the model
|
| 855 |
+
svc = SVC(random_state=42)
|
| 856 |
+
svc.fit(X_scaled, y)
|
| 857 |
+
""",
|
| 858 |
+
"key_parameters": {
|
| 859 |
+
"C": "Regularization parameter (default=1.0)",
|
| 860 |
+
"kernel": "Kernel function used to transform the data",
|
| 861 |
+
"gamma": "Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels"
|
| 862 |
+
},
|
| 863 |
+
"tips": [
|
| 864 |
+
"Always scale your features",
|
| 865 |
+
"Try different kernels and gamma values",
|
| 866 |
+
"Increase C if model underfits",
|
| 867 |
+
"Decrease C if model overfits"
|
| 868 |
+
]
|
| 869 |
+
},
|
| 870 |
+
"Multi-layer Perceptron (MLPClassifier)": {
|
| 871 |
+
"code": """
|
| 872 |
+
from sklearn.neural_network import MLPClassifier
|
| 873 |
+
from sklearn.preprocessing import StandardScaler
|
| 874 |
+
|
| 875 |
+
# Scale the features
|
| 876 |
+
scaler = StandardScaler()
|
| 877 |
+
X_scaled = scaler.fit_transform(X)
|
| 878 |
+
|
| 879 |
+
# Initialize and train the model
|
| 880 |
+
mlp = MLPClassifier(random_state=42)
|
| 881 |
+
mlp.fit(X_scaled, y)
|
| 882 |
+
""",
|
| 883 |
+
"key_parameters": {
|
| 884 |
+
"hidden_layer_sizes": "Number of neurons in each layer",
|
| 885 |
+
"activation": "Activation function used in the hidden layers",
|
| 886 |
+
"solver": "Optimization algorithm used to train the model",
|
| 887 |
+
"alpha": "L2 regularization parameter"
|
| 888 |
+
},
|
| 889 |
+
"tips": [
|
| 890 |
+
"Always scale your features",
|
| 891 |
+
"Try different activation functions",
|
| 892 |
+
"Increase hidden_layer_sizes if model underfits",
|
| 893 |
+
"Decrease hidden_layer_sizes if model overfits"
|
| 894 |
+
]
|
| 895 |
+
},
|
| 896 |
+
"Extra Trees Classifier": {
|
| 897 |
+
"code": """
|
| 898 |
+
from sklearn.ensemble import ExtraTreesClassifier
|
| 899 |
+
from sklearn.preprocessing import StandardScaler
|
| 900 |
+
|
| 901 |
+
# Scale the features
|
| 902 |
+
scaler = StandardScaler()
|
| 903 |
+
X_scaled = scaler.fit_transform(X)
|
| 904 |
+
|
| 905 |
+
# Initialize and train the model
|
| 906 |
+
et = ExtraTreesClassifier(random_state=42)
|
| 907 |
+
et.fit(X_scaled, y)
|
| 908 |
+
""",
|
| 909 |
+
"key_parameters": {
|
| 910 |
+
"n_estimators": "Number of trees in the forest",
|
| 911 |
+
"max_depth": "Maximum depth of the trees",
|
| 912 |
+
"min_samples_split": "Minimum number of samples required to split an internal node",
|
| 913 |
+
"min_samples_leaf": "Minimum number of samples required to be at a leaf node"
|
| 914 |
+
},
|
| 915 |
+
"tips": [
|
| 916 |
+
"Always scale your features",
|
| 917 |
+
"Try different max_depth values",
|
| 918 |
+
"Increase n_estimators if model underfits",
|
| 919 |
+
"Decrease n_estimators if model overfits"
|
| 920 |
+
]
|
| 921 |
+
},
|
| 922 |
+
"Random Forest Classifier": {
|
| 923 |
+
"code": """
|
| 924 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 925 |
+
from sklearn.preprocessing import StandardScaler
|
| 926 |
+
|
| 927 |
+
# Scale the features
|
| 928 |
+
scaler = StandardScaler()
|
| 929 |
+
X_scaled = scaler.fit_transform(X)
|
| 930 |
+
|
| 931 |
+
# Initialize and train the model
|
| 932 |
+
rf = RandomForestClassifier(random_state=42)
|
| 933 |
+
rf.fit(X_scaled, y)
|
| 934 |
+
""",
|
| 935 |
+
"key_parameters": {
|
| 936 |
+
"n_estimators": "Number of trees in the forest",
|
| 937 |
+
"max_depth": "Maximum depth of the trees",
|
| 938 |
+
"min_samples_split": "Minimum number of samples required to split an internal node",
|
| 939 |
+
"min_samples_leaf": "Minimum number of samples required to be at a leaf node"
|
| 940 |
+
},
|
| 941 |
+
"tips": [
|
| 942 |
+
"Always scale your features",
|
| 943 |
+
"Try different max_depth values",
|
| 944 |
+
"Increase n_estimators if model underfits",
|
| 945 |
+
"Decrease n_estimators if model overfits"
|
| 946 |
+
]
|
| 947 |
+
},
|
| 948 |
+
"K-Nearest Neighbors (KNeighborsClassifier)": {
|
| 949 |
+
"code": """
|
| 950 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 951 |
+
from sklearn.preprocessing import StandardScaler
|
| 952 |
+
|
| 953 |
+
# Scale the features
|
| 954 |
+
scaler = StandardScaler()
|
| 955 |
+
X_scaled = scaler.fit_transform(X)
|
| 956 |
+
|
| 957 |
+
# Initialize and train the model
|
| 958 |
+
knn = KNeighborsClassifier()
|
| 959 |
+
knn.fit(X_scaled, y)
|
| 960 |
+
""",
|
| 961 |
+
"key_parameters": {
|
| 962 |
+
"n_neighbors": "Number of neighbors to use",
|
| 963 |
+
"weights": "Weight function used in prediction",
|
| 964 |
+
"algorithm": "Algorithm used to compute the nearest neighbors",
|
| 965 |
+
"leaf_size": "Maximum number of samples in each leaf"
|
| 966 |
+
},
|
| 967 |
+
"tips": [
|
| 968 |
+
"Always scale your features",
|
| 969 |
+
"Try different n_neighbors values",
|
| 970 |
+
"Increase leaf_size if model underfits",
|
| 971 |
+
"Decrease leaf_size if model overfits"
|
| 972 |
+
]
|
| 973 |
+
},
|
| 974 |
+
"Ridge Classifier": {
|
| 975 |
+
"code": """
|
| 976 |
+
from sklearn.linear_model import RidgeClassifier
|
| 977 |
+
from sklearn.preprocessing import StandardScaler
|
| 978 |
+
|
| 979 |
+
# Scale the features
|
| 980 |
+
scaler = StandardScaler()
|
| 981 |
+
X_scaled = scaler.fit_transform(X)
|
| 982 |
+
|
| 983 |
+
# Initialize and train the model
|
| 984 |
+
ridge = RidgeClassifier(random_state=42)
|
| 985 |
+
ridge.fit(X_scaled, y)
|
| 986 |
+
""",
|
| 987 |
+
"key_parameters": {
|
| 988 |
+
"alpha": "Regularization parameter (default=1.0)",
|
| 989 |
+
"solver": "Optimization algorithm used to train the model",
|
| 990 |
+
"max_iter": "Maximum number of iterations for the solver to converge"
|
| 991 |
+
},
|
| 992 |
+
"tips": [
|
| 993 |
+
"Always scale your features",
|
| 994 |
+
"Try different alpha values",
|
| 995 |
+
"Increase max_iter if model doesn't converge",
|
| 996 |
+
"Decrease max_iter if model overfits"
|
| 997 |
+
]
|
| 998 |
+
},
|
| 999 |
+
"Multinomial Naive Bayes": {
|
| 1000 |
+
"code": """
|
| 1001 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 1002 |
+
from sklearn.preprocessing import StandardScaler
|
| 1003 |
+
|
| 1004 |
+
# Scale the features
|
| 1005 |
+
scaler = StandardScaler()
|
| 1006 |
+
X_scaled = scaler.fit_transform(X)
|
| 1007 |
+
|
| 1008 |
+
# Initialize and train the model
|
| 1009 |
+
nb = MultinomialNB()
|
| 1010 |
+
nb.fit(X_scaled, y)
|
| 1011 |
+
""",
|
| 1012 |
+
"key_parameters": {
|
| 1013 |
+
"alpha": "Regularization parameter (default=1.0)",
|
| 1014 |
+
"fit_prior": "Whether to learn class prior probabilities or not",
|
| 1015 |
+
"class_prior": "Prior probabilities of the classes"
|
| 1016 |
+
},
|
| 1017 |
+
"tips": [
|
| 1018 |
+
"Always scale your features",
|
| 1019 |
+
"Try different alpha values",
|
| 1020 |
+
"Increase alpha if model underfits",
|
| 1021 |
+
"Decrease alpha if model overfits"
|
| 1022 |
+
]
|
| 1023 |
+
},
|
| 1024 |
+
"AdaBoost Classifier": {
|
| 1025 |
+
"code": """
|
| 1026 |
+
from sklearn.ensemble import AdaBoostClassifier
|
| 1027 |
+
from sklearn.preprocessing import StandardScaler
|
| 1028 |
+
|
| 1029 |
+
# Scale the features
|
| 1030 |
+
scaler = StandardScaler()
|
| 1031 |
+
X_scaled = scaler.fit_transform(X)
|
| 1032 |
+
|
| 1033 |
+
# Initialize and train the model
|
| 1034 |
+
ada = AdaBoostClassifier(random_state=42)
|
| 1035 |
+
ada.fit(X_scaled, y)
|
| 1036 |
+
""",
|
| 1037 |
+
"key_parameters": {
|
| 1038 |
+
"n_estimators": "Number of trees in the forest",
|
| 1039 |
+
"learning_rate": "Learning rate used to update the weights of the weak classifiers",
|
| 1040 |
+
"algorithm": "Optimization algorithm used to train the model"
|
| 1041 |
+
},
|
| 1042 |
+
"tips": [
|
| 1043 |
+
"Always scale your features",
|
| 1044 |
+
"Try different learning_rate values",
|
| 1045 |
+
"Increase n_estimators if model underfits",
|
| 1046 |
+
"Decrease n_estimators if model overfits"
|
| 1047 |
+
]
|
| 1048 |
+
}
|
| 1049 |
+
}.get(algo_name, {})
|
| 1050 |
+
|
| 1051 |
+
# Algorithm selector.
# NOTE(review): this block appears to be the interior of an enclosing page
# function (the `def` is above this chunk) — it reads the `algorithms` dict
# built earlier and calls `display_math_details` / `run_algorithm_demo`,
# which are defined later in this module; that is fine at runtime because
# this code only executes after the whole module has been evaluated.
selected_algo = st.selectbox(
    "Select an algorithm to learn more:",
    list(algorithms.keys())
)

# Display algorithm information for the chosen entry.
if selected_algo:
    st.header(selected_algo)

    # Description
    st.subheader("Description")
    st.write(algorithms[selected_algo]["description"])

    # Two-column layout for pros and cons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Advantages")
        for pro in algorithms[selected_algo]["pros"]:
            st.markdown(f"✅ {pro}")

    with col2:
        st.subheader("Disadvantages")
        for con in algorithms[selected_algo]["cons"]:
            st.markdown(f"⚠️ {con}")

    # Use cases
    st.subheader("Common Use Cases")
    for use_case in algorithms[selected_algo]["use_cases"]:
        st.markdown(f"🎯 {use_case}")

    # Add mathematical details section (LaTeX formulas + explanations)
    st.markdown("---")
    display_math_details(algorithms[selected_algo])

    # Add visual separator
    st.markdown("---")

    # Implementation section — only shown when the lookup above attached a
    # non-empty "implementation" payload for this algorithm.
    if "implementation" in algorithms[selected_algo]:
        st.subheader("Implementation Example")

        # Code example
        st.code(algorithms[selected_algo]["implementation"]["code"], language="python")

        # Key Parameters
        st.subheader("Key Parameters")
        for param, desc in algorithms[selected_algo]["implementation"]["key_parameters"].items():
            st.markdown(f"**`{param}`**: {desc}")

        # Implementation Tips
        st.subheader("Implementation Tips")
        for tip in algorithms[selected_algo]["implementation"]["tips"]:
            st.markdown(f"💡 {tip}")

    # Add interactive demo section: trains the selected model on a small
    # sklearn sample dataset and plots diagnostics.
    st.subheader("Interactive Demo")
    if st.checkbox("Show Interactive Demo"):
        st.write("Select dataset:")
        dataset_choice = st.selectbox(
            "Choose a sample dataset",
            ["Iris", "Breast Cancer", "Wine", "Digits"]
        )

        if st.button("Run Demo"):
            try:
                with st.spinner("Running demo..."):
                    demo_results = run_algorithm_demo(selected_algo, dataset_choice)

                    # Display results
                    st.write("Model Performance:")
                    st.write(f"Accuracy: {demo_results['accuracy']:.4f}")

                    # Show confusion matrix
                    st.write("Confusion Matrix:")
                    st.pyplot(demo_results['confusion_matrix_plot'])

                    # Show learning curve
                    st.write("Learning Curve:")
                    st.pyplot(demo_results['learning_curve_plot'])
            except Exception as e:
                # Surface any training/plotting failure in the UI instead of
                # crashing the Streamlit page.
                st.error(f"Error running demo: {str(e)}")

    # Add a references section to display in the UI
    if st.checkbox("Show References"):
        st.subheader("Academic References")
        if "references" in algorithms[selected_algo]:
            for ref in algorithms[selected_algo]["references"]:
                st.markdown(f"**{ref['title']}**")
                st.markdown(f"*{ref['authors']}* ({ref['year']})")
                st.markdown(f"Published in: {ref['publication']}")
                st.markdown(f"[Link to Publication]({ref['url']})")
                st.markdown("---")
        else:
            st.write("No references available for this algorithm.")
|
| 1147 |
+
|
| 1148 |
+
def run_algorithm_demo(algorithm_name, dataset_name):
    """Run a demo of the selected algorithm on the chosen dataset.

    Trains the model on a 70/30 split of a small sklearn sample dataset and
    returns test accuracy plus two matplotlib figures (confusion matrix and
    learning curve).

    Args:
        algorithm_name: Key understood by ``get_model_instance``.
        dataset_name: One of "Iris", "Breast Cancer", "Wine", "Digits".

    Returns:
        dict with keys 'accuracy', 'confusion_matrix_plot',
        'learning_curve_plot'.

    Raises:
        KeyError: if ``dataset_name`` or ``algorithm_name`` is unknown.
    """
    from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_digits
    from sklearn.model_selection import train_test_split, learning_curve
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    # Import these locally for consistency with the other local imports in
    # this function (the original relied on module-level names here).
    from sklearn.metrics import accuracy_score, confusion_matrix
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Load dataset
    dataset_loaders = {
        "Iris": load_iris,
        "Breast Cancer": load_breast_cancer,
        "Wine": load_wine,
        "Digits": load_digits
    }

    data = dataset_loaders[dataset_name]()
    X, y = data.data, data.target

    # Split and scale data.
    # BUG FIX: MultinomialNB requires non-negative input; StandardScaler
    # produces negative values and made that demo raise ValueError. Use a
    # [0, 1] MinMaxScaler for it and keep StandardScaler for everything else.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    if algorithm_name == "Multinomial Naive Bayes":
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train model
    model = get_model_instance(algorithm_name)
    model.fit(X_train_scaled, y_train)

    # Get predictions and accuracy
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    # Create confusion matrix plot
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    cm_plot = plt.gcf()
    plt.close()

    # Create learning curve plot (5-fold CV over 5 training-set sizes)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train_scaled, y_train, cv=5,
        train_sizes=np.linspace(0.1, 1.0, 5)
    )

    plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
    plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
    plt.xlabel('Training Examples')
    plt.ylabel('Score')
    plt.title('Learning Curve')
    plt.legend(loc='best')
    lc_plot = plt.gcf()
    plt.close()

    return {
        'accuracy': accuracy,
        'confusion_matrix_plot': cm_plot,
        'learning_curve_plot': lc_plot
    }
|
| 1212 |
+
|
| 1213 |
+
def get_model_instance(algorithm_name):
    """Return a fresh estimator instance for the given algorithm name.

    Args:
        algorithm_name: Display name used throughout the education page.

    Returns:
        An unfitted scikit-learn estimator.

    Raises:
        KeyError: if ``algorithm_name`` is not a known algorithm (same
            behavior as the original dict lookup).
    """
    # Map names to zero-argument factories. The original built a dict of
    # ten already-constructed estimators on every call; factories defer
    # construction so only the requested model is instantiated.
    factories = {
        "Gaussian Naive Bayes (GaussianNB)": GaussianNB,
        "Linear Support Vector Classification (LinearSVC)": lambda: LinearSVC(random_state=42),
        "Support Vector Classification (SVC)": lambda: SVC(random_state=42),
        "Multi-layer Perceptron (MLPClassifier)": lambda: MLPClassifier(random_state=42),
        "Extra Trees Classifier": lambda: ExtraTreesClassifier(random_state=42),
        "Random Forest Classifier": lambda: RandomForestClassifier(random_state=42),
        "K-Nearest Neighbors (KNeighborsClassifier)": KNeighborsClassifier,
        "Ridge Classifier": lambda: RidgeClassifier(random_state=42),
        "Multinomial Naive Bayes": MultinomialNB,
        "AdaBoost Classifier": lambda: AdaBoostClassifier(random_state=42),
    }
    return factories[algorithm_name]()
|
| 1228 |
+
|
| 1229 |
+
def display_math_details(algorithm):
    """Render the algorithm's mathematical formulation in the Streamlit UI.

    Shows the main formula, each named component formula, and the variable
    explanations. Does nothing when the algorithm dict carries no
    "math_details" entry.
    """
    # Guard clause: nothing to render without a math_details payload.
    if "math_details" not in algorithm:
        return

    details = algorithm["math_details"]
    st.subheader("Mathematical Details")

    # Headline formula for the algorithm.
    st.write("Main Formula:")
    st.latex(details["main_formula"])

    # Each supporting formula gets its own labelled LaTeX block.
    st.write("Component Formulas:")
    for part in details["component_formulas"]:
        st.write(f"**{part['name']}:**")
        st.latex(part["formula"])

    # Plain-text legend for the symbols used above.
    st.write("**Variable Explanations:**")
    st.markdown(details["explanation"])
|
| 1247 |
+
|
| 1248 |
+
# Entry point when this page module is executed directly by Streamlit:
# configure the page first, then render the algorithm education UI.
if __name__ == "__main__":
    setup_page_config()
    algorithm_info()
|
pages/03_Model_implementation.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
import os
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
from App import StreamlitUI
|
| 9 |
+
|
| 10 |
+
def setup_page_config():
    """Configure Streamlit page metadata for the Model Implementation page."""
    # Collect the page settings in one place, then apply them in a single
    # call (keyword arguments, so ordering is irrelevant).
    page_settings = {
        "page_title": "Model Implementation",
        "page_icon": "🤖",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
|
| 17 |
+
|
| 18 |
+
def load_model_and_scaler(model_file, scaler_file):
    """Deserialize an uploaded model and scaler from pickle files.

    Args:
        model_file: Uploaded file object for the model (supports .getbuffer()).
        scaler_file: Uploaded file object for the scaler (supports .getbuffer()).

    Returns:
        (model, scaler) on success, or (None, None) after reporting the
        error via st.error.

    SECURITY NOTE: pickle deserialization executes arbitrary code from the
    uploaded file — only load model files from trusted sources.
    """
    try:
        # Deserialize straight from the in-memory upload buffers.
        # BUG FIX: the original round-tripped through temp files on disk and
        # only removed them after both pickle.load calls succeeded, so any
        # load failure leaked files in temp_uploads/ (no try/finally).
        model = pickle.loads(model_file.getbuffer())
        scaler = pickle.loads(scaler_file.getbuffer())
        return model, scaler
    except Exception as e:
        st.error(f"Error loading model or scaler: {str(e)}")
        return None, None
|
| 50 |
+
|
| 51 |
+
def predict(model, scaler, features):
    """Scale a single feature vector and run it through the model.

    Args:
        model: Fitted estimator with .predict (and optionally .predict_proba).
        scaler: Fitted scaler with .transform.
        features: Sequence of raw feature values for one sample.

    Returns:
        (prediction, probabilities) where probabilities is None when the
        model does not expose predict_proba; (None, None) on error, after
        reporting it via st.error.
    """
    try:
        # Convert features to a 2-D array of shape (1, n_features)
        features_array = np.array(features).reshape(1, -1)

        # Scale features
        features_scaled = scaler.transform(features_array)

        # Make prediction
        prediction = model.predict(features_scaled)

        # BUG FIX: the original wrapped predict_proba in a bare `except:`,
        # which silently swallowed ANY error raised inside predict_proba
        # (not just its absence). Check for the capability explicitly; real
        # failures now reach the outer handler and are reported.
        if hasattr(model, "predict_proba"):
            probabilities = model.predict_proba(features_scaled)
            return prediction[0], probabilities[0]
        return prediction[0], None

    except Exception as e:
        st.error(f"Error making prediction: {str(e)}")
        return None, None
|
| 72 |
+
|
| 73 |
+
def generate_random_features(feature_names):
    """Generate random but realistic values for features.

    Ranges are derived from the default class configs in App.StreamlitUI
    (mean ± 3σ per class, i.e. ~99.7% coverage); features not found there
    fall back to [0, 100].

    Args:
        feature_names: Iterable of feature-name strings.

    Returns:
        dict mapping each feature name to a random value rounded to 2 dp.
    """
    random_values = {}

    # PERF FIX: construct StreamlitUI ONCE. The original instantiated a new
    # StreamlitUI() twice per class config per feature inside the nested
    # loops, which is expensive and can trigger Streamlit side effects.
    ui = StreamlitUI()

    # Get ranges from default configs in App.py
    feature_ranges = {}
    for feature_name in feature_names:
        min_val = float('inf')
        max_val = float('-inf')

        # Calculate min/max across all classes in default configs
        for class_config in ui.default_configs.values():
            mean = class_config['mean']
            std = class_config['std']

            # Get index of matching feature
            try:
                idx = ui.default_features.index(feature_name)
                feature_min = mean[idx] - 3 * std[idx]  # 3 std deviations for 99.7% coverage
                feature_max = mean[idx] + 3 * std[idx]

                min_val = min(min_val, feature_min)
                max_val = max(max_val, feature_max)
            except ValueError:
                continue

        # If feature not found in defaults, use reasonable fallback range
        if min_val == float('inf'):
            min_val, max_val = 0, 100

        feature_ranges[feature_name] = (min_val, max_val)

    for feature in feature_names:
        # Default range if feature not in predefined ranges
        min_val, max_val = 0, 100

        # Substring match against known feature names — kept as-is because
        # its first-match semantics (e.g. key "a" matching feature "ab")
        # are observable and may be relied upon.
        for key, (min_range, max_range) in feature_ranges.items():
            if key.lower() in feature.lower():
                min_val, max_val = min_range, max_range
                break

        random_values[feature] = round(np.random.uniform(min_val, max_val), 2)

    return random_values
|
| 118 |
+
|
| 119 |
+
def show():
    """Render the Model Implementation page.

    Workflow: the user uploads a pickled model and scaler in the sidebar;
    feature names are read from the scaler/model when available (or entered
    manually); the user fills in (or randomizes) feature values; a prediction
    plus optional class-probability bar chart is displayed.
    """
    st.title("Model Implementation")

    # Initialize session state for random values if not exists
    if 'random_values' not in st.session_state:
        st.session_state.random_values = {}

    # Keep file uploaders in sidebar
    st.sidebar.subheader("Upload Model Files")
    model_file = st.sidebar.file_uploader("Upload Model (.pkl)", type=['pkl'])
    scaler_file = st.sidebar.file_uploader("Upload Scaler (.pkl)", type=['pkl'])

    # Only proceed if both files are uploaded
    if model_file and scaler_file:
        model, scaler = load_model_and_scaler(model_file, scaler_file)

        if model and scaler:
            st.sidebar.success("Model and scaler loaded successfully!")

            # Get feature names from scaler (preferred) or the model itself.
            # feature_names_in_ is set by scikit-learn estimators fitted on
            # a DataFrame — NOTE(review): assumed sklearn objects; confirm.
            feature_names = None
            if hasattr(scaler, 'feature_names_in_'):
                feature_names = scaler.feature_names_in_
            elif hasattr(model, 'feature_names_in_'):
                feature_names = model.feature_names_in_

            # Fall back to manual entry when neither object carries names.
            if feature_names is None:
                feature_names_input = st.sidebar.text_input(
                    "Enter feature names (comma-separated)",
                    "feature1, feature2, feature3"
                )
                feature_names = [f.strip() for f in feature_names_input.split(",")]
                st.sidebar.info("Feature names were not found in the model/scaler. Using manually entered names.")

            # Create two main columns for the page layout
            input_col, result_col = st.columns(2)

            # Left column for feature inputs
            with input_col:
                st.subheader("Enter Feature Values")

                # Add randomization button. The session-state keys are
                # assigned BEFORE the number_input widgets below are created
                # in this rerun, which is why the new values take effect.
                col1, col2 = st.columns([1, 2])
                with col1:
                    if st.button("🎲 Randomize"):
                        # Generate new random values
                        st.session_state.random_values = generate_random_features(feature_names)
                        # Update session state for each feature
                        for feature in feature_names:
                            st.session_state[f"input_{feature}"] = st.session_state.random_values[feature]
                with col2:
                    st.markdown("<div style='margin-top: 8px;'>Generate realistic random values</div>",
                              unsafe_allow_html=True)

                # Create feature inputs in a grid layout
                feature_values = {}
                input_cols = st.columns(2)  # 2 columns for feature inputs
                for idx, feature in enumerate(feature_names):
                    with input_cols[idx % 2]:
                        # Initialize session state for this input if not exists
                        if f"input_{feature}" not in st.session_state:
                            st.session_state[f"input_{feature}"] = 0.0

                        # key=f"input_{feature}" binds the widget to the same
                        # session-state slot the Randomize button writes.
                        feature_values[feature] = st.number_input(
                            f"{feature}",
                            key=f"input_{feature}",
                            step=1.0,
                            format="%.2f"
                        )

                # Make prediction button
                predict_clicked = st.button("Make Prediction")

            # Right column for prediction results
            with result_col:
                st.subheader("Prediction Results")

                # Make prediction when values are available or button is
                # clicked. NOTE(review): once random_values is non-empty this
                # condition stays truthy on every rerun, so the prediction is
                # recomputed from the current inputs without another click.
                if predict_clicked or st.session_state.random_values:
                    # Prepare features in correct order
                    features = [feature_values[feature] for feature in feature_names]

                    # Get prediction
                    prediction, probabilities = predict(model, scaler, features)

                    if prediction is not None:
                        st.write(f"Predicted Class: **{prediction}**")

                        # Display probabilities if available
                        if probabilities is not None:
                            st.write("Class Probabilities:")
                            prob_df = pd.DataFrame({
                                'Class': model.classes_,
                                'Probability': probabilities
                            })

                            # Display as bar chart
                            st.bar_chart(
                                prob_df.set_index('Class')
                            )
                else:
                    st.info("Enter feature values and click 'Make Prediction' to see results.")
    else:
        st.sidebar.info("Please upload both model and scaler files to proceed.")
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
if __name__ == "__main__":
    # Entry point when this page is executed directly: configure the
    # Streamlit page settings first, then render the page UI.
    setup_page_config()
    show()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
numpy>=1.24.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
scikit-learn>=1.2.0
|
| 5 |
+
plotly>=5.13.0
|
| 6 |
+
seaborn>=0.12.0
|
| 7 |
+
matplotlib>=3.7.0
|
| 8 |
+
joblib>=1.2.0
|