Spaces:

damndeepesh
/

AutoML

Sleeping

App Files Files Community

damndeepesh committed on Jun 5, 2025

Commit

be587b5

verified ·

1 Parent(s): 1b86695

updated files

Browse files

Files changed (3) hide show

README.md +0 -10
app.py +497 -169
requirements.txt +5 -1

README.md CHANGED Viewed

@@ -1,13 +1,3 @@
----
-license: mit
-title: AutoML
-sdk: streamlit
-emoji: 🏆
-colorFrom: green
-colorTo: yellow
-pinned: true
-sdk_version: 1.45.1
----
 # AutoML & Explainability Web Application
 This Streamlit web application empowers users to perform end-to-end machine learning tasks with ease. Upload your data, automatically train and compare various models, understand their predictions through SHAP explainability, and export the best model for your needs.












1	# AutoML & Explainability Web Application
2
3	This Streamlit web application empowers users to perform end-to-end machine learning tasks with ease. Upload your data, automatically train and compare various models, understand their predictions through SHAP explainability, and export the best model for your needs.

app.py CHANGED Viewed

@@ -1,16 +1,20 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.svm import SVC, SVR
 from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, ElasticNet
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 from sklearn.naive_bayes import GaussianNB
-from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
 import shap
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -31,36 +35,15 @@ st.set_page_config(
 )
 # Custom CSS for better styling
-st.markdown("""
-<style>
-.main-header {
-    font-size: 2.5rem;
-    color: #1f77b4;
-    text-align: center;
-    margin-bottom: 2rem;
-}
-.metric-card {
-    background-color: #f0f2f6;
-    padding: 1rem;
-    border-radius: 0.5rem;
-    margin: 0.5rem 0;
-    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-}
-.success-message {
-    background-color: #d4edda;
-    color: #155724;
-    padding: 1rem;
-    border-radius: 0.5rem;
-    border: 1px solid #c3e6cb;
-}
-.stButton>button {
-    width: 100%;
-    border-radius: 0.5rem;
-}
-</style>
-""", unsafe_allow_html=True)
 # --- Helper Functions ---
 def get_model_metrics(y_true, y_pred, y_proba=None, problem_type='Classification'):
     metrics = {}
     if problem_type == "Classification":
@@ -141,7 +124,7 @@ def data_upload_page():
             st.session_state.problem_type = None
             st.session_state.source_data_type = 'single'
         except Exception as e:
-            st.error(f"Error reading single file: {e}")
             return
     elif uploaded_train_file:
         try:
@@ -158,7 +141,7 @@ def data_upload_page():
             else:
                 st.session_state.test_data = None # Explicitly set to None
         except Exception as e:
-            st.error(f"Error reading train/test files: {e}")
             return
     if df is not None:
@@ -226,7 +209,7 @@ def data_upload_page():
                     st.session_state.problem_type = "Regression"
                 else:
                     st.session_state.problem_type = "Unsupported Target Type"
-                    st.error("Target column type is not suitable for classification or regression.")
                     return
                 st.success(f"Target column '{target_column}' selected. Problem Type: {st.session_state.problem_type}")
@@ -239,7 +222,7 @@ def data_upload_page():
                     col3_test.metric("Test Missing Values", st.session_state.test_data.isnull().sum().sum())
                     st.dataframe(st.session_state.test_data.head(5), use_container_width=True)
                     if target_column not in st.session_state.test_data.columns:
-                        st.error(f"Target column '{target_column}' not found in the uploaded test data. Please ensure column names match.")
                         return # Stop further processing if target is missing in test data
                 st.subheader(f"Target Column Distribution (in {'Training Data' if st.session_state.get('source_data_type') == 'separate' else 'Uploaded Data'}): {target_column}")
@@ -253,19 +236,17 @@ def data_upload_page():
                     st.pyplot(fig)
         except Exception as e:
-            st.error(f"Error reading or processing file: {e}")
             if auto_run_training and st.session_state.target_column:
                 st.session_state.auto_run_triggered = True
                 st.experimental_rerun() # Rerun to switch page or trigger training
         except Exception as e:
-            st.error(f"Error processing data: {e}")
-            import traceback
-            st.error(traceback.format_exc())
     else:
         st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
-def preprocess_data(df, target_column):
     X = df.drop(columns=[target_column])
     y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
@@ -299,6 +280,9 @@ def preprocess_data(df, target_column):
     if len(cat_cols) > 0:
         X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
     # Encode categorical features
     le_dict_features = {}
     for col in cat_cols:
@@ -328,10 +312,10 @@ def model_training_page():
     data_available = (st.session_state.data is not None) or \
                      (st.session_state.train_data is not None)
     if not data_available or st.session_state.target_column is None:
-        st.warning("⚠️ Please upload data (single or train/test) and select a target column first.")
         return
     if st.session_state.problem_type == "Unsupported Target Type":
-        st.error("Cannot train models with the current target column type.")
         return
     target = st.session_state.target_column
@@ -343,7 +327,27 @@ def model_training_page():
     test_size = col1.slider("Test Size (if splitting single file)", 0.1, 0.5, 0.2, 0.05, disabled=disable_test_size)
     random_state = col1.number_input("Random State", value=42, min_value=0)
     cv_folds = col2.slider("Cross-Validation Folds", 3, 10, 5)
-    scale_features = col2.checkbox("Scale Numeric Features", value=True)
     # Auto-start training if triggered
     start_button_pressed = st.button("🎯 Start Training", type="primary", key='manual_start_train_button')
@@ -364,7 +368,7 @@ def model_training_page():
                     if st.session_state.test_data is not None:
                         df_test_processed = st.session_state.test_data.copy()
                         if target not in df_test_processed.columns:
-                            st.error(f"Target column '{target}' not found in test data during preprocessing. Aborting.")
                             return
                         X_test, y_test = preprocess_data(df_test_processed, target) # Preprocess test data separately
                         # Ensure X_test has same columns as X_train after preprocessing (esp. after one-hot encoding if added later)
@@ -384,54 +388,136 @@ def model_training_page():
                     )
                 if X_train is None or y_train is None:
-                    st.error("Training data (X_train, y_train) could not be prepared. Please check your data and selections.")
                     return
                 # Scaling should be fit on X_train and transformed on X_test
-                if scale_features:
                     num_cols_train = X_train.select_dtypes(include=np.number).columns
                     if len(num_cols_train) > 0:
-                        scaler = StandardScaler()
-                        X_train[num_cols_train] = scaler.fit_transform(X_train[num_cols_train])
-                        st.session_state.scaler = scaler # Save the fitted scaler
-                        if X_test is not None:
-                            num_cols_test = X_test.select_dtypes(include=np.number).columns
-                            # Ensure test set uses the same numeric columns in the same order as train set for scaling
-                            cols_to_scale_in_test = [col for col in num_cols_train if col in X_test.columns]
-                            if len(cols_to_scale_in_test) > 0:
-                                # Create a DataFrame with columns in the order of num_cols_train
-                                X_test_subset_for_scaling = X_test[cols_to_scale_in_test]
-                                X_test_scaled_values = scaler.transform(X_test_subset_for_scaling)
-                                X_test[cols_to_scale_in_test] = X_test_scaled_values
-                            # Handle missing/extra columns if necessary, for now assume they match or subset
                 st.session_state.update({'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test})
                 # Define models based on problem type
                 if st.session_state.problem_type == "Classification":
-                    models_to_train = {
-                        "Logistic Regression": LogisticRegression(random_state=random_state, max_iter=1000),
-                        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
-                        "Random Forest": RandomForestClassifier(random_state=random_state),
-                        "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
-                        "Support Vector Machine": SVC(random_state=random_state, probability=True),
-                        "K-Nearest Neighbors": KNeighborsClassifier(),
-                        "Gaussian Naive Bayes": GaussianNB()
                     }
                     scoring = 'accuracy'
                 else: # Regression
-                    # Local imports for LinearRegression, Ridge, RandomForestRegressor, etc.
-                    # are removed as these models are now imported globally by the first search/replace block.
-                    # ElasticNet is also imported globally.
-                    models_to_train = {
-                        "Linear Regression": LinearRegression(),
-                        "Ridge Regression": Ridge(random_state=random_state),
-                        "ElasticNet Regression": ElasticNet(random_state=random_state),
-                        "Random Forest Regressor": RandomForestRegressor(random_state=random_state),
-                        "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=random_state),
-                        "Decision Tree Regressor": DecisionTreeRegressor(random_state=random_state),
-                        "Support Vector Regressor": SVR(),
-                        "K-Nearest Neighbors Regressor": KNeighborsRegressor()
                     }
                     scoring = 'r2'
@@ -440,22 +526,86 @@ def model_training_page():
                 progress_bar = st.progress(0)
                 status_text = st.empty()
-                for i, (name, model) in enumerate(models_to_train.items()):
-                    status_text.text(f"Training {name}...")
-                    model.fit(X_train, y_train)
-                    trained_models[name] = model
-                    y_pred_test = model.predict(X_test)
-                    y_proba_test = model.predict_proba(X_test) if hasattr(model, 'predict_proba') and st.session_state.problem_type == "Classification" else None
-                    metrics = get_model_metrics(y_test, y_pred_test, y_proba_test, problem_type=st.session_state.problem_type)
-                    cv_score = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring=scoring).mean()
-                    current_model_scores = {'CV Mean Score': cv_score}
-                    current_model_scores.update(metrics) # Add all relevant metrics
-                    model_scores_dict[name] = current_model_scores
-                    progress_bar.progress((i + 1) / len(models_to_train))
                 st.session_state.models = trained_models
                 st.session_state.model_scores = model_scores_dict
@@ -476,9 +626,7 @@ def model_training_page():
                 st.success(f"✅ Training completed! Best model: {best_model_name}")
             except Exception as e:
-                st.error(f"Error during training: {e}")
-                import traceback
-                st.error(traceback.format_exc())
 def model_comparison_page():
     st.header("📊 Model Comparison")
@@ -519,20 +667,130 @@ def model_comparison_page():
         st.subheader(f"📋 Detailed Metrics for Best Model: {best_model_name}")
         best_model = st.session_state.best_model_info['model']
         y_pred = best_model.predict(st.session_state.X_test)
-        col1, col2 = st.columns(2)
-        with col1:
-            st.text("Classification Report:")
-            report_df = pd.DataFrame(classification_report(st.session_state.y_test, y_pred, output_dict=True)).transpose()
-            st.dataframe(report_df.round(3), use_container_width=True)
-        with col2:
-            st.text("Confusion Matrix:")
-            cm = confusion_matrix(st.session_state.y_test, y_pred)
-            fig_cm, ax_cm = plt.subplots()
-            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
-            ax_cm.set_xlabel('Predicted')
-            ax_cm.set_ylabel('Actual')
-            st.pyplot(fig_cm)
 def explainability_page():
     st.header("🔍 Model Explainability (SHAP)")
@@ -548,20 +806,36 @@ def explainability_page():
     with st.spinner("Generating SHAP explanations..."):
         try:
             # SHAP Explainer
-            if isinstance(best_model, (RandomForestClassifier, GradientBoostingClassifier, DecisionTreeClassifier,
-                                      RandomForestRegressor, GradientBoostingRegressor, DecisionTreeRegressor)):
-                explainer = shap.TreeExplainer(best_model)
-            elif isinstance(best_model, (LogisticRegression, LinearRegression, Ridge, ElasticNet)):
-                explainer = shap.LinearExplainer(best_model, X_test_df) # Pass data for LinearExplainer
-            elif isinstance(best_model, (SVC, SVR, KNeighborsClassifier, KNeighborsRegressor, GaussianNB)):
-                 # KernelExplainer can be slow or not directly applicable for some, use a subset of X_train for background data
-                 # For KNN and Naive Bayes, KernelExplainer is a common choice for SHAP if TreeExplainer/LinearExplainer aren't suitable.
-                background_data = shap.sample(st.session_state.X_train, min(100, len(st.session_state.X_train)))
-                if isinstance(background_data, np.ndarray):
-                    background_data = pd.DataFrame(background_data, columns=X_test_df.columns)
-                explainer = shap.KernelExplainer(best_model.predict_proba if hasattr(best_model, 'predict_proba') else best_model.predict, background_data)
-            else:
-                st.error(f"SHAP explanations not supported for {best_model_name} with current setup.")
                 return
             shap_values = explainer.shap_values(X_test_df)
@@ -614,9 +888,7 @@ def explainability_page():
                 st.metric("Predicted Value", f"{predicted:.2f}")
         except Exception as e:
-            st.error(f"Error generating SHAP explanations: {e}")
-            import traceback
-            st.error(traceback.format_exc())
 def model_export_page():
     st.header("💾 Model Export")
@@ -637,9 +909,31 @@ def model_export_page():
     steps = []
     if st.session_state.scaler:
         steps.append(('scaler', st.session_state.scaler))
-    steps.append(('model', best_model))
-    pipeline_to_export = Pipeline(steps)
-    st.session_state.trained_pipeline = pipeline_to_export
     export_format = st.selectbox("Choose export format:", ["Joblib (.joblib)", "Pickle (.pkl)"])
     file_name_suggestion = f"{best_model_name.lower().replace(' ', '_')}_pipeline"
@@ -664,12 +958,28 @@ def model_export_page():
             )
             st.success("Model pipeline ready for download!")
         except Exception as e:
-            st.error(f"Error exporting model: {e}")
     st.subheader("📖 How to use the exported pipeline:")
-    st.code(f"""
-import joblib # or import pickle
 import pandas as pd
 # Load the pipeline
 pipeline = joblib.load('{file_name}{'.joblib' if 'Joblib' in export_format else '.pkl'}')
@@ -683,40 +993,58 @@ pipeline = joblib.load('{file_name}{'.joblib' if 'Joblib' in export_format else
 # Make predictions
 # predictions = pipeline.predict(new_data)
 # print(predictions)
-""", language='python')
-# --- Main Application ---
-def main():
-    init_session_state()
-    st.markdown('<h1 class="main-header">🤖 AutoML & Explainability Platform</h1>', unsafe_allow_html=True)
-    st.sidebar.title("⚙️ Workflow")
-    page_options = ["Data Upload & Preview", "Model Training", "Model Comparison", "Explainability", "Model Export"]
-    # Handle auto-run navigation
-    if st.session_state.get('auto_run_triggered') and st.session_state.target_column:
-        st.session_state.auto_run_triggered = False # Reset trigger
-        st.session_state.current_page = "Model Training"
-        st.session_state.auto_run_triggered_for_training = True # Signal model_training_page to auto-start
-    if 'current_page' not in st.session_state:
-        st.session_state.current_page = "Data Upload & Preview"
-    page = st.sidebar.radio("Navigate", page_options, key='navigation_radio', index=page_options.index(st.session_state.current_page))
-    st.session_state.current_page = page # Update current page based on user selection
-    if page == "Data Upload & Preview":
-        data_upload_page()
-    elif page == "Model Training":
-        model_training_page()
-    elif page == "Model Comparison":
-        model_comparison_page()
-    elif page == "Explainability":
-        explainability_page()
-    elif page == "Model Export":
-        model_export_page()
-    st.sidebar.markdown("---_Developed with Trae AI_---")
-if __name__ == "__main__":
-    main()

 import streamlit as st
 import pandas as pd
 import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.svm import SVC, SVR
 from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, ElasticNet
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 from sklearn.naive_bayes import GaussianNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
+# Import advanced models
+import xgboost as xgb
+import lightgbm as lgb
+import catboost as cb
 import shap
 import matplotlib.pyplot as plt
 import seaborn as sns
 )
 # Custom CSS for better styling
 # --- Helper Functions ---
+def display_error(e, context="An unexpected error occurred"):
+    """Displays a user-friendly error message."""
+    st.error(f"😕 Oops! Something went wrong. {context}. Please check your inputs or the data format.")
+    st.error(f"Details: {str(e)}")
+    # Optionally, log the full traceback for debugging, but don't show it to the user by default
+    # import traceback
+    # st.expander("See Full Error Traceback").error(traceback.format_exc())
 def get_model_metrics(y_true, y_pred, y_proba=None, problem_type='Classification'):
     metrics = {}
     if problem_type == "Classification":
             st.session_state.problem_type = None
             st.session_state.source_data_type = 'single'
         except Exception as e:
+            display_error(e, "Failed to read the uploaded single file")
             return
     elif uploaded_train_file:
         try:
             else:
                 st.session_state.test_data = None # Explicitly set to None
         except Exception as e:
+            display_error(e, "Failed to read the uploaded train/test files")
             return
     if df is not None:
                     st.session_state.problem_type = "Regression"
                 else:
                     st.session_state.problem_type = "Unsupported Target Type"
+                    st.error("The selected target column has an unsupported data type. Please choose a numeric column for regression or a categorical/binary column for classification.")
                     return
                 st.success(f"Target column '{target_column}' selected. Problem Type: {st.session_state.problem_type}")
                     col3_test.metric("Test Missing Values", st.session_state.test_data.isnull().sum().sum())
                     st.dataframe(st.session_state.test_data.head(5), use_container_width=True)
                     if target_column not in st.session_state.test_data.columns:
+                        st.error(f"The target column '{target_column}' was not found in your uploaded test data. Please ensure the column names match exactly between your training and testing datasets.")
                         return # Stop further processing if target is missing in test data
                 st.subheader(f"Target Column Distribution (in {'Training Data' if st.session_state.get('source_data_type') == 'separate' else 'Uploaded Data'}): {target_column}")
                     st.pyplot(fig)
         except Exception as e:
+            display_error(e, "An error occurred while reading or performing initial processing on the file")
             if auto_run_training and st.session_state.target_column:
                 st.session_state.auto_run_triggered = True
                 st.experimental_rerun() # Rerun to switch page or trigger training
         except Exception as e:
+            display_error(e, "An error occurred during data processing and analysis")
     else:
         st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
+def preprocess_data(df, target_column, scaling_method="None"):
     X = df.drop(columns=[target_column])
     y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
     if len(cat_cols) > 0:
         X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
+    # Scaling is handled in the model_training_page after splitting, so not here.
+    # This function will just do imputation and encoding.
     # Encode categorical features
     le_dict_features = {}
     for col in cat_cols:
     data_available = (st.session_state.data is not None) or \
                      (st.session_state.train_data is not None)
     if not data_available or st.session_state.target_column is None:
+        st.warning("⚠️ Please upload your data and select a target column on the 'Data Upload & Preview' page before proceeding to model training.")
         return
     if st.session_state.problem_type == "Unsupported Target Type":
+        st.error("Cannot train models because the selected target column has an unsupported data type. Please go back and select a suitable target column.")
         return
     target = st.session_state.target_column
     test_size = col1.slider("Test Size (if splitting single file)", 0.1, 0.5, 0.2, 0.05, disabled=disable_test_size)
     random_state = col1.number_input("Random State", value=42, min_value=0)
     cv_folds = col2.slider("Cross-Validation Folds", 3, 10, 5)
+    # scale_features checkbox is replaced by a selectbox for scaling_method
+    scaling_method_options = ["None", "StandardScaler", "MinMaxScaler"]
+    scaling_method = col2.selectbox("Numeric Feature Scaling", options=scaling_method_options, index=1, key='scaling_method_selector') # Default to StandardScaler
+    st.session_state.scaling_method = scaling_method # Store for use during preprocessing
+    # Initialize session state variables if they don't exist
+    if 'tuning_method' not in st.session_state:
+        st.session_state.tuning_method = None
+    if 'n_iter' not in st.session_state:
+        st.session_state.n_iter = 50 # Default value
+    st.subheader("Hyperparameter Tuning")
+    enable_tuning = st.checkbox("Enable Hyperparameter Tuning", value=False)
+    if enable_tuning:
+        # The selectbox will automatically update st.session_state.tuning_method
+        tuning_method_selected = st.selectbox("Select Tuning Method", ["Grid Search", "Randomized Search"], key='tuning_method')
+        if tuning_method_selected == "Randomized Search":
+            st.session_state.n_iter = st.number_input("Number of Iterations (for Randomized Search)", min_value=10, value=50, step=10, key='n_iter_randomized_search')
+    else:
+        # When tuning is disabled, explicitly set tuning_method to None
+        st.session_state.tuning_method = None
     # Auto-start training if triggered
     start_button_pressed = st.button("🎯 Start Training", type="primary", key='manual_start_train_button')
                     if st.session_state.test_data is not None:
                         df_test_processed = st.session_state.test_data.copy()
                         if target not in df_test_processed.columns:
+                            st.error(f"The target column '{target}' is missing from your test dataset. Please ensure both train and test datasets have the target column with the same name. Aborting training.")
                             return
                         X_test, y_test = preprocess_data(df_test_processed, target) # Preprocess test data separately
                         # Ensure X_test has same columns as X_train after preprocessing (esp. after one-hot encoding if added later)
                     )
                 if X_train is None or y_train is None:
+                    st.error("The training data (features X_train, target y_train) could not be prepared. This might be due to issues in the uploaded data or preprocessing steps. Please review your data and selections.")
                     return
                 # Scaling should be fit on X_train and transformed on X_test
+                current_scaling_method = st.session_state.get('scaling_method', 'StandardScaler') # Get from session state
+                if current_scaling_method != "None":
                     num_cols_train = X_train.select_dtypes(include=np.number).columns
                     if len(num_cols_train) > 0:
+                        if current_scaling_method == "StandardScaler":
+                            scaler = StandardScaler()
+                        elif current_scaling_method == "MinMaxScaler":
+                            scaler = MinMaxScaler()
+                        else:
+                            scaler = None # Should not happen
+                        if scaler:
+                            X_train[num_cols_train] = scaler.fit_transform(X_train[num_cols_train])
+                            st.session_state.scaler = scaler # Save the fitted scaler
+                            st.info(f"Numeric features in training data scaled using {current_scaling_method}.")
+                            if X_test is not None:
+                                num_cols_test = X_test.select_dtypes(include=np.number).columns
+                                # Ensure test set uses the same numeric columns in the same order as train set for scaling
+                                cols_to_scale_in_test = [col for col in num_cols_train if col in X_test.columns]
+                                if len(cols_to_scale_in_test) > 0:
+                                    # Create a DataFrame with columns in the order of num_cols_train
+                                    X_test_subset_for_scaling = X_test[cols_to_scale_in_test]
+                                    X_test_scaled_values = scaler.transform(X_test_subset_for_scaling)
+                                    X_test[cols_to_scale_in_test] = X_test_scaled_values
+                                    st.info(f"Numeric features in test data scaled using {current_scaling_method}.")
+                        else:
+                             st.session_state.scaler = None # Ensure it's None if no scaling applied
+                    else:
+                        st.session_state.scaler = None # Ensure it's None if no numeric columns
+                else:
+                    st.session_state.scaler = None # Ensure it's None if scaling_method is "None"
                 st.session_state.update({'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test})
                 # Define models based on problem type
+                # Define models and their parameter grids for tuning
                 if st.session_state.problem_type == "Classification":
+                    models_and_params = {
+                        "Logistic Regression": {
+                            'model': LogisticRegression(random_state=random_state, max_iter=1000),
+                            'params': {'C': [0.1, 1.0, 10.0], 'solver': ['liblinear', 'lbfgs']}
+                        },
+                        "Decision Tree": {
+                            'model': DecisionTreeClassifier(random_state=random_state),
+                            'params': {'max_depth': [None, 10, 20, 30], 'min_samples_leaf': [1, 5, 10]}
+                        },
+                        "Random Forest": {
+                            'model': RandomForestClassifier(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'max_depth': [10, 20]}
+                        },
+                        "Gradient Boosting": {
+                            'model': GradientBoostingClassifier(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
+                        },
+                        "XGBoost": {
+                            'model': xgb.XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='logloss'),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]}
+                        },
+                        "LightGBM": {
+                            'model': lgb.LGBMClassifier(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]}
+                        },
+                        "CatBoost": {
+                            'model': cb.CatBoostClassifier(random_state=random_state, verbose=0),
+                            'params': {'iterations': [100, 200], 'learning_rate': [0.01, 0.1], 'depth': [4, 6]}
+                        },
+                        "Support Vector Machine": {
+                            'model': SVC(random_state=random_state, probability=True),
+                            'params': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
+                        },
+                        "K-Nearest Neighbors": {
+                            'model': KNeighborsClassifier(),
+                            'params': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
+                        },
+                        "Gaussian Naive Bayes": {
+                            'model': GaussianNB(),
+                            'params': {}
+                        }
                     }
                     scoring = 'accuracy'
                 else: # Regression
+                    models_and_params = {
+                        "Linear Regression": {
+                            'model': LinearRegression(),
+                            'params': {}
+                        },
+                        "Ridge Regression": {
+                            'model': Ridge(random_state=random_state),
+                            'params': {'alpha': [0.1, 1.0, 10.0]}
+                        },
+                        "ElasticNet Regression": {
+                            'model': ElasticNet(random_state=random_state),
+                            'params': {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]}
+                        },
+                        "Random Forest Regressor": {
+                            'model': RandomForestRegressor(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'max_depth': [10, 20]}
+                        },
+                        "Gradient Boosting Regressor": {
+                            'model': GradientBoostingRegressor(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
+                        },
+                        "XGBoost Regressor": {
+                            'model': xgb.XGBRegressor(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]}
+                        },
+                        "LightGBM Regressor": {
+                            'model': lgb.LGBMRegressor(random_state=random_state),
+                            'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]}
+                        },
+                        "CatBoost Regressor": {
+                            'model': cb.CatBoostRegressor(random_state=random_state, verbose=0),
+                            'params': {'iterations': [100, 200], 'learning_rate': [0.01, 0.1], 'depth': [4, 6]}
+                        },
+                        "Decision Tree Regressor": {
+                            'model': DecisionTreeRegressor(random_state=random_state),
+                            'params': {'max_depth': [None, 10, 20, 30], 'min_samples_leaf': [1, 5, 10]}
+                        },
+                        "Support Vector Regressor": {
+                            'model': SVR(),
+                            'params': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
+                        },
+                        "K-Nearest Neighbors Regressor": {
+                            'model': KNeighborsRegressor(),
+                            'params': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
+                        }
                     }
                     scoring = 'r2'
                 progress_bar = st.progress(0)
                 status_text = st.empty()
+                tuning_enabled = st.session_state.get('tuning_method') is not None
+                n_iter = st.session_state.get('n_iter', 50) # Default for Randomized Search
+                for i, (name, model_info) in enumerate(models_and_params.items()):
+                    try:
+                        model = model_info['model']
+                        params = model_info['params']
+                        # Check if this is one of the newly added models
+                        is_new_model = name in ["XGBoost", "LightGBM", "CatBoost"] or name in ["XGBoost Regressor", "LightGBM Regressor", "CatBoost Regressor"]
+                        if is_new_model:
+                            status_text.text(f"Initializing {name}...")
+                        if tuning_enabled and params:
+                            status_text.text(f"Tuning {name}...")
+                            try:
+                                if st.session_state.tuning_method == "Grid Search":
+                                    tuner = GridSearchCV(model, params, cv=cv_folds, scoring=scoring, n_jobs=-1)
+                                else: # Randomized Search
+                                    tuner = RandomizedSearchCV(model, params, n_iter=n_iter, cv=cv_folds, scoring=scoring, random_state=random_state, n_jobs=-1)
+                                tuner.fit(X_train, y_train)
+                                best_model = tuner.best_estimator_
+                                st.write(f"Best parameters for {name}: {tuner.best_params_}")
+                            except Exception as e:
+                                display_error(e, f"Error during hyperparameter tuning for {name}")
+                                # Skip this model and continue with the next one
+                                continue
+                        else:
+                            status_text.text(f"Training {name}...")
+                            try:
+                                best_model = model
+                                best_model.fit(X_train, y_train)
+                            except Exception as e:
+                                display_error(e, f"Error during training for {name}")
+                                # Skip this model and continue with the next one
+                                continue
+                        trained_models[name] = best_model
+                        try:
+                            y_pred_test = best_model.predict(X_test)
+                            # Handle predict_proba for classification models
+                            if st.session_state.problem_type == "Classification" and hasattr(best_model, 'predict_proba'):
+                                try:
+                                    y_proba_test = best_model.predict_proba(X_test)
+                                except Exception as e:
+                                    st.warning(f"Could not compute prediction probabilities for {name}: {str(e)}")
+                                    y_proba_test = None
+                            else:
+                                y_proba_test = None
+                            metrics = get_model_metrics(y_test, y_pred_test, y_proba_test, problem_type=st.session_state.problem_type)
+                            # NOTE: when tuning ran, the search has already cross-validated this estimator
+                            # (available as tuner.best_score_). The score is recomputed here anyway so that
+                            # tuned and untuned models are reported through the same code path.
+                            try:
+                                cv_score = cross_val_score(best_model, X_train, y_train, cv=cv_folds, scoring=scoring).mean()
+                            except Exception as e:
+                                st.warning(f"Could not compute cross-validation score for {name}: {str(e)}")
+                                cv_score = float('nan')  # Use NaN to indicate missing value
+                            current_model_scores = {'CV Mean Score': cv_score}
+                            current_model_scores.update(metrics) # Add all relevant metrics
+                            model_scores_dict[name] = current_model_scores
+                            if is_new_model:
+                                st.success(f"{name} trained successfully!")
+                        except Exception as e:
+                            display_error(e, f"Error during prediction or evaluation for {name}")
+                            # Skip adding this model to the scores dictionary
+                            continue
+                    except Exception as e:
+                        display_error(e, f"Unexpected error with {name}")
+                        # Skip this model entirely and continue with the next one
+                        continue
+                    progress_bar.progress((i + 1) / len(models_and_params))
                 st.session_state.models = trained_models
                 st.session_state.model_scores = model_scores_dict
                 st.success(f"✅ Training completed! Best model: {best_model_name}")
             except Exception as e:
+                display_error(e, "An error occurred during the model training process")
 def model_comparison_page():
     st.header("📊 Model Comparison")
         st.subheader(f"📋 Detailed Metrics for Best Model: {best_model_name}")
         best_model = st.session_state.best_model_info['model']
         y_pred = best_model.predict(st.session_state.X_test)
+        y_test = st.session_state.y_test
+        # Confusion Matrix
+        st.write("#### Confusion Matrix")
+        cm = confusion_matrix(y_test, y_pred)
+        fig_cm, ax_cm = plt.subplots()
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
+        ax_cm.set_xlabel('Predicted')
+        ax_cm.set_ylabel('Actual')
+        ax_cm.set_title('Confusion Matrix')
+        st.pyplot(fig_cm)
+        # Classification Report
+        st.write("#### Classification Report")
+        report = classification_report(y_test, y_pred, output_dict=True)
+        report_df = pd.DataFrame(report).transpose()
+        st.dataframe(report_df.round(4))
+        # ROC Curve and AUC
+        if hasattr(best_model, 'predict_proba'):
+            st.write("#### ROC Curve")
+            try:
+                y_proba = best_model.predict_proba(st.session_state.X_test)
+                if y_proba.shape[1] > 2: # Multi-class classification
+                    # For multi-class, plot one-vs-rest ROC curves
+                    from sklearn.preprocessing import LabelBinarizer
+                    lb = LabelBinarizer()
+                    y_test_binarized = lb.fit_transform(y_test)
+                    fig_roc, ax_roc = plt.subplots()
+                    for i in range(y_proba.shape[1]):
+                        fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_proba[:, i])
+                        roc_auc = auc(fpr, tpr)
+                        ax_roc.plot(fpr, tpr, label=f'Class {lb.classes_[i]} (AUC = {roc_auc:.2f})')
+                    ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
+                    ax_roc.set_xlabel('False Positive Rate')
+                    ax_roc.set_ylabel('True Positive Rate')
+                    ax_roc.set_title('ROC Curve (One-vs-Rest)')
+                    ax_roc.legend(loc='lower right')
+                    st.pyplot(fig_roc)
+                else: # Binary classification
+                    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
+                    roc_auc = auc(fpr, tpr)
+                    fig_roc, ax_roc = plt.subplots()
+                    ax_roc.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
+                    ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
+                    ax_roc.set_xlabel('False Positive Rate')
+                    ax_roc.set_ylabel('True Positive Rate')
+                    ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
+                    ax_roc.legend(loc='lower right')
+                    st.pyplot(fig_roc)
+            except Exception as e:
+                st.warning(f"Could not plot ROC curve: {e}")
+            # Precision-Recall Curve
+            st.write("#### Precision-Recall Curve")
+            try:
+                if y_proba.shape[1] > 2: # Multi-class classification
+                    # For multi-class, plot one-vs-rest Precision-Recall curves
+                    from sklearn.preprocessing import LabelBinarizer
+                    lb = LabelBinarizer()
+                    y_test_binarized = lb.fit_transform(y_test)
+                    fig_pr, ax_pr = plt.subplots()
+                    for i in range(y_proba.shape[1]):
+                        precision, recall, _ = precision_recall_curve(y_test_binarized[:, i], y_proba[:, i])
+                        pr_auc = auc(recall, precision)
+                        ax_pr.plot(recall, precision, label=f'Class {lb.classes_[i]} (AUC = {pr_auc:.2f})')
+                    ax_pr.set_xlabel('Recall')
+                    ax_pr.set_ylabel('Precision')
+                    ax_pr.set_title('Precision-Recall Curve (One-vs-Rest)')
+                    ax_pr.legend(loc='lower left')
+                    st.pyplot(fig_pr)
+                else: # Binary classification
+                    precision, recall, _ = precision_recall_curve(y_test, y_proba[:, 1])
+                    pr_auc = auc(recall, precision)
+                    fig_pr, ax_pr = plt.subplots()
+                    ax_pr.plot(recall, precision, label=f'Precision-Recall curve (area = {pr_auc:.2f})')
+                    ax_pr.set_xlabel('Recall')
+                    ax_pr.set_ylabel('Precision')
+                    ax_pr.set_title('Precision-Recall Curve')
+                    ax_pr.legend(loc='lower left')
+                    st.pyplot(fig_pr)
+            except Exception as e:
+                st.warning(f"Could not plot Precision-Recall curve: {e}")
+        else:
+            st.info("Model does not support `predict_proba` for ROC/PR curves.")
+    elif st.session_state.problem_type == "Regression" and st.session_state.X_test is not None:
+        st.subheader(f"📋 Detailed Metrics for Best Model: {best_model_name}")
+        best_model = st.session_state.best_model_info['model']
+        y_pred = best_model.predict(st.session_state.X_test)
+        y_test = st.session_state.y_test
+        # Residual Plot
+        st.write("#### Residual Plot")
+        residuals = y_test - y_pred
+        fig_res, ax_res = plt.subplots()
+        ax_res.scatter(y_pred, residuals)
+        ax_res.axhline(y=0, color='r', linestyle='--')
+        ax_res.set_xlabel('Predicted Values')
+        ax_res.set_ylabel('Residuals')
+        ax_res.set_title('Residual Plot')
+        st.pyplot(fig_res)
+        # Actual vs. Predicted Plot
+        st.write("#### Actual vs. Predicted Plot")
+        fig_ap, ax_ap = plt.subplots()
+        ax_ap.scatter(y_test, y_pred)
+        ax_ap.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2) # Diagonal line
+        ax_ap.set_xlabel('Actual Values')
+        ax_ap.set_ylabel('Predicted Values')
+        ax_ap.set_title('Actual vs. Predicted Plot')
+        st.pyplot(fig_ap)
+    # st.subheader("Cross-Validation Score Details")
+    # if st.session_state.model_scores:
+    #     cv_scores_df = pd.DataFrame({
+    #         'Model': list(st.session_state.model_scores.keys()),
+    #         'CV Mean Score': [v.get('CV Mean Score', 'N/A') for v in st.session_state.model_scores.values()]
+    #     })
+    #     st.dataframe(cv_scores_df.round(4), use_container_width=True)
+    # else:
+    #     st.info("No cross-validation scores available.")
 def explainability_page():
     st.header("🔍 Model Explainability (SHAP)")
     with st.spinner("Generating SHAP explanations..."):
         try:
             # SHAP Explainer
+            try:
+                # Check for the newly added models first
+                if isinstance(best_model, (xgb.XGBClassifier, xgb.XGBRegressor,
+                                          lgb.LGBMClassifier, lgb.LGBMRegressor,
+                                          cb.CatBoostClassifier, cb.CatBoostRegressor)):
+                    st.info(f"Using TreeExplainer for {best_model_name}")
+                    explainer = shap.TreeExplainer(best_model)
+                elif isinstance(best_model, (RandomForestClassifier, GradientBoostingClassifier, DecisionTreeClassifier,
+                                          RandomForestRegressor, GradientBoostingRegressor, DecisionTreeRegressor)):
+                    explainer = shap.TreeExplainer(best_model)
+                elif isinstance(best_model, (LogisticRegression, LinearRegression, Ridge, ElasticNet)):
+                    explainer = shap.LinearExplainer(best_model, X_test_df) # Pass data for LinearExplainer
+                elif isinstance(best_model, (SVC, SVR, KNeighborsClassifier, KNeighborsRegressor, GaussianNB)):
+                    # These model types have no Tree/Linear explainer branch above, so use the
+                    # model-agnostic KernelExplainer. It can be slow, so the background data is
+                    # limited to a sample of at most 100 rows from X_train.
+                    background_data = shap.sample(st.session_state.X_train, min(100, len(st.session_state.X_train)))
+                    if isinstance(background_data, np.ndarray):
+                        background_data = pd.DataFrame(background_data, columns=X_test_df.columns)
+                    explainer = shap.KernelExplainer(best_model.predict_proba if hasattr(best_model, 'predict_proba') else best_model.predict, background_data)
+                else:
+                    st.warning(f"SHAP explanations might not be optimized for the model type '{best_model_name}'. Using KernelExplainer as fallback.")
+                    # Fallback to KernelExplainer for unknown model types
+                    background_data = shap.sample(st.session_state.X_train, min(100, len(st.session_state.X_train)))
+                    if isinstance(background_data, np.ndarray):
+                        background_data = pd.DataFrame(background_data, columns=X_test_df.columns)
+                    predict_fn = best_model.predict_proba if hasattr(best_model, 'predict_proba') and st.session_state.problem_type == "Classification" else best_model.predict
+                    explainer = shap.KernelExplainer(predict_fn, background_data)
+            except Exception as e:
+                display_error(e, f"Error creating SHAP explainer for {best_model_name}")
+                st.error(f"SHAP explanations are currently not supported for the model type '{best_model_name}'. We are working on expanding compatibility.")
                 return
             shap_values = explainer.shap_values(X_test_df)
                 st.metric("Predicted Value", f"{predicted:.2f}")
         except Exception as e:
+            display_error(e, "An error occurred while generating SHAP explanations")
 def model_export_page():
     st.header("💾 Model Export")
     steps = []
     if st.session_state.scaler:
         steps.append(('scaler', st.session_state.scaler))
+    # Check if the model is one of the newly added models
+    is_new_model = isinstance(best_model, (xgb.XGBClassifier, xgb.XGBRegressor,
+                                         lgb.LGBMClassifier, lgb.LGBMRegressor,
+                                         cb.CatBoostClassifier, cb.CatBoostRegressor))
+    if is_new_model:
+        st.info(f"Preparing {best_model_name} for export. These advanced models may require additional libraries when loading.")
+        # Add model-specific export notes
+        if isinstance(best_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
+            st.info("Note: To load this XGBoost model, ensure 'xgboost' is installed in your environment.")
+        elif isinstance(best_model, (lgb.LGBMClassifier, lgb.LGBMRegressor)):
+            st.info("Note: To load this LightGBM model, ensure 'lightgbm' is installed in your environment.")
+        elif isinstance(best_model, (cb.CatBoostClassifier, cb.CatBoostRegressor)):
+            st.info("Note: To load this CatBoost model, ensure 'catboost' is installed in your environment.")
+    try:
+        steps.append(('model', best_model))
+        pipeline_to_export = Pipeline(steps)
+        st.session_state.trained_pipeline = pipeline_to_export
+    except Exception as e:
+        display_error(e, f"Error creating pipeline for {best_model_name}")
+        st.warning("Falling back to exporting model without pipeline wrapper. Some preprocessing steps may need to be applied manually.")
+        st.session_state.trained_pipeline = best_model
     export_format = st.selectbox("Choose export format:", ["Joblib (.joblib)", "Pickle (.pkl)"])
     file_name_suggestion = f"{best_model_name.lower().replace(' ', '_')}_pipeline"
             )
             st.success("Model pipeline ready for download!")
         except Exception as e:
+            display_error(e, "An error occurred while exporting the model pipeline")
     st.subheader("📖 How to use the exported pipeline:")
+    # Determine if the best model is one of the newly added models
+    is_xgboost = isinstance(best_model, (xgb.XGBClassifier, xgb.XGBRegressor))
+    is_lightgbm = isinstance(best_model, (lgb.LGBMClassifier, lgb.LGBMRegressor))
+    is_catboost = isinstance(best_model, (cb.CatBoostClassifier, cb.CatBoostRegressor))
+    # Create code example with appropriate imports based on the model type
+    code_example = f"""import joblib # or import pickle
 import pandas as pd
+"""
+    # Add model-specific imports if needed
+    if is_xgboost:
+        code_example += "import xgboost as xgb  # Required for XGBoost models\n"
+    if is_lightgbm:
+        code_example += "import lightgbm as lgb  # Required for LightGBM models\n"
+    if is_catboost:
+        code_example += "import catboost as cb  # Required for CatBoost models\n"
+    code_example += f"""
 # Load the pipeline
 pipeline = joblib.load('{file_name}{'.joblib' if 'Joblib' in export_format else '.pkl'}')
 # Make predictions
 # predictions = pipeline.predict(new_data)
 # print(predictions)
+# For classification models with probability output
+# if hasattr(pipeline, 'predict_proba'):
+#     probabilities = pipeline.predict_proba(new_data)
+#     print(probabilities)
+"""
+    st.code(code_example, language='python')
+    # Add additional notes for advanced models
+    if is_xgboost or is_lightgbm or is_catboost:
+        st.info("⚠️ Note: When deploying this model in production, ensure all required libraries are installed in your deployment environment.")
+        st.info("💡 Tip: Consider using Docker to create a consistent environment for model deployment.")
+    st.subheader("🚀 Generate Flask API Endpoint")
+    if st.button("Generate Flask API Code", key='generate_flask_api_button'):
+        if st.session_state.trained_pipeline and st.session_state.X_train is not None:
+            # Resolve the export format and file name from session_state, falling back to defaults
+            # (Joblib format, and a file name derived from the best model's name).
+            # TODO: confirm 'current_export_format' / 'current_file_name' are actually written by the
+            # download section; if they are not, the defaults below are always used.
+            current_export_format = st.session_state.get('current_export_format', "Joblib (.joblib)") # Assuming this is stored or re-queried
+            current_file_name = st.session_state.get('current_file_name', f"{st.session_state.best_model_info['name'].lower().replace(' ', '_')}_pipeline")
+            ext_model = ".joblib" if "Joblib" in current_export_format else ".pkl"
+            model_pipeline_name = f"{current_file_name}{ext_model}"
+            flask_app_code = generate_flask_app_code(model_pipeline_name, list(st.session_state.X_train.columns), st.session_state.problem_type, is_xgboost, is_lightgbm, is_catboost)
+            st.code(flask_app_code, language='python')
+            b64_flask_app = base64.b64encode(flask_app_code.encode()).decode()
+            href_flask_app = f'<a href="data:file/text;base64,{b64_flask_app}" download="flask_api_app.py">Download flask_api_app.py</a>'
+            st.markdown(href_flask_app, unsafe_allow_html=True)
+            st.success("Flask API code generated and ready for download!")
+            st.info("Remember to install Flask (`pip install Flask`) and other necessary libraries (e.g., pandas, scikit-learn, joblib, and model-specific libraries) in the environment where you run this Flask app.")
+        else:
+            st.warning("Please ensure a model pipeline is trained and available, and training data (X_train) context exists.")
+# --- Helper function to generate Flask app code ---
+def generate_flask_app_code(model_path, feature_columns, problem_type, is_xgboost, is_lightgbm, is_catboost):
+    imports = [
+        "from flask import Flask, request, jsonify",
+        "import joblib",
+        "import pandas as pd",
+        "import numpy as np"
+    ]
+    if is_xgboost:
+        imports.append("import xgboost as xgb")
+    if is_lightgbm:
+        imports.append("import lightgbm as lgb")
+    if is_catboost:
+        imports.append("import catboost as cb")
+    import_str = "\n".join(imports)

requirements.txt CHANGED Viewed

@@ -6,4 +6,8 @@ shap
 matplotlib
 seaborn
 joblib
-openpyxl # For .xlsx file support

 matplotlib
 seaborn
 joblib
+openpyxl # For .xlsx file support
+xgboost>=1.7.0
+lightgbm>=4.0.0
+catboost>=1.2.0
+Flask