damndeepesh committed on
Commit
52bd7d1
·
verified ·
1 Parent(s): 48d3d8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -71
app.py CHANGED
@@ -23,6 +23,7 @@ import io
23
  import base64
24
  from datetime import datetime
25
  import warnings
 
26
 
27
  warnings.filterwarnings('ignore')
28
 
@@ -35,6 +36,35 @@ st.set_page_config(
35
  )
36
 
37
  # Custom CSS for better styling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # --- Helper Functions ---
39
  def display_error(e, context="An unexpected error occurred"):
40
  """Displays a user-friendly error message."""
@@ -246,29 +276,25 @@ def data_upload_page():
246
  else:
247
  st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
248
 
249
- def preprocess_data(df, target_column, scaling_method="None"):
 
 
 
 
250
  X = df.drop(columns=[target_column])
251
  y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
252
 
253
- # Impute missing values in target variable y
254
  if y.isnull().any():
255
  if st.session_state.problem_type == "Classification":
256
- # For classification, ensure y is int/str before mode imputation if it's float with NaNs
257
- if pd.api.types.is_numeric_dtype(y) and y.nunique() > 2: # Check if it might be a float target for classification
258
- # If it's float and intended for classification, it might have been label encoded already or needs specific handling.
259
- # For now, let's assume if it's numeric and classification, it's likely already encoded or will be handled by LabelEncoder later.
260
- # If it's float due to NaNs, mode might be tricky. Let's ensure it's treated as object for mode for safety.
261
- y_imputer = SimpleImputer(strategy='most_frequent')
262
- y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
263
- else:
264
- y_imputer = SimpleImputer(strategy='most_frequent')
265
- y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
266
  elif st.session_state.problem_type == "Regression":
267
  y_imputer = SimpleImputer(strategy='mean')
268
  y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
269
  st.warning(f"NaN values found and imputed in the target column '{target_column}'.")
270
 
271
- # Impute missing values in features X
272
  num_imputer = SimpleImputer(strategy='mean')
273
  cat_imputer = SimpleImputer(strategy='most_frequent')
274
 
@@ -280,28 +306,92 @@ def preprocess_data(df, target_column, scaling_method="None"):
280
  if len(cat_cols) > 0:
281
  X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
282
 
283
- # Scaling is handled in the model_training_page after splitting, so not here.
284
- # This function will just do imputation and encoding.
285
-
286
- # Encode categorical features
287
  le_dict_features = {}
288
- for col in cat_cols:
 
289
  le = LabelEncoder()
290
  X[col] = le.fit_transform(X[col].astype(str))
291
  le_dict_features[col] = le
292
  st.session_state.le_dict.update(le_dict_features)
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  # Ensure target y is correctly typed after imputation, especially for classification
295
  if st.session_state.problem_type == "Classification" and target_column in st.session_state.le_dict:
296
- # If target was label encoded, ensure it's integer type after imputation
297
- # This might be redundant if LabelEncoder was applied after imputation, but good for safety
298
- pass # y should already be encoded if it was object type initially
299
  elif st.session_state.problem_type == "Classification" and y.dtype == 'float':
300
- # If y is float after mean imputation (e.g. binary 0/1 became float)
301
- # and it's for classification, convert to int if appropriate
302
- # This case should be rare if 'most_frequent' is used for classification target imputation
303
- # However, if it was numeric and became float due to NaNs, then imputed with mean (which is wrong for classification)
304
- # This indicates a logic flaw in imputation strategy selection above. Assuming 'most_frequent' was used.
305
  pass
306
 
307
  return X, y
@@ -321,6 +411,11 @@ def model_training_page():
321
  target = st.session_state.target_column
322
 
323
  st.subheader("Training Configuration")
 
 
 
 
 
324
  col1, col2 = st.columns(2)
325
  # Disable test_size slider if separate test data is provided
326
  disable_test_size = st.session_state.get('source_data_type') == 'separate' and st.session_state.test_data is not None
@@ -363,17 +458,40 @@ def model_training_page():
363
 
364
  if st.session_state.get('source_data_type') == 'separate' and st.session_state.train_data is not None:
365
  df_train_processed = st.session_state.train_data.copy()
366
- X_train, y_train = preprocess_data(df_train_processed, target)
367
 
368
  if st.session_state.test_data is not None:
369
  df_test_processed = st.session_state.test_data.copy()
370
  if target not in df_test_processed.columns:
371
  st.error(f"The target column '{target}' is missing from your test dataset. Please ensure both train and test datasets have the target column with the same name. Aborting training.")
372
  return
373
- X_test, y_test = preprocess_data(df_test_processed, target) # Preprocess test data separately
374
- # Ensure X_test has same columns as X_train after preprocessing (esp. after one-hot encoding if added later)
375
- # For now, LabelEncoder is per-column, SimpleImputer fits on data it sees.
376
- # If one-hot encoding is added, fit on X_train, transform X_test, align columns.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  else: # No test file, split train_data
378
  X_train, X_test, y_train, y_test = train_test_split(
379
  X_train, y_train, test_size=test_size, random_state=random_state,
@@ -381,7 +499,7 @@ def model_training_page():
381
  )
382
  else: # Single file upload
383
  df_processed = st.session_state.data.copy()
384
- X, y = preprocess_data(df_processed, target)
385
  X_train, X_test, y_train, y_test = train_test_split(
386
  X, y, test_size=test_size, random_state=random_state,
387
  stratify=(y if st.session_state.problem_type == "Classification" else None)
@@ -1007,44 +1125,37 @@ pipeline = joblib.load('{file_name}{'.joblib' if 'Joblib' in export_format else
1007
  st.info("⚠️ Note: When deploying this model in production, ensure all required libraries are installed in your deployment environment.")
1008
  st.info("💡 Tip: Consider using Docker to create a consistent environment for model deployment.")
1009
 
1010
- st.subheader("🚀 Generate Flask API Endpoint")
1011
- if st.button("Generate Flask API Code", key='generate_flask_api_button'):
1012
- if st.session_state.trained_pipeline and st.session_state.X_train is not None:
1013
- # Ensure file_name and ext are defined in this scope, might need to get them from session_state or re-evaluate
1014
- # For simplicity, let's assume they are available or we use a default/placeholder
1015
- # This part might need adjustment based on how file_name and ext are handled in the download section
1016
- current_export_format = st.session_state.get('current_export_format', "Joblib (.joblib)") # Assuming this is stored or re-queried
1017
- current_file_name = st.session_state.get('current_file_name', f"{st.session_state.best_model_info['name'].lower().replace(' ', '_')}_pipeline")
1018
-
1019
- ext_model = ".joblib" if "Joblib" in current_export_format else ".pkl"
1020
- model_pipeline_name = f"{current_file_name}{ext_model}"
1021
-
1022
- flask_app_code = generate_flask_app_code(model_pipeline_name, list(st.session_state.X_train.columns), st.session_state.problem_type, is_xgboost, is_lightgbm, is_catboost)
1023
-
1024
- st.code(flask_app_code, language='python')
1025
-
1026
- b64_flask_app = base64.b64encode(flask_app_code.encode()).decode()
1027
- href_flask_app = f'<a href="data:file/text;base64,{b64_flask_app}" download="flask_api_app.py">Download flask_api_app.py</a>'
1028
- st.markdown(href_flask_app, unsafe_allow_html=True)
1029
- st.success("Flask API code generated and ready for download!")
1030
- st.info("Remember to install Flask (`pip install Flask`) and other necessary libraries (e.g., pandas, scikit-learn, joblib, and model-specific libraries) in the environment where you run this Flask app.")
1031
- else:
1032
- st.warning("Please ensure a model pipeline is trained and available, and training data (X_train) context exists.")
1033
 
 
 
 
 
1034
 
1035
- # --- Helper function to generate Flask app code ---
1036
- def generate_flask_app_code(model_path, feature_columns, problem_type, is_xgboost, is_lightgbm, is_catboost):
1037
- imports = [
1038
- "from flask import Flask, request, jsonify",
1039
- "import joblib",
1040
- "import pandas as pd",
1041
- "import numpy as np"
1042
- ]
1043
- if is_xgboost:
1044
- imports.append("import xgboost as xgb")
1045
- if is_lightgbm:
1046
- imports.append("import lightgbm as lgb")
1047
- if is_catboost:
1048
- imports.append("import catboost as cb")
1049
-
1050
- import_str = "\n".join(imports)
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  import base64
24
  from datetime import datetime
25
  import warnings
26
+ import featuretools as ft # Added featuretools import
27
 
28
  warnings.filterwarnings('ignore')
29
 
 
36
  )
37
 
38
  # Custom CSS for better styling
39
+ st.markdown("""
40
+ <style>
41
+ .main-header {
42
+ font-size: 2.5rem;
43
+ color: #1f77b4;
44
+ text-align: center;
45
+ margin-bottom: 2rem;
46
+ }
47
+ .metric-card {
48
+ background-color: #f0f2f6;
49
+ padding: 1rem;
50
+ border-radius: 0.5rem;
51
+ margin: 0.5rem 0;
52
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
53
+ }
54
+ .success-message {
55
+ background-color: #d4edda;
56
+ color: #155724;
57
+ padding: 1rem;
58
+ border-radius: 0.5rem;
59
+ border: 1px solid #c3e6cb;
60
+ }
61
+ .stButton>button {
62
+ width: 100%;
63
+ border-radius: 0.5rem;
64
+ }
65
+ </style>
66
+ """, unsafe_allow_html=True)
67
+
68
  # --- Helper Functions ---
69
  def display_error(e, context="An unexpected error occurred"):
70
  """Displays a user-friendly error message."""
 
276
  else:
277
  st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
278
 
279
+ # Add a checkbox for enabling feature engineering in the sidebar or a relevant section
280
+ # This might be better placed in the model_training_page or a new 'Feature Engineering' page/section
281
+ # For now, let's assume we add it to the model_training_page configuration area.
282
+
283
+ def preprocess_data(df, target_column, perform_feature_engineering=False):
284
  X = df.drop(columns=[target_column])
285
  y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
286
 
287
+ # --- Existing Imputation Logic for Target (y) ---
288
  if y.isnull().any():
289
  if st.session_state.problem_type == "Classification":
290
+ y_imputer = SimpleImputer(strategy='most_frequent')
291
+ y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
 
 
 
 
 
 
 
 
292
  elif st.session_state.problem_type == "Regression":
293
  y_imputer = SimpleImputer(strategy='mean')
294
  y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
295
  st.warning(f"NaN values found and imputed in the target column '{target_column}'.")
296
 
297
+ # --- Existing Imputation Logic for Features (X) ---
298
  num_imputer = SimpleImputer(strategy='mean')
299
  cat_imputer = SimpleImputer(strategy='most_frequent')
300
 
 
306
  if len(cat_cols) > 0:
307
  X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
308
 
309
+ # --- Existing Encoding Logic for Categorical Features (X) ---
 
 
 
310
  le_dict_features = {}
311
+ original_object_cols = X.select_dtypes(include='object').columns # Re-select after imputation
312
+ for col in original_object_cols: # Iterate over original object columns that are now imputed
313
  le = LabelEncoder()
314
  X[col] = le.fit_transform(X[col].astype(str))
315
  le_dict_features[col] = le
316
  st.session_state.le_dict.update(le_dict_features)
317
 
318
+ # --- Automated Feature Engineering with Featuretools (New) ---
319
+ if perform_feature_engineering:
320
+ with st.spinner("Performing automated feature engineering..."):
321
+ try:
322
+ # Create an EntitySet
323
+ es = ft.EntitySet(id='dataset')
324
+
325
+ # Add the dataframe as an entity.
326
+ # We need a unique index. If 'index' is not a column, reset index.
327
+ if 'index' not in X.columns:
328
+ X_ft = X.reset_index()
329
+ entity_index = 'index'
330
+ else: # if 'index' column already exists and is unique
331
+ X_ft = X.copy()
332
+ entity_index = 'index'
333
+ if not X_ft[entity_index].is_unique:
334
+ st.warning("Featuretools: 'index' column exists but is not unique. Resetting index for feature engineering.")
335
+ X_ft = X.reset_index()
336
+ entity_index = 'index'
337
+
338
+ es = es.add_dataframe(
339
+ dataframe_name='data_table',
340
+ dataframe=X_ft,
341
+ index=entity_index, # Ensure this column is unique
342
+ # time_index='your_time_column_if_any', # Specify if you have a time index
343
+ # logical_types={col: ft.variable_types.Categorical for col in cat_cols} # Optional: specify logical types
344
+ )
345
+
346
+ # Run Deep Feature Synthesis (DFS)
347
+ # You might want to limit trans_primitives or agg_primitives for speed
348
+ feature_matrix, feature_defs = ft.dfs(
349
+ entityset=es,
350
+ target_dataframe_name='data_table',
351
+ # agg_primitives=["mean", "sum", "mode", "std", "max", "min", "count"], # Example primitives
352
+ # trans_primitives=["day", "month", "year", "weekday", "time_since_previous"], # Example primitives
353
+ max_depth=1, # Keep max_depth low initially for speed
354
+ verbose=0, # Set to 1 for more output
355
+ n_jobs=1 # Can be set to -1 to use all cores, but might be slow in Streamlit
356
+ )
357
+ st.success(f"Featuretools generated {feature_matrix.shape[1] - X_ft.shape[1]} new features.")
358
+
359
+ # Featuretools might change column types (e.g., bool to int). Ensure consistency.
360
+ # Also, it might re-introduce object types if not handled carefully with logical_types.
361
+ # For simplicity, we'll try to convert new boolean columns to int and re-encode any new object columns.
362
+ new_cols = [col for col in feature_matrix.columns if col not in X_ft.columns and col != entity_index]
363
+ for col in new_cols:
364
+ if feature_matrix[col].dtype == 'bool':
365
+ feature_matrix[col] = feature_matrix[col].astype(int)
366
+ elif feature_matrix[col].dtype == 'object':
367
+ # This shouldn't happen often with default primitives if input was numeric/encoded
368
+ # But if it does, re-encode
369
+ le = LabelEncoder()
370
+ feature_matrix[col] = le.fit_transform(feature_matrix[col].astype(str))
371
+ st.session_state.le_dict[col] = le # Store new encoder
372
+ st.info(f"Featuretools created new object column '{col}', which has been label encoded.")
373
+
374
+ X = feature_matrix.copy()
375
+ if entity_index in X.columns and entity_index != 'index': # if original index was not 'index'
376
+ X = X.drop(columns=[entity_index])
377
+ elif entity_index == 'index' and 'index' in X.columns and X.index.name == 'index':
378
+ # If 'index' was created by reset_index and is now the df index, it's fine.
379
+ # If 'index' is a column AND the df index, drop the column to avoid duplication.
380
+ if 'index' in X.columns and X.index.name == 'index':
381
+ X = X.drop(columns=['index'])
382
+
383
+ st.write("Preview of data after feature engineering (first 5 rows, up to 10 columns):")
384
+ st.dataframe(X.head().iloc[:, :10])
385
+
386
+ except Exception as e:
387
+ st.error(f"Error during automated feature engineering: {e}")
388
+ st.warning("Skipping automated feature engineering due to error.")
389
+
390
+ # --- Existing Target Type Handling (y) ---
391
  # Ensure target y is correctly typed after imputation, especially for classification
392
  if st.session_state.problem_type == "Classification" and target_column in st.session_state.le_dict:
393
+ pass
 
 
394
  elif st.session_state.problem_type == "Classification" and y.dtype == 'float':
 
 
 
 
 
395
  pass
396
 
397
  return X, y
 
411
  target = st.session_state.target_column
412
 
413
  st.subheader("Training Configuration")
414
+ # --- Add Feature Engineering Checkbox Here ---
415
+ perform_feature_engineering_cb = st.checkbox("Enable Automated Feature Engineering (Featuretools)", value=False, key='feature_engineering_cb',
416
+ help="Automatically generate new features. This can take time and significantly increase the number of features.")
417
+ st.session_state.perform_feature_engineering = perform_feature_engineering_cb
418
+
419
  col1, col2 = st.columns(2)
420
  # Disable test_size slider if separate test data is provided
421
  disable_test_size = st.session_state.get('source_data_type') == 'separate' and st.session_state.test_data is not None
 
458
 
459
  if st.session_state.get('source_data_type') == 'separate' and st.session_state.train_data is not None:
460
  df_train_processed = st.session_state.train_data.copy()
461
+ X_train, y_train = preprocess_data(df_train_processed, target, st.session_state.get('perform_feature_engineering', False))
462
 
463
  if st.session_state.test_data is not None:
464
  df_test_processed = st.session_state.test_data.copy()
465
  if target not in df_test_processed.columns:
466
  st.error(f"The target column '{target}' is missing from your test dataset. Please ensure both train and test datasets have the target column with the same name. Aborting training.")
467
  return
468
+ # Pass perform_feature_engineering=False for test data, as features should be derived from training data structure
469
+ # or apply transforms derived from training data. For simplicity now, we don't re-run DFS on test.
470
+ # A more robust approach would be to save feature definitions from training and apply to test.
471
+ X_test, y_test = preprocess_data(df_test_processed, target, perform_feature_engineering=False)
472
+
473
+ # Align columns after feature engineering (if it happened on train)
474
+ # This is crucial if featuretools was run on X_train only
475
+ if st.session_state.get('perform_feature_engineering', False):
476
+ st.write("Aligning columns between training and testing sets after feature engineering...")
477
+ train_cols = X_train.columns
478
+ test_cols = X_test.columns
479
+
480
+ # Columns in train but not in test (add them to test, fill with 0 or median/mode)
481
+ for col in train_cols:
482
+ if col not in test_cols:
483
+ X_test[col] = 0 # Or a more sophisticated fill value
484
+
485
+ # Columns in test but not in train (remove them from test)
486
+ # This case is less likely if feature engineering is only on train
487
+ cols_to_drop_from_test = [col for col in test_cols if col not in train_cols]
488
+ if cols_to_drop_from_test:
489
+ X_test = X_test.drop(columns=cols_to_drop_from_test)
490
+
491
+ # Ensure order is the same
492
+ X_test = X_test[train_cols]
493
+ st.info(f"Test set columns aligned. X_test shape: {X_test.shape}")
494
+
495
  else: # No test file, split train_data
496
  X_train, X_test, y_train, y_test = train_test_split(
497
  X_train, y_train, test_size=test_size, random_state=random_state,
 
499
  )
500
  else: # Single file upload
501
  df_processed = st.session_state.data.copy()
502
+ X, y = preprocess_data(df_processed, target, st.session_state.get('perform_feature_engineering', False))
503
  X_train, X_test, y_train, y_test = train_test_split(
504
  X, y, test_size=test_size, random_state=random_state,
505
  stratify=(y if st.session_state.problem_type == "Classification" else None)
 
1125
  st.info("⚠️ Note: When deploying this model in production, ensure all required libraries are installed in your deployment environment.")
1126
  st.info("💡 Tip: Consider using Docker to create a consistent environment for model deployment.")
1127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1128
 
1129
+ # --- Main Application ---
1130
+ def main():
1131
+ init_session_state()
1132
+ st.markdown('<h1 class="main-header">🤖 AutoML & Explainability Platform</h1>', unsafe_allow_html=True)
1133
 
1134
+ st.sidebar.title("⚙️ Workflow")
1135
+ page_options = ["Data Upload & Preview", "Model Training", "Model Comparison", "Explainability", "Model Export"]
1136
+
1137
+ # Handle auto-run navigation
1138
+ if st.session_state.get('auto_run_triggered') and st.session_state.target_column:
1139
+ st.session_state.auto_run_triggered = False # Reset trigger
1140
+ st.session_state.current_page = "Model Training"
1141
+ st.session_state.auto_run_triggered_for_training = True # Signal model_training_page to auto-start
1142
+
1143
+ if 'current_page' not in st.session_state:
1144
+ st.session_state.current_page = "Data Upload & Preview"
1145
+
1146
+ page = st.sidebar.radio("Navigate", page_options, key='navigation_radio', index=page_options.index(st.session_state.current_page))
1147
+ st.session_state.current_page = page # Update current page based on user selection
1148
+
1149
+ if page == "Data Upload & Preview":
1150
+ data_upload_page()
1151
+ elif page == "Model Training":
1152
+ model_training_page()
1153
+ elif page == "Model Comparison":
1154
+ model_comparison_page()
1155
+ elif page == "Explainability":
1156
+ explainability_page()
1157
+ elif page == "Model Export":
1158
+ model_export_page()
1159
+
1160
+ if __name__ == "__main__":
1161
+ main()