damndeepesh committed on
Commit
52bd7d1
·
verified ·
1 Parent(s): 48d3d8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -71
app.py CHANGED
@@ -23,6 +23,7 @@ import io
23
  import base64
24
  from datetime import datetime
25
  import warnings
 
26
 
27
  warnings.filterwarnings('ignore')
28
 
@@ -35,6 +36,35 @@ st.set_page_config(
35
  )
36
 
37
  # Custom CSS for better styling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # --- Helper Functions ---
39
  def display_error(e, context="An unexpected error occurred"):
40
  """Displays a user-friendly error message."""
@@ -246,29 +276,25 @@ def data_upload_page():
246
  else:
247
  st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
248
 
249
- def preprocess_data(df, target_column, scaling_method="None"):
 
 
 
 
250
  X = df.drop(columns=[target_column])
251
  y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
252
 
253
- # Impute missing values in target variable y
254
  if y.isnull().any():
255
  if st.session_state.problem_type == "Classification":
256
- # For classification, ensure y is int/str before mode imputation if it's float with NaNs
257
- if pd.api.types.is_numeric_dtype(y) and y.nunique() > 2: # Check if it might be a float target for classification
258
- # If it's float and intended for classification, it might have been label encoded already or needs specific handling.
259
- # For now, let's assume if it's numeric and classification, it's likely already encoded or will be handled by LabelEncoder later.
260
- # If it's float due to NaNs, mode might be tricky. Let's ensure it's treated as object for mode for safety.
261
- y_imputer = SimpleImputer(strategy='most_frequent')
262
- y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
263
- else:
264
- y_imputer = SimpleImputer(strategy='most_frequent')
265
- y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
266
  elif st.session_state.problem_type == "Regression":
267
  y_imputer = SimpleImputer(strategy='mean')
268
  y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
269
  st.warning(f"NaN values found and imputed in the target column '{target_column}'.")
270
 
271
- # Impute missing values in features X
272
  num_imputer = SimpleImputer(strategy='mean')
273
  cat_imputer = SimpleImputer(strategy='most_frequent')
274
 
@@ -280,28 +306,92 @@ def preprocess_data(df, target_column, scaling_method="None"):
280
  if len(cat_cols) > 0:
281
  X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
282
 
283
- # Scaling is handled in the model_training_page after splitting, so not here.
284
- # This function will just do imputation and encoding.
285
-
286
- # Encode categorical features
287
  le_dict_features = {}
288
- for col in cat_cols:
 
289
  le = LabelEncoder()
290
  X[col] = le.fit_transform(X[col].astype(str))
291
  le_dict_features[col] = le
292
  st.session_state.le_dict.update(le_dict_features)
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  # Ensure target y is correctly typed after imputation, especially for classification
295
  if st.session_state.problem_type == "Classification" and target_column in st.session_state.le_dict:
296
- # If target was label encoded, ensure it's integer type after imputation
297
- # This might be redundant if LabelEncoder was applied after imputation, but good for safety
298
- pass # y should already be encoded if it was object type initially
299
  elif st.session_state.problem_type == "Classification" and y.dtype == 'float':
300
- # If y is float after mean imputation (e.g. binary 0/1 became float)
301
- # and it's for classification, convert to int if appropriate
302
- # This case should be rare if 'most_frequent' is used for classification target imputation
303
- # However, if it was numeric and became float due to NaNs, then imputed with mean (which is wrong for classification)
304
- # This indicates a logic flaw in imputation strategy selection above. Assuming 'most_frequent' was used.
305
  pass
306
 
307
  return X, y
@@ -321,6 +411,11 @@ def model_training_page():
321
  target = st.session_state.target_column
322
 
323
  st.subheader("Training Configuration")
 
 
 
 
 
324
  col1, col2 = st.columns(2)
325
  # Disable test_size slider if separate test data is provided
326
  disable_test_size = st.session_state.get('source_data_type') == 'separate' and st.session_state.test_data is not None
@@ -363,17 +458,40 @@ def model_training_page():
363
 
364
  if st.session_state.get('source_data_type') == 'separate' and st.session_state.train_data is not None:
365
  df_train_processed = st.session_state.train_data.copy()
366
- X_train, y_train = preprocess_data(df_train_processed, target)
367
 
368
  if st.session_state.test_data is not None:
369
  df_test_processed = st.session_state.test_data.copy()
370
  if target not in df_test_processed.columns:
371
  st.error(f"The target column '{target}' is missing from your test dataset. Please ensure both train and test datasets have the target column with the same name. Aborting training.")
372
  return
373
- X_test, y_test = preprocess_data(df_test_processed, target) # Preprocess test data separately
374
- # Ensure X_test has same columns as X_train after preprocessing (esp. after one-hot encoding if added later)
375
- # For now, LabelEncoder is per-column, SimpleImputer fits on data it sees.
376
- # If one-hot encoding is added, fit on X_train, transform X_test, align columns.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  else: # No test file, split train_data
378
  X_train, X_test, y_train, y_test = train_test_split(
379
  X_train, y_train, test_size=test_size, random_state=random_state,
@@ -381,7 +499,7 @@ def model_training_page():
381
  )
382
  else: # Single file upload
383
  df_processed = st.session_state.data.copy()
384
- X, y = preprocess_data(df_processed, target)
385
  X_train, X_test, y_train, y_test = train_test_split(
386
  X, y, test_size=test_size, random_state=random_state,
387
  stratify=(y if st.session_state.problem_type == "Classification" else None)
@@ -1007,44 +1125,37 @@ pipeline = joblib.load('{file_name}{'.joblib' if 'Joblib' in export_format else
1007
  st.info("⚠️ Note: When deploying this model in production, ensure all required libraries are installed in your deployment environment.")
1008
  st.info("💡 Tip: Consider using Docker to create a consistent environment for model deployment.")
1009
 
1010
- st.subheader("🚀 Generate Flask API Endpoint")
1011
- if st.button("Generate Flask API Code", key='generate_flask_api_button'):
1012
- if st.session_state.trained_pipeline and st.session_state.X_train is not None:
1013
- # Ensure file_name and ext are defined in this scope, might need to get them from session_state or re-evaluate
1014
- # For simplicity, let's assume they are available or we use a default/placeholder
1015
- # This part might need adjustment based on how file_name and ext are handled in the download section
1016
- current_export_format = st.session_state.get('current_export_format', "Joblib (.joblib)") # Assuming this is stored or re-queried
1017
- current_file_name = st.session_state.get('current_file_name', f"{st.session_state.best_model_info['name'].lower().replace(' ', '_')}_pipeline")
1018
-
1019
- ext_model = ".joblib" if "Joblib" in current_export_format else ".pkl"
1020
- model_pipeline_name = f"{current_file_name}{ext_model}"
1021
-
1022
- flask_app_code = generate_flask_app_code(model_pipeline_name, list(st.session_state.X_train.columns), st.session_state.problem_type, is_xgboost, is_lightgbm, is_catboost)
1023
-
1024
- st.code(flask_app_code, language='python')
1025
-
1026
- b64_flask_app = base64.b64encode(flask_app_code.encode()).decode()
1027
- href_flask_app = f'<a href="data:file/text;base64,{b64_flask_app}" download="flask_api_app.py">Download flask_api_app.py</a>'
1028
- st.markdown(href_flask_app, unsafe_allow_html=True)
1029
- st.success("Flask API code generated and ready for download!")
1030
- st.info("Remember to install Flask (`pip install Flask`) and other necessary libraries (e.g., pandas, scikit-learn, joblib, and model-specific libraries) in the environment where you run this Flask app.")
1031
- else:
1032
- st.warning("Please ensure a model pipeline is trained and available, and training data (X_train) context exists.")
1033
 
 
 
 
 
1034
 
1035
- # --- Helper function to generate Flask app code ---
1036
- def generate_flask_app_code(model_path, feature_columns, problem_type, is_xgboost, is_lightgbm, is_catboost):
1037
- imports = [
1038
- "from flask import Flask, request, jsonify",
1039
- "import joblib",
1040
- "import pandas as pd",
1041
- "import numpy as np"
1042
- ]
1043
- if is_xgboost:
1044
- imports.append("import xgboost as xgb")
1045
- if is_lightgbm:
1046
- imports.append("import lightgbm as lgb")
1047
- if is_catboost:
1048
- imports.append("import catboost as cb")
1049
-
1050
- import_str = "\n".join(imports)
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  import base64
24
  from datetime import datetime
25
  import warnings
26
+ import featuretools as ft # Added featuretools import
27
 
28
  warnings.filterwarnings('ignore')
29
 
 
36
  )
37
 
38
  # Custom CSS for better styling
39
+ st.markdown("""
40
+ <style>
41
+ .main-header {
42
+ font-size: 2.5rem;
43
+ color: #1f77b4;
44
+ text-align: center;
45
+ margin-bottom: 2rem;
46
+ }
47
+ .metric-card {
48
+ background-color: #f0f2f6;
49
+ padding: 1rem;
50
+ border-radius: 0.5rem;
51
+ margin: 0.5rem 0;
52
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
53
+ }
54
+ .success-message {
55
+ background-color: #d4edda;
56
+ color: #155724;
57
+ padding: 1rem;
58
+ border-radius: 0.5rem;
59
+ border: 1px solid #c3e6cb;
60
+ }
61
+ .stButton>button {
62
+ width: 100%;
63
+ border-radius: 0.5rem;
64
+ }
65
+ </style>
66
+ """, unsafe_allow_html=True)
67
+
68
  # --- Helper Functions ---
69
  def display_error(e, context="An unexpected error occurred"):
70
  """Displays a user-friendly error message."""
 
276
  else:
277
  st.info("👆 Please upload a CSV or Excel file (or separate train/test files) to get started.")
278
 
279
+ # Add a checkbox for enabling feature engineering in the sidebar or a relevant section
280
+ # This might be better placed in the model_training_page or a new 'Feature Engineering' page/section
281
+ # For now, let's assume we add it to the model_training_page configuration area.
282
+
283
+ def preprocess_data(df, target_column, perform_feature_engineering=False):
284
  X = df.drop(columns=[target_column])
285
  y = df[target_column].copy() # Use .copy() to avoid SettingWithCopyWarning
286
 
287
+ # --- Existing Imputation Logic for Target (y) ---
288
  if y.isnull().any():
289
  if st.session_state.problem_type == "Classification":
290
+ y_imputer = SimpleImputer(strategy='most_frequent')
291
+ y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
 
 
 
 
 
 
 
 
292
  elif st.session_state.problem_type == "Regression":
293
  y_imputer = SimpleImputer(strategy='mean')
294
  y[:] = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
295
  st.warning(f"NaN values found and imputed in the target column '{target_column}'.")
296
 
297
+ # --- Existing Imputation Logic for Features (X) ---
298
  num_imputer = SimpleImputer(strategy='mean')
299
  cat_imputer = SimpleImputer(strategy='most_frequent')
300
 
 
306
  if len(cat_cols) > 0:
307
  X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
308
 
309
+ # --- Existing Encoding Logic for Categorical Features (X) ---
 
 
 
310
  le_dict_features = {}
311
+ original_object_cols = X.select_dtypes(include='object').columns # Re-select after imputation
312
+ for col in original_object_cols: # Iterate over original object columns that are now imputed
313
  le = LabelEncoder()
314
  X[col] = le.fit_transform(X[col].astype(str))
315
  le_dict_features[col] = le
316
  st.session_state.le_dict.update(le_dict_features)
317
 
318
+ # --- Automated Feature Engineering with Featuretools (New) ---
319
+ if perform_feature_engineering:
320
+ with st.spinner("Performing automated feature engineering..."):
321
+ try:
322
+ # Create an EntitySet
323
+ es = ft.EntitySet(id='dataset')
324
+
325
+ # Add the dataframe as an entity.
326
+ # We need a unique index. If 'index' is not a column, reset index.
327
+ if 'index' not in X.columns:
328
+ X_ft = X.reset_index()
329
+ entity_index = 'index'
330
+ else: # if 'index' column already exists and is unique
331
+ X_ft = X.copy()
332
+ entity_index = 'index'
333
+ if not X_ft[entity_index].is_unique:
334
+ st.warning("Featuretools: 'index' column exists but is not unique. Resetting index for feature engineering.")
335
+ X_ft = X.reset_index()
336
+ entity_index = 'index'
337
+
338
+ es = es.add_dataframe(
339
+ dataframe_name='data_table',
340
+ dataframe=X_ft,
341
+ index=entity_index, # Ensure this column is unique
342
+ # time_index='your_time_column_if_any', # Specify if you have a time index
343
+ # logical_types={col: ft.variable_types.Categorical for col in cat_cols} # Optional: specify logical types
344
+ )
345
+
346
+ # Run Deep Feature Synthesis (DFS)
347
+ # You might want to limit trans_primitives or agg_primitives for speed
348
+ feature_matrix, feature_defs = ft.dfs(
349
+ entityset=es,
350
+ target_dataframe_name='data_table',
351
+ # agg_primitives=["mean", "sum", "mode", "std", "max", "min", "count"], # Example primitives
352
+ # trans_primitives=["day", "month", "year", "weekday", "time_since_previous"], # Example primitives
353
+ max_depth=1, # Keep max_depth low initially for speed
354
+ verbose=0, # Set to 1 for more output
355
+ n_jobs=1 # Can be set to -1 to use all cores, but might be slow in Streamlit
356
+ )
357
+ st.success(f"Featuretools generated {feature_matrix.shape[1] - X_ft.shape[1]} new features.")
358
+
359
+ # Featuretools might change column types (e.g., bool to int). Ensure consistency.
360
+ # Also, it might re-introduce object types if not handled carefully with logical_types.
361
+ # For simplicity, we'll try to convert new boolean columns to int and re-encode any new object columns.
362
+ new_cols = [col for col in feature_matrix.columns if col not in X_ft.columns and col != entity_index]
363
+ for col in new_cols:
364
+ if feature_matrix[col].dtype == 'bool':
365
+ feature_matrix[col] = feature_matrix[col].astype(int)
366
+ elif feature_matrix[col].dtype == 'object':
367
+ # This shouldn't happen often with default primitives if input was numeric/encoded
368
+ # But if it does, re-encode
369
+ le = LabelEncoder()
370
+ feature_matrix[col] = le.fit_transform(feature_matrix[col].astype(str))
371
+ st.session_state.le_dict[col] = le # Store new encoder
372
+ st.info(f"Featuretools created new object column '{col}', which has been label encoded.")
373
+
374
+ X = feature_matrix.copy()
375
+ if entity_index in X.columns and entity_index != 'index': # if original index was not 'index'
376
+ X = X.drop(columns=[entity_index])
377
+ elif entity_index == 'index' and 'index' in X.columns and X.index.name == 'index':
378
+ # If 'index' was created by reset_index and is now the df index, it's fine.
379
+ # If 'index' is a column AND the df index, drop the column to avoid duplication.
380
+ if 'index' in X.columns and X.index.name == 'index':
381
+ X = X.drop(columns=['index'])
382
+
383
+ st.write("Preview of data after feature engineering (first 5 rows, up to 10 columns):")
384
+ st.dataframe(X.head().iloc[:, :10])
385
+
386
+ except Exception as e:
387
+ st.error(f"Error during automated feature engineering: {e}")
388
+ st.warning("Skipping automated feature engineering due to error.")
389
+
390
+ # --- Existing Target Type Handling (y) ---
391
  # Ensure target y is correctly typed after imputation, especially for classification
392
  if st.session_state.problem_type == "Classification" and target_column in st.session_state.le_dict:
393
+ pass
 
 
394
  elif st.session_state.problem_type == "Classification" and y.dtype == 'float':
 
 
 
 
 
395
  pass
396
 
397
  return X, y
 
411
  target = st.session_state.target_column
412
 
413
  st.subheader("Training Configuration")
414
+ # --- Add Feature Engineering Checkbox Here ---
415
+ perform_feature_engineering_cb = st.checkbox("Enable Automated Feature Engineering (Featuretools)", value=False, key='feature_engineering_cb',
416
+ help="Automatically generate new features. This can take time and significantly increase the number of features.")
417
+ st.session_state.perform_feature_engineering = perform_feature_engineering_cb
418
+
419
  col1, col2 = st.columns(2)
420
  # Disable test_size slider if separate test data is provided
421
  disable_test_size = st.session_state.get('source_data_type') == 'separate' and st.session_state.test_data is not None
 
458
 
459
  if st.session_state.get('source_data_type') == 'separate' and st.session_state.train_data is not None:
460
  df_train_processed = st.session_state.train_data.copy()
461
+ X_train, y_train = preprocess_data(df_train_processed, target, st.session_state.get('perform_feature_engineering', False))
462
 
463
  if st.session_state.test_data is not None:
464
  df_test_processed = st.session_state.test_data.copy()
465
  if target not in df_test_processed.columns:
466
  st.error(f"The target column '{target}' is missing from your test dataset. Please ensure both train and test datasets have the target column with the same name. Aborting training.")
467
  return
468
+ # Pass perform_feature_engineering=False for test data, as features should be derived from training data structure
469
+ # or apply transforms derived from training data. For simplicity now, we don't re-run DFS on test.
470
+ # A more robust approach would be to save feature definitions from training and apply to test.
471
+ X_test, y_test = preprocess_data(df_test_processed, target, perform_feature_engineering=False)
472
+
473
+ # Align columns after feature engineering (if it happened on train)
474
+ # This is crucial if featuretools was run on X_train only
475
+ if st.session_state.get('perform_feature_engineering', False):
476
+ st.write("Aligning columns between training and testing sets after feature engineering...")
477
+ train_cols = X_train.columns
478
+ test_cols = X_test.columns
479
+
480
+ # Columns in train but not in test (add them to test, fill with 0 or median/mode)
481
+ for col in train_cols:
482
+ if col not in test_cols:
483
+ X_test[col] = 0 # Or a more sophisticated fill value
484
+
485
+ # Columns in test but not in train (remove them from test)
486
+ # This case is less likely if feature engineering is only on train
487
+ cols_to_drop_from_test = [col for col in test_cols if col not in train_cols]
488
+ if cols_to_drop_from_test:
489
+ X_test = X_test.drop(columns=cols_to_drop_from_test)
490
+
491
+ # Ensure order is the same
492
+ X_test = X_test[train_cols]
493
+ st.info(f"Test set columns aligned. X_test shape: {X_test.shape}")
494
+
495
  else: # No test file, split train_data
496
  X_train, X_test, y_train, y_test = train_test_split(
497
  X_train, y_train, test_size=test_size, random_state=random_state,
 
499
  )
500
  else: # Single file upload
501
  df_processed = st.session_state.data.copy()
502
+ X, y = preprocess_data(df_processed, target, st.session_state.get('perform_feature_engineering', False))
503
  X_train, X_test, y_train, y_test = train_test_split(
504
  X, y, test_size=test_size, random_state=random_state,
505
  stratify=(y if st.session_state.problem_type == "Classification" else None)
 
1125
  st.info("⚠️ Note: When deploying this model in production, ensure all required libraries are installed in your deployment environment.")
1126
  st.info("💡 Tip: Consider using Docker to create a consistent environment for model deployment.")
1127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1128
 
1129
+ # --- Main Application ---
1130
+ def main():
1131
+ init_session_state()
1132
+ st.markdown('<h1 class="main-header">🤖 AutoML & Explainability Platform</h1>', unsafe_allow_html=True)
1133
 
1134
+ st.sidebar.title("⚙️ Workflow")
1135
+ page_options = ["Data Upload & Preview", "Model Training", "Model Comparison", "Explainability", "Model Export"]
1136
+
1137
+ # Handle auto-run navigation
1138
+ if st.session_state.get('auto_run_triggered') and st.session_state.target_column:
1139
+ st.session_state.auto_run_triggered = False # Reset trigger
1140
+ st.session_state.current_page = "Model Training"
1141
+ st.session_state.auto_run_triggered_for_training = True # Signal model_training_page to auto-start
1142
+
1143
+ if 'current_page' not in st.session_state:
1144
+ st.session_state.current_page = "Data Upload & Preview"
1145
+
1146
+ page = st.sidebar.radio("Navigate", page_options, key='navigation_radio', index=page_options.index(st.session_state.current_page))
1147
+ st.session_state.current_page = page # Update current page based on user selection
1148
+
1149
+ if page == "Data Upload & Preview":
1150
+ data_upload_page()
1151
+ elif page == "Model Training":
1152
+ model_training_page()
1153
+ elif page == "Model Comparison":
1154
+ model_comparison_page()
1155
+ elif page == "Explainability":
1156
+ explainability_page()
1157
+ elif page == "Model Export":
1158
+ model_export_page()
1159
+
1160
+ if __name__ == "__main__":
1161
+ main()