pavanmutha commited on
Commit
c8f222f
·
verified ·
1 Parent(s): a6f26d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +913 -143
app.py CHANGED
@@ -1,10 +1,16 @@
1
- # Required imports (ensure all are present from previous code)
 
 
 
 
 
2
  import os
3
  import gradio as gr
4
  import pandas as pd
5
  import numpy as np
6
  import matplotlib.pyplot as plt
7
  import shap
 
8
  import lime.lime_tabular
9
  import optuna
10
  import wandb
@@ -13,42 +19,814 @@ import time
13
  import psutil
14
  import shutil
15
  import ast
16
- from smolagents import HfApiModel, CodeAgent
17
  from huggingface_hub import login
18
  from sklearn.model_selection import train_test_split, cross_val_score
19
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
20
  from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
21
  from sklearn.linear_model import LogisticRegression
22
- from sklearn.svm import SVC
23
  from sklearn.preprocessing import LabelEncoder, StandardScaler
24
  from sklearn.pipeline import Pipeline
25
  from datetime import datetime
26
- from PIL import Image
27
  import warnings
28
- import joblib
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- # (Keep all previous setup, functions like clean_data, upload_file, AI Agent, prepare_data, train_and_compare_models)
31
- # ... (paste the previous code here up to the explainability function) ...
 
 
32
 
33
- # --- Model Explainability (REVISED) ---
 
34
 
35
- def explainability(_=None): # Add dummy input for button click signature
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  """Generates SHAP and LIME explanations for the best performing model."""
37
  global split_data_global, best_model_details_global, wandb_run
38
  if split_data_global is None:
39
- print("Error: Data not split. Please run comparison first.")
40
- return [], None, "Error: Data not prepared. Run 'Train & Compare' first." # Return empty list for gallery
41
  if best_model_details_global is None:
42
- print("Error: Best model details not found. Please run comparison first.")
43
- return [], None, "Error: Best model not identified. Run 'Train & Compare' first." # Return empty list
44
 
45
- X_train, X_test, y_train, y_test = split_data_global
46
  best_model_name = best_model_details_global['name']
47
- best_model = best_model_details_global['model'] # Use the stored, already fitted best model
48
 
49
  print(f"--- Generating explanations for the best model: {best_model_name} ---")
50
 
51
- # Define paths dynamically
52
  output_dir = "./explainability_plots"
53
  if os.path.exists(output_dir): shutil.rmtree(output_dir)
54
  os.makedirs(output_dir)
@@ -59,16 +837,15 @@ def explainability(_=None): # Add dummy input for button click signature
59
  status_message = f"Explaining best model: {best_model_name}"
60
  all_shap_paths = [] # Initialize empty list for gallery output
61
 
 
62
  run_name = f"Explain_{best_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
63
  config = {"task": "Explainability", "best_model": best_model_name, "explainers": ["SHAP", "LIME"]}
64
-
65
- # Init separate wandb run for explainability
66
  wandb_run_explain = None
67
  if wandb.run is None or wandb.run.mode != "disabled":
68
  try:
69
- # Ensure previous run is finished if still active from comparison
70
  if wandb.run and wandb.run.id:
71
- print(f"Finishing potentially active WandB run ({wandb.run.id}) before starting explanation run.")
72
  wandb.finish()
73
  wandb_run_explain = wandb.init(project="ai-data-analysis-gradio", name=run_name, config=config, reinit=True)
74
  print(f"WandB run '{run_name}' initialized for Explainability.")
@@ -76,6 +853,7 @@ def explainability(_=None): # Add dummy input for button click signature
76
  print(f"Error initializing Wandb run for Explainability: {e}")
77
  wandb_run_explain = None # Ensure it's None if init fails
78
  else:
 
79
  wandb_run_explain = None
80
 
81
  try:
@@ -83,146 +861,132 @@ def explainability(_=None): # Add dummy input for button click signature
83
  print("Calculating SHAP values...")
84
  shap_values = None
85
  explainer = None
86
- model_for_shap = None # Define variable to hold the model actually used by SHAP
87
- X_test_for_plot = X_test # Default unless subsetting happens
88
 
89
  # Determine explainer type and get SHAP values
90
  if isinstance(best_model, (RandomForestClassifier, GradientBoostingClassifier)):
91
- model_for_shap = best_model # Use directly
92
- explainer = shap.TreeExplainer(model_for_shap)
93
- shap_values = explainer.shap_values(X_test)
 
94
  elif isinstance(best_model, Pipeline):
95
- # Get the final estimator from the pipeline
96
  final_estimator_name, final_estimator = best_model.steps[-1]
97
  print(f"Handling Pipeline. Final estimator: {final_estimator_name} ({type(final_estimator)})")
 
98
 
99
  if isinstance(final_estimator, (RandomForestClassifier, GradientBoostingClassifier)):
100
- # Retrain the tree model part outside the pipeline for SHAP TreeExplainer
101
- print("Note: Retraining tree model component without pipeline for SHAP TreeExplainer.")
102
- model_for_shap = type(final_estimator)(**final_estimator.get_params())
103
- # Apply preprocessing steps if any before fitting
104
- # Simple case: assume only scaling before tree
105
- pipeline_transforms = Pipeline(best_model.steps[:-1])
106
- X_train_transformed = pipeline_transforms.fit_transform(X_train)
107
- X_test_transformed = pipeline_transforms.transform(X_test) # Use transformed data for SHAP
108
- model_for_shap.fit(X_train_transformed, y_train)
109
- explainer = shap.TreeExplainer(model_for_shap)
110
- shap_values = explainer.shap_values(X_test_transformed) # Explain on transformed test data
111
- X_test_for_plot = pd.DataFrame(X_test_transformed, columns=X_test.columns) # Use transformed data for plotting too
 
112
 
113
  elif isinstance(final_estimator, LogisticRegression):
114
  print("Using SHAP KernelExplainer for Logistic Regression in Pipeline (can be slow)...")
115
- # Create a predict_proba function for the *entire* pipeline
116
  predict_proba_pipeline = lambda x: best_model.predict_proba(pd.DataFrame(x, columns=X_test.columns))
117
- # Use a background dataset (summary) - kmeans is common
118
  print("Summarizing training data for KernelExplainer background...")
119
- X_train_summary = shap.kmeans(X_train, min(100, X_train.shape[0])) # Use min to avoid errors on small data
 
120
  explainer = shap.KernelExplainer(predict_proba_pipeline, X_train_summary)
121
- # Use a smaller subset of X_test for KernelExplainer speed
122
  subset_size = min(50, X_test.shape[0])
123
  print(f"Calculating SHAP values for {subset_size} test instances...")
124
- X_test_subset_np = X_test.sample(subset_size, random_state=42).values # Sample before converting to numpy
125
- # X_test_subset_np = shap.sample(X_test.values, subset_size) # Sample numpy array directly
126
- shap_values = explainer.shap_values(X_test_subset_np)
127
  # Create DataFrame from subset for plotting consistency
128
- X_test_for_plot = pd.DataFrame(X_test_subset_np, columns=X_test.columns)
129
  print("SHAP values calculated using KernelExplainer.")
130
-
131
  else:
132
- print(f"Warning: SHAP explainer not implemented for final pipeline step {type(final_estimator)}. Skipping SHAP.")
133
  else:
134
- print(f"Warning: SHAP explainer not explicitly handled for model type {type(best_model)}. Trying KernelExplainer as fallback.")
135
- # Fallback to KernelExplainer (might be slow)
136
- try:
137
- predict_proba_fallback = lambda x: best_model.predict_proba(pd.DataFrame(x, columns=X_test.columns))
138
- X_train_summary = shap.kmeans(X_train, min(100, X_train.shape[0]))
139
- explainer = shap.KernelExplainer(predict_proba_fallback, X_train_summary)
140
- subset_size = min(50, X_test.shape[0])
141
- X_test_subset_np = X_test.sample(subset_size, random_state=42).values
142
- shap_values = explainer.shap_values(X_test_subset_np)
143
- X_test_for_plot = pd.DataFrame(X_test_subset_np, columns=X_test.columns)
144
- print("Used KernelExplainer fallback.")
145
- except Exception as kernel_fallback_e:
146
- print(f"KernelExplainer fallback failed: {kernel_fallback_e}. Skipping SHAP.")
147
 
148
 
149
  # --- Generate SHAP Plots (if shap_values exist) ---
150
  if shap_values is not None:
151
- num_classes = len(np.unique(y_train)) # Get number of classes from original y_train
152
 
153
  # SHAP Summary Plot
154
  print("Generating SHAP summary plot...")
155
- plt.figure(figsize=(10, 6)) # Ensure figure context
156
  try:
157
- plot_shap_values = shap_values # Default
158
  title_suffix = f"({best_model_name})"
 
 
159
  if num_classes == 2 and isinstance(shap_values, list) and len(shap_values) == 2:
160
- plot_shap_values = shap_values[1] # Plot for class 1 in binary case
161
- title_suffix = f"({best_model_name} - Positive Class)"
162
- print("Plotting SHAP summary for positive class (binary)")
 
 
163
  elif num_classes > 2 and isinstance(shap_values, list):
164
  title_suffix = f"({best_model_name} - Multiclass Avg Impact)"
165
  print("Plotting SHAP summary for multiclass (average impact)")
166
 
167
- shap.summary_plot(plot_shap_values, X_test_for_plot, show=False, plot_type="dot")
168
  plt.title(f"SHAP Feature Importance Summary {title_suffix}")
169
  plt.tight_layout()
170
  plt.savefig(shap_summary_path, bbox_inches='tight')
171
  plt.clf()
172
  print(f"SHAP summary plot saved to {shap_summary_path}")
173
- all_shap_paths.append(shap_summary_path) # Add to list for gallery
174
  if wandb_run_explain: wandb.log({"shap_summary": wandb.Image(shap_summary_path)}, commit=False)
175
  except Exception as summary_e:
176
  print(f"Error generating SHAP summary plot: {summary_e}")
177
- plt.clf() # Clear figure even on error
178
 
179
  # SHAP Dependence Plots
180
  print("Calculating global feature importance for dependence plots...")
181
- global_shap_values = None
182
  try:
183
- if isinstance(shap_values, list): # Multi-class or binary list format
184
- # Handle case where list elements might have different shapes (e.g., KernelExplainer?)
185
- abs_shap_arrays = [np.abs(sv) for sv in shap_values if isinstance(sv, np.ndarray) and sv.ndim == 2]
186
- if abs_shap_arrays:
187
- # Ensure all arrays have the same number of features before stacking
188
- min_features = min(arr.shape[1] for arr in abs_shap_arrays)
189
- consistent_shap_arrays = [arr[:, :min_features] for arr in abs_shap_arrays]
190
- stacked_shap = np.stack(consistent_shap_arrays, axis=0) # Shape (n_classes, n_instances, n_features)
191
- global_shap_values = stacked_shap.mean(axis=(0, 1)) # Mean over classes and instances -> (n_features,)
192
- print(f"Calculated global SHAP values (list input), shape: {global_shap_values.shape}")
193
- else:
194
- print("Warning: Could not extract valid 2D arrays from SHAP values list.")
195
-
196
- elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 2: # Regression or binary array format
197
- global_shap_values = np.abs(shap_values).mean(axis=0) # Mean over instances -> (n_features,)
198
- print(f"Calculated global SHAP values (array input), shape: {global_shap_values.shape}")
199
  else:
200
- print(f"Warning: Unexpected SHAP values type/shape for global importance: {type(shap_values)}. Skipping dependence plots.")
201
-
202
  except Exception as gsi_e:
203
  print(f"Error calculating global SHAP importance: {gsi_e}")
204
 
205
- # Generate plots if importance calculated successfully
206
- if global_shap_values is not None and len(global_shap_values) > 0 :
207
  try:
208
- feature_indices = np.argsort(global_shap_values)[::-1] # Indices sorted by importance
209
-
210
- num_features_to_plot = min(2, len(global_shap_values), len(X_test_for_plot.columns)) # Plot top 2 or fewer
211
  if num_features_to_plot > 0:
212
  top_feature_indices = feature_indices[:num_features_to_plot]
213
- top_features = X_test_for_plot.columns[top_feature_indices]
 
 
214
 
215
  print(f"Generating SHAP dependence plots for top features: {list(top_features)}")
216
- for feature_idx, feature_name in zip(top_feature_indices, top_features):
217
  plt.figure(figsize=(8, 5))
218
- # Select appropriate SHAP values for dependence plot
219
- # For binary list, use class 1; for multiclass list, use default (class 0 usually) or specify; for array, use array.
220
  shap_values_for_dep = shap_values
 
221
  if isinstance(shap_values, list):
222
- shap_values_for_dep = shap_values[1] if num_classes == 2 and len(shap_values)==2 else shap_values[0] # Default to class 0 for multi or if binary isn't len 2
 
 
 
 
223
 
224
- shap.dependence_plot(feature_idx, shap_values_for_dep, X_test_for_plot, interaction_index='auto', show=False)
225
- plt.title(f"SHAP Dependence Plot: {feature_name} ({best_model_name})")
 
 
226
  plt.tight_layout()
227
  dep_path = os.path.join(output_dir, f"shap_dependence_{best_model_name}_{feature_name}.png")
228
  plt.savefig(dep_path, bbox_inches='tight')
@@ -232,13 +996,14 @@ def explainability(_=None): # Add dummy input for button click signature
232
  print(f"Saved dependence plot: {dep_path}")
233
  if wandb_run_explain: wandb.log({f"shap_dependence_{feature_name}": wandb.Image(dep_path)}, commit=False)
234
  else:
235
- print("Skipping dependence plots: Not enough features.")
236
  except Exception as dep_e:
237
  print(f"Could not generate SHAP dependence plots: {dep_e}")
238
- plt.clf() # Ensure figure is cleared
 
 
239
  else:
240
  print("Skipping dependence plots due to issue calculating global SHAP values.")
241
-
242
  else:
243
  print("Skipping SHAP plots as SHAP values were not generated.")
244
 
@@ -247,51 +1012,47 @@ def explainability(_=None): # Add dummy input for button click signature
247
  print("Generating LIME explanation for the first test instance...")
248
  try:
249
  predict_fn_lime = None
250
- # Create predict_proba function needed by LIME
251
  if hasattr(best_model, 'predict_proba'):
252
- # Handle numpy vs pandas input for pipeline/model
253
  def predict_proba_wrapper(x_np):
254
- # Convert numpy array back to DataFrame for pipeline/model consistency
255
  x_df = pd.DataFrame(x_np, columns=X_train.columns)
256
  return best_model.predict_proba(x_df)
257
  predict_fn_lime = predict_proba_wrapper
258
  else:
259
- print("Warning: Model does not have predict_proba. LIME might not work as expected.")
260
- # Dummy fallback returning equal probabilities
261
  num_classes_lime = len(np.unique(y_train))
262
  predict_fn_lime = lambda x: np.ones((len(x), num_classes_lime)) / num_classes_lime
263
 
264
- # Get class names
265
- if hasattr(best_model, 'classes_'):
266
- class_names_str = [str(c) for c in best_model.classes_]
267
- else: # Infer from y_train if no classes_ attribute
268
- class_names_str = [str(c) for c in sorted(np.unique(y_train))]
269
 
270
  lime_explainer = lime.lime_tabular.LimeTabularExplainer(
271
- training_data=X_train.values, # LIME needs numpy array for background
272
  feature_names=X_train.columns.tolist(),
273
- class_names=class_names_str,
274
- mode='classification' if len(class_names_str) > 1 else 'regression'
275
  )
276
 
277
  instance_idx = 0
278
- instance_to_explain = X_test.iloc[instance_idx].values # Explain first instance
279
- true_class = y_test[instance_idx] if isinstance(y_test, (np.ndarray, list)) else y_test.iloc[instance_idx] # Get true class safely
 
 
280
 
281
  lime_exp = lime_explainer.explain_instance(
282
  data_row=instance_to_explain,
283
- predict_fn=predict_fn_lime, # Use the wrapper
284
  num_features=10,
285
- num_samples=1000 # Adjust as needed for speed/accuracy
286
  )
287
  print(f"LIME explanation generated for instance {instance_idx}.")
288
 
289
  lime_fig = lime_exp.as_pyplot_figure()
290
- # Attempt to get predicted class label for title
291
- predicted_class_idx = lime_exp.available_labels()[0] # Often the predicted class is first
292
- predicted_class_label = class_names_str[predicted_class_idx]
293
- lime_fig.suptitle(f"LIME Exp (Inst {instance_idx}, True: {true_class}, Pred: {predicted_class_label}, Model: {best_model_name})", y=1.03, fontsize=10)
294
- lime_fig.tight_layout(rect=[0, 0, 1, 0.98]) # Adjust layout
295
  lime_fig.savefig(lime_path, bbox_inches='tight')
296
  plt.clf()
297
  print(f"LIME plot saved to {lime_path}")
@@ -299,30 +1060,33 @@ def explainability(_=None): # Add dummy input for button click signature
299
 
300
  except Exception as lime_e:
301
  print(f"Error generating LIME explanation: {lime_e}")
 
 
302
  if wandb_run_explain: wandb.log({"lime_error": str(lime_e)}, commit=False)
303
  lime_path = None # Indicate failure
304
 
305
  # Final status message
306
- status_message = f"Explanations generated for {best_model_name}. Check plots."
307
- if not all_shap_paths: status_message += " (SHAP failed/skipped)."
308
  if not lime_path: status_message += " (LIME failed/skipped)."
309
 
310
  # Return paths to the plots and status
311
- # Ensure lime_path is valid before returning, otherwise None
312
  valid_lime_path = lime_path if lime_path and os.path.exists(lime_path) else None
313
- return all_shap_paths, valid_lime_path, status_message
 
314
 
315
  except Exception as e:
316
  print(f"An error occurred during explainability: {e}")
317
  import traceback
318
- traceback.print_exc() # Print full traceback for debugging
319
  status_message = f"Error during explanation: {e}"
320
  if wandb_run_explain: wandb_run_explain.finish(exit_code=1)
321
  return [], None, status_message # Return empty list/None for paths on error
322
  finally:
323
  plt.close('all') # Close all matplotlib figures
324
- if wandb_run_explain and wandb.run and wandb.run.id == wandb_run_explain.id: # Check if it's the correct run
325
  try:
 
326
  wandb_run_explain.finish()
327
  print(f"WandB run '{run_name}' finished.")
328
  except Exception as finish_e:
@@ -330,9 +1094,8 @@ def explainability(_=None): # Add dummy input for button click signature
330
  wandb_run_explain = None # Reset
331
 
332
 
333
- # --- Gradio Interface (Keep the same as the previous version) ---
334
- # ... (paste the Gradio Blocks UI code here) ...
335
-
336
  with gr.Blocks(theme=gr.themes.Soft(), title="AI Data Analysis & Model Comparison") as demo:
337
  gr.Markdown(
338
  """
@@ -354,7 +1117,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Data Analysis & Model Compariso
354
  with gr.Row():
355
  with gr.Column(scale=1):
356
  agent_notes = gr.Textbox(label="Optional: Specific requests for the AI Agent", placeholder="e.g., 'Focus on correlations with column X'")
357
- agent_btn = gr.Button("Run AI Analysis", variant="secondary")
358
  with gr.Column(scale=2):
359
  insights_output = gr.HTML(label="AI Agent Analysis Report")
360
  with gr.Row():
@@ -369,7 +1132,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Data Analysis & Model Compariso
369
  optuna_trials_slider = gr.Slider(minimum=5, maximum=50, value=10, step=5, label="Optuna Trials per Model")
370
  compare_btn = gr.Button("Train & Compare Models", variant="primary")
371
  with gr.Column(scale=2):
372
- comparison_output = gr.DataFrame(label="Model Comparison Results (Sorted by F1 Score)", interactive=False)
373
 
374
  # --- Row 4: Model Explainability ---
375
  with gr.Accordion("💡 Step 4: Explain Best Model (SHAP & LIME)", open=False):
@@ -378,41 +1141,47 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Data Analysis & Model Compariso
378
  explain_status = gr.Textbox(label="Explanation Status", interactive=False)
379
  with gr.Row():
380
  # Use Gallery for SHAP as there can be multiple plots
381
- shap_gallery = gr.Gallery(label="SHAP Plots (Summary + Top Feature Dependence)", height=400, object_fit="contain", columns=2, preview=True)
382
  lime_img = gr.Image(label="LIME Explanation (for first test instance)", type="filepath", interactive=False)
383
 
384
 
385
  # --- Connect Components ---
 
 
386
  file_input.change(
387
  fn=upload_file,
388
  inputs=file_input,
389
  outputs=df_output
390
  )
391
 
 
392
  agent_btn.click(
393
  fn=analyze_data,
394
  inputs=[file_input, agent_notes],
395
  outputs=[insights_output, visual_output]
396
  )
397
 
 
398
  compare_btn.click(
399
  fn=train_and_compare_models,
400
  inputs=[tune_rf_checkbox, tune_gb_checkbox, optuna_trials_slider],
401
  outputs=[comparison_output]
402
  )
403
 
 
404
  explain_btn.click(
405
  fn=explainability,
406
  inputs=[], # Uses global best model details
407
  outputs=[shap_gallery, lime_img, explain_status] # Output list of SHAP plots, one LIME plot, and status
408
  )
409
-
410
 
411
  # --- Launch the App ---
412
  if __name__ == "__main__":
 
413
  # Clean up temporary files/dirs from previous runs before launching
414
- temp_dirs = ['./figures', './explainability_plots', './__pycache__'] # Add explainability dir
415
- temp_files = [f for f in os.listdir('.') if f.lower().endswith('.joblib')] # Only remove joblib now
416
 
417
  for d in temp_dirs:
418
  if os.path.exists(d):
@@ -429,5 +1198,6 @@ if __name__ == "__main__":
429
  except Exception as e:
430
  print(f"Warning: Could not clean up file {f}: {e}")
431
 
432
-
433
- demo.launch(debug=False) # Turn debug=True if you need Gradio's traceback
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Gradio App for AI Data Analysis, Model Comparison, and Explainability
4
+ Requires: HF_TOKEN and WANDB_API_KEY environment variables.
5
+ """
6
+
7
  import os
8
  import gradio as gr
9
  import pandas as pd
10
  import numpy as np
11
  import matplotlib.pyplot as plt
12
  import shap
13
+ import lime
14
  import lime.lime_tabular
15
  import optuna
16
  import wandb
 
19
  import psutil
20
  import shutil
21
  import ast
22
+ from smolagents import HfApiModel, CodeAgent # Assuming smolagents is installed
23
  from huggingface_hub import login
24
  from sklearn.model_selection import train_test_split, cross_val_score
25
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
26
  from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
27
  from sklearn.linear_model import LogisticRegression
28
+ from sklearn.svm import SVC # Kept import in case you add it later
29
  from sklearn.preprocessing import LabelEncoder, StandardScaler
30
  from sklearn.pipeline import Pipeline
31
  from datetime import datetime
32
+ # from PIL import Image # PIL often implicitly used by matplotlib/wandb, explicit import usually not needed unless manipulating images directly
33
  import warnings
34
+ import joblib # For saving models
35
+
36
+ # Suppress common warnings (especially from SHAP/LIME/Sklearn)
37
+ warnings.filterwarnings("ignore")
38
+
39
+ # --- Authentication and Setup ---
40
+ print("--- Initializing Setup ---")
41
+ hf_token = os.getenv("HF_TOKEN")
42
+ wandb_api_key = os.getenv("WANDB_API_KEY")
43
+
44
+ # Initialize wandb run variable globally, helps manage state across functions
45
+ wandb_run = None # Tracks the *current* active run (e.g., comparison or explainability)
46
+
47
+ if not hf_token:
48
+ print("Warning: HF_TOKEN environment variable not set. Hugging Face Hub features may fail.")
49
+ else:
50
+ try:
51
+ login(token=hf_token)
52
+ print("Hugging Face login successful.")
53
+ except Exception as e:
54
+ print(f"Hugging Face login failed: {e}")
55
+
56
+ if not wandb_api_key:
57
+ print("Warning: WANDB_API_KEY environment variable not set. WandB logging will be disabled.")
58
+ # Initialize wandb in disabled mode if no key and not already initialized
59
+ if wandb.run is None:
60
+ try:
61
+ wandb.init(mode="disabled")
62
+ print("WandB initialized in disabled mode.")
63
+ except Exception as e:
64
+ print(f"Failed to initialize WandB in disabled mode: {e}")
65
+ else:
66
+ try:
67
+ wandb.login(key=wandb_api_key)
68
+ print("WandB login successful.")
69
+ except Exception as e:
70
+ print(f"WandB login failed: {e}. Disabling WandB.")
71
+ if wandb.run is None:
72
+ try:
73
+ wandb.init(mode="disabled")
74
+ print("WandB initialized in disabled mode due to login failure.")
75
+ except Exception as e_init:
76
+ print(f"Failed to initialize WandB in disabled mode: {e_init}")
77
+
78
+
79
+ # SmolAgent initialization
80
+ agent = None # Initialize agent to None
81
+ try:
82
+ print("Initializing SmolAgent...")
83
+ model_api = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
84
+ agent = CodeAgent(tools=[], model=model_api, additional_authorized_imports=[
85
+ "numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json", "os"
86
+ ])
87
+ print("SmolAgent initialized successfully.")
88
+ except Exception as e:
89
+ print(f"Error initializing SmolAgent: {e}. AI Agent features might fail.")
90
+
91
+
92
+ # Global variables
93
+ df_global = None
94
+ split_data_global = None # To store (X_train, X_test, y_train, y_test)
95
+ comparison_results_global = None # To store comparison DataFrame
96
+ best_model_details_global = None # To store {'name': best_name, 'model': best_model_obj, 'params': best_params}
97
+ print("Global variables initialized.")
98
+ print("--- Setup Complete ---")
99
+
100
+ # --- Data Handling ---
101
+
102
+ def clean_data(df):
103
+ """Cleans the input DataFrame."""
104
+ print("Starting data cleaning...")
105
+ df_cleaned = df.copy() # Work on a copy
106
+ # Drop columns/rows that are entirely empty
107
+ df_cleaned = df_cleaned.dropna(how='all', axis=1).dropna(how='all', axis=0)
108
+ print(f"Shape after dropping fully empty rows/cols: {df_cleaned.shape}")
109
+
110
+ # Encode object columns
111
+ object_cols = df_cleaned.select_dtypes(include='object').columns
112
+ if not object_cols.empty:
113
+ print(f"Encoding object columns: {list(object_cols)}")
114
+ for col in object_cols:
115
+ # Convert to string first to handle mixed types or NaN representations
116
+ df_cleaned[col] = df_cleaned[col].astype(str)
117
+ df_cleaned[col] = LabelEncoder().fit_transform(df_cleaned[col])
118
+ else:
119
+ print("No object columns found to encode.")
120
+
121
+ # Impute missing values in numeric columns with the mean
122
+ numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
123
+ if not numeric_cols.empty:
124
+ cols_with_na = df_cleaned[numeric_cols].isnull().sum()
125
+ cols_to_impute = cols_with_na[cols_with_na > 0].index
126
+ if not cols_to_impute.empty:
127
+ print(f"Imputing NaNs with mean in columns: {list(cols_to_impute)}")
128
+ for col in cols_to_impute: # Iterate through columns needing imputation
129
+ mean_val = df_cleaned[col].mean()
130
+ df_cleaned[col] = df_cleaned[col].fillna(mean_val)
131
+ else:
132
+ print("No NaNs found in numeric columns to impute.")
133
+ else:
134
+ print("No numeric columns found for NaN imputation.")
135
+
136
+ print("Data cleaning finished.")
137
+ return df_cleaned
138
+
139
+ def upload_file(file):
140
+ """Handles file upload, cleaning, and global state update."""
141
+ global df_global, split_data_global, comparison_results_global, best_model_details_global
142
+ # Reset all global states when a new file is uploaded or file is cleared
143
+ df_global = None
144
+ split_data_global = None
145
+ comparison_results_global = None
146
+ best_model_details_global = None
147
+ print("Reset global data states on file change.")
148
+
149
+ if file is None:
150
+ # No file uploaded or file removed by user
151
+ return pd.DataFrame({"Status": ["No file uploaded or file removed."]})
152
+
153
+ print(f"Uploading file: {file.name}")
154
+ try:
155
+ ext = os.path.splitext(file.name)[-1].lower()
156
+ if ext == ".csv":
157
+ df = pd.read_csv(file.name)
158
+ elif ext in [".xls", ".xlsx"]:
159
+ df = pd.read_excel(file.name)
160
+ else:
161
+ return pd.DataFrame({"Error": [f"Unsupported file type: {ext}"]})
162
+
163
+ print(f"Original data shape: {df.shape}")
164
+ df_cleaned = clean_data(df)
165
+ print(f"Cleaned data shape: {df_cleaned.shape}")
166
+ df_global = df_cleaned # Store the cleaned data
167
+ # dependent globals (split_data_global, etc.) remain None until downstream functions are called
168
+ print("Global DataFrame updated with cleaned data.")
169
+ return df_global.head() # Return head of CLEANED data for preview
170
+ except Exception as e:
171
+ print(f"Error processing file {file.name}: {e}")
172
+ # Ensure globals are None on error
173
+ df_global = None
174
+ split_data_global = None
175
+ comparison_results_global = None
176
+ best_model_details_global = None
177
+ return pd.DataFrame({"Error": [f"Failed to process file: {e}"]})
178
+
179
+
180
+ # --- AI Agent Analysis ---
181
+
182
+ def format_observations(observations):
183
+ """Formats the observations dictionary into HTML list items."""
184
+ if not isinstance(observations, dict):
185
+ return f"<p style='color: orange;'>Observations data is not a dictionary: {type(observations)}</p>"
186
+ items_html = ""
187
+ for key, value in observations.items():
188
+ formatted_key = key.replace('_', ' ').title()
189
+ if isinstance(value, (dict, list)):
190
+ formatted_value = json.dumps(value, indent=2)
191
+ value_html = f"<pre style='margin: 0; padding: 8px; background: #ffffff; border: 1px solid #ccc; border-radius: 4px; font-size: 0.9em; white-space: pre-wrap; word-wrap: break-word;'>{formatted_value}</pre>"
192
+ else:
193
+ formatted_value = str(value)
194
+ value_html = f"<p style='margin: 0; padding: 8px; background: #ffffff; border: 1px solid #ccc; border-radius: 4px; font-size: 0.9em;'>{formatted_value}</p>"
195
+
196
+ items_html += f"""
197
+ <div style="margin-bottom: 12px; padding: 10px; background: #fdfefe; border-radius: 4px; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
198
+ <h4 style="margin: 0 0 8px 0; color: #34495e;">{formatted_key}</h4>
199
+ {value_html}
200
+ </div>
201
+ """
202
+ return items_html if items_html else "<p>No observations found.</p>"
203
+
204
def format_insights(insights, visuals):
    """Render insights as HTML cards, pairing each with the next visual.

    Args:
        insights: Mapping of insight name -> insight text.
        visuals: List of image file paths; any not consumed by an insight
            card are appended in an "Additional Visualizations" section.

    Returns:
        str: An HTML fragment, an empty-state paragraph, or a warning
        paragraph when `insights` is not a dict.
    """
    if not isinstance(insights, dict):
        return f"<p style='color: orange;'>Insights data is not a dictionary: {type(insights)}</p>"

    parts = []
    next_visual = 0

    for position, (name, text) in enumerate(insights.items(), start=1):
        title = name.replace('_', ' ').title()
        parts.append(f"""
        <div style="margin: 20px 0; padding: 15px; background: #ffffff; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
            <h4 style='margin-top: 0; margin-bottom: 10px; color: #16a085;'>Insight {position}: {title}</h4>
            <p style="margin-bottom: 15px;">{str(text)}</p>
        """)
        # Pair this insight with the next unused visualization, if any.
        # Gradio serves temporary files through its /file= route.
        if next_visual < len(visuals):
            img_path = visuals[next_visual]
            parts.append(f'<img src="/file={img_path}" alt="Visualization for {title}" style="max-width: 95%; height: auto; display: block; margin-top: 10px; border-radius: 6px; border: 1px solid #eee; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">')
            next_visual += 1
        parts.append("</div>")

    # Any visuals left over get their own trailing section.
    if next_visual < len(visuals):
        parts.append("<h4 style='margin-top: 25px; color: #2980b9;'>Additional Visualizations:</h4>")
        for idx in range(next_visual, len(visuals)):
            parts.append(f"""
            <div style="margin: 20px 0; padding: 15px; background: #ffffff; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
                <img src="/file={visuals[idx]}" alt="Additional Visualization {idx+1}" style="max-width: 95%; height: auto; display: block; margin: auto; border-radius: 6px; border: 1px solid #eee; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
            </div>
            """)

    rendered = "".join(parts)
    return rendered if (rendered or visuals) else "<p>No insights or visuals generated/found.</p>"  # Show message only if both empty
241
+
242
def format_analysis_report(raw_output, visuals):
    """Format the AI agent's output into readable HTML.

    Args:
        raw_output: The agent's result — either a dict, or a string that
            (possibly fenced in ```python/```json) contains a dict literal.
        visuals: List of figure paths to embed alongside insights.

    Returns:
        tuple: (HTML report string, the `visuals` list passed through).
    """
    print("Formatting AI analysis report...")

    def _strip_fences(text):
        """Drop a leading ```python/```json fence and a trailing ``` fence."""
        text = text.strip()
        for fence in ("```python", "```json"):
            if text.startswith(fence):
                text = text[len(fence):].strip()
                break
        if text.endswith("```"):
            text = text[:-len("```")].strip()
        return text

    try:
        parsed = {}
        problem = None

        # Coerce the raw output into a dict, recording any parse failure.
        if isinstance(raw_output, str):
            cleaned = _strip_fences(raw_output)
            brace_at = cleaned.find('{')
            if brace_at == -1:
                problem = f"Could not find dictionary start '{{' in agent output.\nRaw output:\n{raw_output}"
                print(problem)
            else:
                try:
                    parsed = ast.literal_eval(cleaned[brace_at:])
                except (SyntaxError, ValueError, TypeError) as e:
                    problem = f"Error parsing agent output: {e}\nRaw output:\n{raw_output}"
                    print(problem)
        elif isinstance(raw_output, dict):
            parsed = raw_output  # Already a dict
        else:
            problem = f"Output is not a string or dictionary, type: {type(raw_output)}.\nRaw output:\n{str(raw_output)}"
            print(problem)

        # --- Build HTML Report ---
        pieces = ["""
        <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333; padding: 15px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;">
            <h1 style="color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 10px; margin-top: 0;">📊 AI Data Analysis Report</h1>
        """]

        # Surface any parsing error at the top of the report.
        if problem:
            pieces.append(f"<div style='background-color: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; padding: 10px; border-radius: 5px; margin-bottom: 15px;'><pre>{problem}</pre></div>")

        # Observations section
        observations = parsed.get('observations', {})
        pieces.append("""
        <div style="margin-top: 20px; background: #ecf0f1; padding: 15px; border-radius: 5px;">
            <h2 style="color: #2980b9; margin-top: 0;">🔍 Key Observations</h2>
        """)
        pieces.append(format_observations(observations) if observations else "<p>No 'observations' found or parsed.</p>")
        pieces.append("</div>")

        # Insights section (with embedded visuals)
        insights = parsed.get('insights', {})
        pieces.append("""
        <div style="margin-top: 25px;">
            <h2 style="color: #2980b9;">💡 Insights & Visualizations</h2>
        """)
        pieces.append(format_insights(insights, visuals) if (insights or visuals) else "<p>No 'insights' or visuals found or parsed.</p>")
        pieces.append("</div>")

        pieces.append("</div>")  # Close main container
        print("Report formatting complete.")
        return "".join(pieces), visuals

    except Exception as e:
        print(f"Critical error in format_analysis_report: {e}")
        error_message = f"<p style='color: red; font-weight: bold;'>Error generating report:</p><pre>{str(e)}</pre>"
        raw_display = f"<p style='font-weight: bold;'>Raw Agent Output:</p><pre>{str(raw_output)}</pre>"
        return error_message + raw_display, visuals
316
+
317
def analyze_data(csv_file, additional_notes=""):
    """Runs the SmolAgent for data analysis and visualization.

    Args:
        csv_file: Uploaded file object; used only for its name in the WandB
            config — the actual data is read from the global `df_global`.
        additional_notes: Free-text context appended to the agent prompt.

    Returns:
        tuple[str, list]: (HTML report, list of generated figure paths), or
        an error paragraph plus an empty list on failure.

    Side effects: recreates ./figures, starts and finishes a WandB run, and
    logs timing/memory/figures to it when WandB is enabled.
    """
    global df_global, agent # Need agent globally
    if df_global is None:
        return "<p style='color:red;'>Please upload a file first.</p>", []
    if agent is None:
        return "<p style='color:red;'>AI Agent is not available (initialization failed).</p>", []
    if csv_file is None: # Check if file object exists
        return "<p style='color:red;'>File object missing, please re-upload.</p>", []

    print("--- Starting AI Agent Analysis ---")
    start_time = time.time()
    # Track this process's RSS so the delta over the agent run can be logged.
    process = psutil.Process(os.getpid())
    initial_memory = process.memory_info().rss / 1024 ** 2

    # Ensure figures directory exists and is empty (stale figures from a
    # previous run would otherwise be picked up as this run's output).
    figures_dir = './figures'
    try:
        if os.path.exists(figures_dir):
            shutil.rmtree(figures_dir)
            print(f"Cleaned existing directory: {figures_dir}")
        os.makedirs(figures_dir)
        print(f"Created directory: {figures_dir}")
    except Exception as e:
        print(f"Error managing figures directory: {e}")
        return f"<p style='color:red;'>Error setting up visualization directory: {e}</p>", []

    # --- WandB Setup for Agent Run ---
    wandb_run_agent = None
    run_name = f"AgentAnalysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    if wandb.run is None or wandb.run.mode != "disabled":
        try:
            # Finish any potentially lingering run first
            if wandb.run and wandb.run.id:
                print(f"Finishing potentially active WandB run ({wandb.run.id}) before Agent run.")
                wandb.finish()

            wandb_run_agent = wandb.init(
                project="ai-data-analysis-gradio",
                name=run_name,
                config={
                    "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
                    "agent_type": "CodeAgent",
                    "task": "EDA and Visualization",
                    "additional_notes": additional_notes,
                    "source_file": os.path.basename(csv_file.name) if csv_file else "N/A",
                    "data_shape": df_global.shape
                },
                reinit=True
            )
            print(f"WandB run '{run_name}' initialized for Agent Analysis.")
        except Exception as e:
            print(f"Error initializing WandB run for Agent Analysis: {e}")
            wandb_run_agent = None # Ensure run is None if init fails
    else:
        print("WandB disabled, skipping Agent run logging.")

    # --- Run Agent ---
    analysis_result = None
    visuals = []
    try:
        # Prompt instructs the agent to return a Python dict literal
        # ('observations' + 'insights') and save plots under ./figures/,
        # which format_analysis_report and the gallery rely on.
        prompt = f"""
        Analyze the provided dataset (available as `df_global`).
        Focus on generating clear insights and high-quality visualizations.

        **Tasks:**
        1. **Load Data:** The data is already loaded into the `df_global` pandas DataFrame. Use this DataFrame directly.
        2. **Understand Data:** Briefly describe the data (shape, columns, basic types). Put in 'observations'.
        3. **Generate Observations:** Provide at least 3 key statistical observations (e.g., correlations, value distributions, unique counts). Structure this under an 'observations' key in your output dictionary.
        4. **Generate Insights:** Extract at least 5 meaningful insights from the data. These should be understandable conclusions or patterns discovered. Structure this under an 'insights' key in your output dictionary.
        5. **Create Visualizations:** Generate exactly 5 publication-quality visualizations (e.g., histograms, scatter plots, heatmaps, bar charts) that support your insights.
            * **Save EACH plot** to the './figures/' directory with a unique name (e.g., './figures/plot_1.png', './figures/plot_2.png'). Use `plt.savefig('./figures/unique_name.png', bbox_inches='tight')` and `plt.clf()` after each plot.
            * Make sure plots have titles and clear labels.
            * **DO NOT** use `plt.show()`.

        **Output Format:**
        Return a Python dictionary strictly following this structure:
        {{
            'observations': {{
                'data_description': 'Brief description...',
                'observation_1_key': 'Description of observation 1.',
                # ... more observations
            }},
            'insights': {{
                'insight_1_key': 'Description of insight 1.',
                # ... more insights
            }}
        }}

        **Additional Context/Requests:** {additional_notes}
        Ensure all code is executable. Access the data using the variable `df_global`.
        """
        print("Running AI agent...")
        # Pass the DataFrame in additional_args
        # IMPORTANT: Ensure your SmolAgent setup correctly handles passing DataFrames.
        # If not, you might need to save df_global to a temporary CSV and pass the path.
        analysis_result = agent.run(prompt, additional_args={"df_global": df_global.copy()}) # Pass a copy
        print("AI agent finished.")
        # print(f"Raw Agent Output:\n{analysis_result}") # For debugging

        # Check for generated figures
        if os.path.exists(figures_dir):
            visuals = [os.path.join(figures_dir, f) for f in os.listdir(figures_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            print(f"Found {len(visuals)} visualizations in {figures_dir}.")
            # Filter out invalid paths (e.g., if agent created non-image files)
            visuals = [v for v in visuals if os.path.isfile(v)]
        else:
            print(f"Warning: Figures directory '{figures_dir}' not found after agent run.")

    except Exception as e:
        print(f"Error during AI agent execution: {e}")
        import traceback
        traceback.print_exc()
        if wandb_run_agent: wandb_run_agent.finish(exit_code=1)
        return f"<p style='color:red;'>Error running AI agent: {e}</p>", []

    # --- Logging and Cleanup ---
    execution_time = time.time() - start_time
    final_memory = process.memory_info().rss / 1024 ** 2
    # NOTE(review): RSS delta is a rough proxy — other threads/allocations
    # in this process are included in the measurement.
    memory_usage = final_memory - initial_memory
    print(f"Agent execution time: {execution_time:.2f}s")
    print(f"Memory usage during agent execution: {memory_usage:.2f} MB")

    if wandb_run_agent:
        try:
            wandb.log({
                "agent_execution_time_sec": execution_time,
                "agent_memory_usage_mb": memory_usage,
                "visualizations_generated": len(visuals)
            }, commit=False) # Commit later or let finish handle it

            # Log visualizations
            logged_images_count = 0
            for viz_path in visuals:
                if os.path.exists(viz_path):
                    try:
                        img_name = os.path.basename(viz_path)
                        wandb.log({f"agent_visualization_{img_name}": wandb.Image(viz_path)}, commit=False)
                        logged_images_count += 1
                    except Exception as log_e:
                        print(f"Warning: Could not log image {viz_path} to WandB: {log_e}")
                else:
                    print(f"Warning: Visualization path not found for logging: {viz_path}")
            print(f"Attempted to log {logged_images_count}/{len(visuals)} visualizations to WandB.")

            # Log the raw analysis result text
            log_data = {}
            if analysis_result:
                log_data["agent_raw_output"] = str(analysis_result)[:10000] # Truncate long output

            wandb.log(log_data, commit=True) # Commit logs here
            print("Logged agent results to WandB.")

        except Exception as e:
            print(f"Error logging agent results to WandB: {e}")
        finally:
            # Always close the run so the next analysis can start cleanly.
            wandb_run_agent.finish()
            print(f"WandB run '{run_name}' finished.")

    # Format report
    return format_analysis_report(analysis_result, visuals)
478
+
479
+
480
+ # --- Model Training and Comparison ---
481
+
482
def prepare_data(df, target_column=None):
    """Prepares data for modeling (selects target, splits, handles encoding).

    Args:
        df: Input DataFrame containing features and the target column.
        target_column: Name of the target column; defaults to the last
            column when None.

    Returns:
        tuple: (X_train, X_test, y_train, y_test). The full split plus the
        fitted LabelEncoder (or None) is also stored in `split_data_global`.

    Raises:
        ValueError: If df is empty, the target column is missing, no numeric
            features remain, or the target has fewer than two classes.
    """
    global split_data_global # Allow modification of global state
    print("--- Preparing Data for Modeling ---")

    if df is None or df.empty:
        print("Error: DataFrame is None or empty in prepare_data.")
        raise ValueError("Cannot prepare data: DataFrame is empty.")

    # Determine target column if not specified
    if target_column is None:
        target_column = df.columns[-1] # Default to last column
        print(f"Target column automatically selected: '{target_column}'")
    elif target_column not in df.columns:
        print(f"Error: Specified target column '{target_column}' not found.")
        raise ValueError(f"Target column '{target_column}' not found.")
    else:
        print(f"Using specified target column: '{target_column}'")

    X = df.drop(columns=[target_column])
    y = df[target_column].copy() # Use copy to avoid SettingWithCopyWarning

    # Ensure target `y` is numeric for classification/regression models
    le = None # LabelEncoder object
    if y.dtype == 'object' or pd.api.types.is_categorical_dtype(y):
        print(f"Encoding target column '{target_column}' with LabelEncoder.")
        le = LabelEncoder()
        # BUGFIX: fit_transform returns a numpy ndarray, which has no
        # .nunique(); wrap it back into a Series so the class-count check
        # below works and the index stays aligned with X for the split.
        y = pd.Series(le.fit_transform(y), index=y.index, name=target_column)
        print(f"Target classes found: {le.classes_}") # Useful for interpretation later

    # Check for non-numeric features (should be handled by clean_data, but as safeguard)
    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    if not non_numeric_cols.empty:
        print(f"Warning: Non-numeric columns found in features after cleaning: {list(non_numeric_cols)}. Dropping them.")
        X = X.drop(columns=non_numeric_cols)

    if X.empty:
        print("Error: No features remaining after dropping non-numeric columns.")
        raise ValueError("No features remaining to train the model.")

    # Check if target has only one class after potential encoding/filtering
    if y.nunique() < 2:
        print(f"Error: Target column '{target_column}' has fewer than 2 unique values after processing. Cannot stratify or train meaningful classifier.")
        raise ValueError("Target column must have at least two unique classes for classification.")

    # Split data
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y # Always try to stratify for classification
        )
        print(f"Data split: X_train {X_train.shape}, X_test {X_test.shape}, y_train {y_train.shape}, y_test {y_test.shape}")
    except ValueError as split_e:
        # This can happen if a class has too few members (e.g., only 1) for stratification
        print(f"Stratified split failed ({split_e}). Trying non-stratified split.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        print(f"Data split (non-stratified): X_train {X_train.shape}, X_test {X_test.shape}, y_train {y_train.shape}, y_test {y_test.shape}")

    # Store the split data AND the label encoder if used
    split_data_global = (X_train, X_test, y_train, y_test, le) # Add le to the tuple
    print("Data prepared and split stored globally.")
    return X_train, X_test, y_train, y_test
546
+
547
+
548
def train_and_compare_models(tune_rf=True, tune_gb=True, n_trials_optuna=10):
    """Trains, (optionally) tunes, evaluates multiple models, and logs comparison.

    Args:
        tune_rf: Tune RandomForest hyperparameters with Optuna before fitting.
        tune_gb: Tune GradientBoosting hyperparameters with Optuna before fitting.
        n_trials_optuna: Number of Optuna trials per tuned model.

    Returns:
        pd.DataFrame: One row per model with test metrics (sorted by weighted
        F1 when available), or a single-column Error/Status frame on failure.

    Side effects: may call prepare_data (updating `split_data_global`),
    updates `comparison_results_global` and `best_model_details_global`,
    manages the global `wandb_run`, and saves the best model to ./saved_models.
    """
    global df_global, split_data_global, comparison_results_global, best_model_details_global, wandb_run
    if df_global is None:
        print("Error: No data loaded for training/comparison.")
        return pd.DataFrame({"Error": ["Please upload data first."]})

    print("--- Starting Model Training and Comparison ---")
    run_name = f"CompareModels_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    models_to_compare = {
        # Use Pipelines for models benefiting from scaling
        "LogisticRegression": Pipeline([('scaler', StandardScaler()), ('logreg', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))]),
        "RandomForest": RandomForestClassifier(random_state=42, class_weight='balanced'), # Add class_weight
        "GradientBoosting": GradientBoostingClassifier(random_state=42) # GB doesn't have class_weight in quite the same way
    }
    # Filter models based on input type (add later if needed)
    # is_classification = True # Assume classification for now

    config = {
        "task": "Model Comparison",
        "models": list(models_to_compare.keys()),
        "tune_rf": tune_rf,
        "tune_gb": tune_gb,
        "optuna_trials": n_trials_optuna if (tune_rf or tune_gb) else 0,
        "data_shape": df_global.shape,
        "test_size": 0.3,
        "stratify": True # Assuming stratification was attempted
    }

    # --- WandB Setup for Comparison Run ---
    if wandb.run is None or wandb.run.mode != "disabled":
        try:
            # Finish any potentially lingering run first
            if wandb.run and wandb.run.id:
                print(f"Finishing potentially active WandB run ({wandb.run.id}) before Comparison run.")
                wandb.finish()

            wandb_run = wandb.init(project="ai-data-analysis-gradio", name=run_name, config=config, reinit=True)
            print(f"WandB run '{run_name}' initialized for Model Comparison.")
        except Exception as e:
            print(f"Error initializing WandB run for Comparison: {e}")
            wandb_run = None # Ensure it's None if init fails
    else:
        print("WandB disabled, skipping Comparison run logging.")
        wandb_run = None # Explicitly set to None if disabled

    results = []
    best_f1 = -1 # Initialize best F1 score
    best_model_obj = None
    best_model_name = None
    best_model_params = None

    try:
        # Prepare data if not already split
        if split_data_global:
            print("Using previously split data.")
            X_train, X_test, y_train, y_test, _ = split_data_global # Unpack (ignore label encoder here)
        else:
            print("Preparing data for comparison...")
            # Use default target (last column) if prepare_data hasn't been run
            X_train, X_test, y_train, y_test = prepare_data(df_global)

        # --- Optuna Objective Functions ---
        # (Ensure X_train, y_train are accessible within objectives)
        # Both objectives maximize 3-fold CV weighted F1 on the training set.
        def objective_rf(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 250, step=50),
                "max_depth": trial.suggest_int("max_depth", 5, 20, log=True), # Log scale for depth
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 16),
                "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
                "class_weight": trial.suggest_categorical("class_weight", ["balanced", "balanced_subsample", None]),
                "random_state": 42
            }
            # Note: class_weight='balanced' might require specific sklearn version or handling
            try:
                model = RandomForestClassifier(**params)
                # Use smaller CV during tuning for speed
                score = cross_val_score(model, X_train, y_train, cv=3, scoring="f1_weighted", n_jobs=-1).mean() # Tune based on F1 weighted
                if wandb_run: wandb.log({"optuna_rf_trial": trial.number, "optuna_rf_cv_f1w": score, **params}, commit=False)
                return score
            except ValueError as e:
                print(f"Optuna RF trial error (params {params}): {e}")
                return -1 # Return poor score on error

        def objective_gb(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 250, step=50),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 16),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "random_state": 42
            }
            try:
                model = GradientBoostingClassifier(**params)
                score = cross_val_score(model, X_train, y_train, cv=3, scoring="f1_weighted", n_jobs=-1).mean() # Tune based on F1 weighted
                if wandb_run: wandb.log({"optuna_gb_trial": trial.number, "optuna_gb_cv_f1w": score, **params}, commit=False)
                return score
            except ValueError as e:
                print(f"Optuna GB trial error (params {params}): {e}")
                return -1

        # --- Model Training Loop ---
        for name, model_pipeline in models_to_compare.items(): # Use model_pipeline for clarity
            print(f"--- Training and Evaluating: {name} ---")
            start_time = time.time()
            current_params = model_pipeline.get_params() # Start with default/pipeline params
            final_model = model_pipeline # The model/pipeline to be trained

            try:
                # Optional Tuning with Optuna (each study capped at 300s)
                if name == "RandomForest" and tune_rf:
                    print(f"Tuning {name} with Optuna ({n_trials_optuna} trials)...")
                    study_rf = optuna.create_study(direction="maximize", study_name=f"{name}_tune_{run_name}")
                    study_rf.optimize(objective_rf, n_trials=n_trials_optuna, timeout=300)
                    best_params_rf = study_rf.best_params
                    # Important: Re-initialize the model *without* pipeline for RF tuning
                    final_model = RandomForestClassifier(**best_params_rf, random_state=42)
                    current_params = final_model.get_params() # Update params to tuned ones
                    print(f"Best RF params (CV F1w: {study_rf.best_value:.4f}): {best_params_rf}")
                    if wandb_run: wandb.log({f"{name}_best_cv_f1w": study_rf.best_value, f"{name}_best_params": best_params_rf}, commit=False)

                elif name == "GradientBoosting" and tune_gb:
                    print(f"Tuning {name} with Optuna ({n_trials_optuna} trials)...")
                    study_gb = optuna.create_study(direction="maximize", study_name=f"{name}_tune_{run_name}")
                    study_gb.optimize(objective_gb, n_trials=n_trials_optuna, timeout=300)
                    best_params_gb = study_gb.best_params
                    final_model = GradientBoostingClassifier(**best_params_gb, random_state=42)
                    current_params = final_model.get_params()
                    print(f"Best GB params (CV F1w: {study_gb.best_value:.4f}): {best_params_gb}")
                    if wandb_run: wandb.log({f"{name}_best_cv_f1w": study_gb.best_value, f"{name}_best_params": best_params_gb}, commit=False)

                # Train the final model (tuned or default/pipeline)
                final_model.fit(X_train, y_train)

                # Evaluate on the test set
                y_pred = final_model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                # Use weighted avg for metrics suitable for multi-class and imbalanced datasets
                precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
                recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
                f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
                duration = time.time() - start_time

                print(f"{name} Test Set - Accuracy: {accuracy:.4f}, F1 (Weighted): {f1:.4f}, Time: {duration:.2f}s")

                metrics = {
                    "Model": name,
                    "Test Accuracy": accuracy, # Changed key for clarity
                    "Test Precision (Weighted)": precision,
                    "Test Recall (Weighted)": recall,
                    "Test F1 Score (Weighted)": f1,
                    "Training Time (s)": duration,
                    "Tuned": (name == "RandomForest" and tune_rf) or (name == "GradientBoosting" and tune_gb)
                }
                results.append(metrics)

                # Log individual model metrics to WandB
                if wandb_run:
                    # Create a flat dictionary for logging
                    log_metrics = {f"{name}_{k.lower().replace(' (weighted)','_w').replace(' ','_')}": v
                                   for k, v in metrics.items() if k not in ["Model", "Tuned"]}
                    log_metrics[f"{name}_tuned_flag"] = metrics["Tuned"]
                    wandb.log(log_metrics, commit=False)

                # Check if this is the best model so far based on F1 score
                if f1 > best_f1:
                    print(f"*** New best model found: {name} (F1: {f1:.4f}) ***")
                    best_f1 = f1
                    best_model_name = name
                    best_model_obj = final_model # Store the fitted model/pipeline object
                    best_model_params = current_params # Store its parameters

            except Exception as train_e:
                # One model failing should not abort the whole comparison.
                print(f"ERROR training/evaluating {name}: {train_e}")
                import traceback
                traceback.print_exc()
                results.append({"Model": name, "Error": str(train_e)})
                if wandb_run: wandb.log({f"{name}_error": str(train_e)}, commit=False)

        # --- Finalize Comparison ---
        if not results:
            print("No models were successfully trained.")
            return pd.DataFrame({"Status": ["Model training failed for all candidates."]})

        comparison_df = pd.DataFrame(results)
        # Handle cases where F1 score might be missing due to errors
        if "Test F1 Score (Weighted)" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(by="Test F1 Score (Weighted)", ascending=False).reset_index(drop=True)
        else:
            print("Warning: F1 score column missing, cannot sort results.")

        comparison_results_global = comparison_df # Store globally
        print("\n--- Model Comparison Summary ---")
        print(comparison_df.to_string())

        # Store best model details globally if found
        if best_model_obj is not None:
            best_model_details_global = {
                'name': best_model_name,
                'model': best_model_obj, # Store the actual fitted model/pipeline
                'params': best_model_params, # Store params used (tuned or default)
                'f1_score': best_f1
            }
            print(f"Stored details for best model: {best_model_name}")

            # Optional: Save the best model artifact locally and log to WandB
            output_dir_models = "./saved_models"
            os.makedirs(output_dir_models, exist_ok=True)
            model_filename = os.path.join(output_dir_models, f"best_model_{best_model_name.lower().replace(' ','_')}.joblib")
            try:
                joblib.dump(best_model_obj, model_filename)
                print(f"Best model saved locally to {model_filename}")
                if wandb_run:
                    # Log artifact to WandB
                    # Clean params dict for artifact metadata (remove complex objects if any)
                    clean_params_meta = {k: str(v) for k, v in best_model_params.items() if isinstance(v, (str, int, float, bool, list))}

                    artifact = wandb.Artifact(f'best_model-{wandb_run.id}', type='model',
                                              metadata={'model_type': best_model_name, 'test_f1_score': best_f1, **clean_params_meta})
                    artifact.add_file(model_filename)
                    wandb_run.log_artifact(artifact)
                    print("Logged best model artifact to WandB.")
            except Exception as save_e:
                print(f"Error saving/logging best model artifact: {save_e}")

        # Log comparison table to WandB
        if wandb_run and not comparison_df.empty:
            try:
                # Filter out potential error rows before creating table
                valid_comparison_df = comparison_df.dropna(subset=[col for col in comparison_df.columns if col != 'Error'])
                if not valid_comparison_df.empty:
                    wandb_comparison_table = wandb.Table(dataframe=valid_comparison_df)
                    wandb_run.log({"model_comparison_summary": wandb_comparison_table}, commit=True) # Commit final logs
                    print("Logged comparison summary table to WandB.")
                else:
                    print("No valid results to log to WandB table.")
            except Exception as log_e:
                print(f"Error logging comparison table to WandB: {log_e}")

        return comparison_df

    except Exception as e:
        print(f"An error occurred during model comparison: {e}")
        import traceback
        traceback.print_exc()
        if wandb_run: wandb_run.finish(exit_code=1) # Mark run as failed
        return pd.DataFrame({"Error": [f"Comparison failed: {e}"]})
    finally:
        if wandb_run and wandb.run: # Check if wandb_run was initialized and is still active
            # Ensure logs are committed before finishing
            try:
                wandb.log({}, commit=True)
            except Exception:
                pass # Ignore if already finished or other issue
            wandb_run.finish()
            print(f"WandB run '{run_name}' finished.")
        wandb_run = None # Reset global run variable
809
+
810
+
811
+ # --- Model Explainability ---
812
+
813
+ def explainability(_=None):
814
  """Generates SHAP and LIME explanations for the best performing model."""
815
  global split_data_global, best_model_details_global, wandb_run
816
  if split_data_global is None:
817
+ print("Error: Data not split. Please run 'Train & Compare' first.")
818
+ return [], None, "Error: Data not prepared. Run 'Train & Compare' first."
819
  if best_model_details_global is None:
820
+ print("Error: Best model details not found. Please run 'Train & Compare' first.")
821
+ return [], None, "Error: Best model not identified. Run 'Train & Compare' first."
822
 
823
+ X_train, X_test, y_train, y_test, label_encoder = split_data_global # Unpack label encoder
824
  best_model_name = best_model_details_global['name']
825
+ best_model = best_model_details_global['model'] # Use the stored, already fitted best model/pipeline
826
 
827
  print(f"--- Generating explanations for the best model: {best_model_name} ---")
828
 
829
+ # Define paths dynamically within a dedicated directory
830
  output_dir = "./explainability_plots"
831
  if os.path.exists(output_dir): shutil.rmtree(output_dir)
832
  os.makedirs(output_dir)
 
837
  status_message = f"Explaining best model: {best_model_name}"
838
  all_shap_paths = [] # Initialize empty list for gallery output
839
 
840
+ # --- WandB Setup for Explainability Run ---
841
  run_name = f"Explain_{best_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
842
  config = {"task": "Explainability", "best_model": best_model_name, "explainers": ["SHAP", "LIME"]}
 
 
843
  wandb_run_explain = None
844
  if wandb.run is None or wandb.run.mode != "disabled":
845
  try:
846
+ # Finish any potentially lingering run first
847
  if wandb.run and wandb.run.id:
848
+ print(f"Finishing potentially active WandB run ({wandb.run.id}) before Explainability run.")
849
  wandb.finish()
850
  wandb_run_explain = wandb.init(project="ai-data-analysis-gradio", name=run_name, config=config, reinit=True)
851
  print(f"WandB run '{run_name}' initialized for Explainability.")
 
853
  print(f"Error initializing Wandb run for Explainability: {e}")
854
  wandb_run_explain = None # Ensure it's None if init fails
855
  else:
856
+ print("WandB disabled, skipping Explainability run logging.")
857
  wandb_run_explain = None
858
 
859
  try:
 
861
  print("Calculating SHAP values...")
862
  shap_values = None
863
  explainer = None
864
+ X_test_for_shap = X_test # Data to pass to SHAP (might be transformed)
 
865
 
866
  # Determine explainer type and get SHAP values
867
  if isinstance(best_model, (RandomForestClassifier, GradientBoostingClassifier)):
868
+ print("Using SHAP TreeExplainer for standalone tree model.")
869
+ explainer = shap.TreeExplainer(best_model)
870
+ shap_values = explainer.shap_values(X_test) # Use original X_test
871
+
872
  elif isinstance(best_model, Pipeline):
 
873
  final_estimator_name, final_estimator = best_model.steps[-1]
874
  print(f"Handling Pipeline. Final estimator: {final_estimator_name} ({type(final_estimator)})")
875
+ pipeline_transforms = Pipeline(best_model.steps[:-1]) # Get all steps EXCEPT the last one
876
 
877
  if isinstance(final_estimator, (RandomForestClassifier, GradientBoostingClassifier)):
878
+ print("Using SHAP TreeExplainer for tree model within Pipeline.")
879
+ # Need to transform data using the pipeline's transform steps first
880
+ try:
881
+ print("Transforming data using pipeline steps before TreeExplainer...")
882
+ X_train_transformed = pipeline_transforms.fit_transform(X_train) # Fit on train
883
+ X_test_transformed = pipeline_transforms.transform(X_test) # Transform test
884
+ X_test_for_shap = pd.DataFrame(X_test_transformed, columns=X_test.columns, index=X_test.index) # Keep index/columns
885
+ print("Data transformed.")
886
+ # Explain the final estimator using the *transformed* data
887
+ explainer = shap.TreeExplainer(final_estimator)
888
+ shap_values = explainer.shap_values(X_test_for_shap)
889
+ except Exception as transform_e:
890
+ print(f"ERROR transforming data for TreeExplainer in Pipeline: {transform_e}. Skipping SHAP.")
891
 
892
  elif isinstance(final_estimator, LogisticRegression):
893
  print("Using SHAP KernelExplainer for Logistic Regression in Pipeline (can be slow)...")
 
894
  predict_proba_pipeline = lambda x: best_model.predict_proba(pd.DataFrame(x, columns=X_test.columns))
 
895
  print("Summarizing training data for KernelExplainer background...")
896
+ # Use original X_train for background summary as KernelExplainer handles model internally
897
+ X_train_summary = shap.kmeans(X_train, min(100, X_train.shape[0]), random_state=42)
898
  explainer = shap.KernelExplainer(predict_proba_pipeline, X_train_summary)
 
899
  subset_size = min(50, X_test.shape[0])
900
  print(f"Calculating SHAP values for {subset_size} test instances...")
901
+ X_test_subset_np = X_test.sample(subset_size, random_state=42).values
902
+ shap_values = explainer.shap_values(X_test_subset_np, nsamples='auto') # Let SHAP choose nsamples
 
903
  # Create DataFrame from subset for plotting consistency
904
+ X_test_for_shap = pd.DataFrame(X_test_subset_np, columns=X_test.columns) # Use subset for plotting
905
  print("SHAP values calculated using KernelExplainer.")
 
906
  else:
907
+ print(f"Warning: SHAP not implemented for final pipeline step {type(final_estimator)}. Skipping SHAP.")
908
  else:
909
+ print(f"Warning: SHAP not explicitly handled for model type {type(best_model)}. Skipping SHAP.")
 
 
 
 
 
 
 
 
 
 
 
 
910
 
911
 
912
  # --- Generate SHAP Plots (if shap_values exist) ---
913
  if shap_values is not None:
914
+ num_classes = len(np.unique(y_train))
915
 
916
  # SHAP Summary Plot
917
  print("Generating SHAP summary plot...")
918
+ plt.figure(figsize=(10, 6))
919
  try:
920
+ plot_shap_values = shap_values
921
  title_suffix = f"({best_model_name})"
922
+ class_names = getattr(label_encoder, 'classes_', [f'Class {i}' for i in range(num_classes)]) if label_encoder else [f'Class {i}' for i in range(num_classes)]
923
+
924
  if num_classes == 2 and isinstance(shap_values, list) and len(shap_values) == 2:
925
+ # Typically, shap_values[1] corresponds to the positive class
926
+ plot_shap_values = shap_values[1]
927
+ positive_class_name = class_names[1] if len(class_names) > 1 else "Class 1"
928
+ title_suffix = f"({best_model_name} - Class: {positive_class_name})"
929
+ print(f"Plotting SHAP summary for positive class ({positive_class_name})")
930
  elif num_classes > 2 and isinstance(shap_values, list):
931
  title_suffix = f"({best_model_name} - Multiclass Avg Impact)"
932
  print("Plotting SHAP summary for multiclass (average impact)")
933
 
934
+ shap.summary_plot(plot_shap_values, X_test_for_shap, show=False, plot_type="dot", class_names=class_names)
935
  plt.title(f"SHAP Feature Importance Summary {title_suffix}")
936
  plt.tight_layout()
937
  plt.savefig(shap_summary_path, bbox_inches='tight')
938
  plt.clf()
939
  print(f"SHAP summary plot saved to {shap_summary_path}")
940
+ all_shap_paths.append(shap_summary_path)
941
  if wandb_run_explain: wandb.log({"shap_summary": wandb.Image(shap_summary_path)}, commit=False)
942
  except Exception as summary_e:
943
  print(f"Error generating SHAP summary plot: {summary_e}")
944
+ plt.clf()
945
 
946
  # SHAP Dependence Plots
947
  print("Calculating global feature importance for dependence plots...")
948
+ global_shap_values_mean = None
949
  try:
950
+ if isinstance(shap_values, list): # Multi-class or binary list
951
+ abs_shap_arrays = [np.abs(sv) for sv in shap_values if isinstance(sv, np.ndarray) and sv.ndim == 2 and sv.shape[1] == X_test_for_shap.shape[1]]
952
+ if abs_shap_arrays:
953
+ stacked_shap = np.stack(abs_shap_arrays, axis=0)
954
+ global_shap_values_mean = stacked_shap.mean(axis=(0, 1))
955
+ print(f"Calculated global SHAP values (list input), shape: {global_shap_values_mean.shape}")
956
+ elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 2 and shap_values.shape[1] == X_test_for_shap.shape[1]: # Regression or binary array
957
+ global_shap_values_mean = np.abs(shap_values).mean(axis=0)
958
+ print(f"Calculated global SHAP values (array input), shape: {global_shap_values_mean.shape}")
 
 
 
 
 
 
 
959
  else:
960
+ print(f"Warning: SHAP values structure not suitable for global importance calculation. Shape: {getattr(shap_values, 'shape', 'N/A')}, Type: {type(shap_values)}")
 
961
  except Exception as gsi_e:
962
  print(f"Error calculating global SHAP importance: {gsi_e}")
963
 
964
+ if global_shap_values_mean is not None and len(global_shap_values_mean) > 0:
 
965
  try:
966
+ feature_indices = np.argsort(global_shap_values_mean)[::-1]
967
+ num_features_to_plot = min(2, len(global_shap_values_mean))
 
968
  if num_features_to_plot > 0:
969
  top_feature_indices = feature_indices[:num_features_to_plot]
970
+ # Ensure indices are within bounds of columns
971
+ valid_indices = [idx for idx in top_feature_indices if idx < len(X_test_for_shap.columns)]
972
+ top_features = X_test_for_shap.columns[valid_indices]
973
 
974
  print(f"Generating SHAP dependence plots for top features: {list(top_features)}")
975
+ for feature_idx, feature_name in zip(valid_indices, top_features):
976
  plt.figure(figsize=(8, 5))
 
 
977
  shap_values_for_dep = shap_values
978
+ class_idx_dep = 0 # Default to first class for multiclass
979
  if isinstance(shap_values, list):
980
+ if num_classes == 2 and len(shap_values) == 2:
981
+ shap_values_for_dep = shap_values[1] # Use positive class for binary
982
+ class_idx_dep = 1
983
+ elif len(shap_values) > 0:
984
+ shap_values_for_dep = shap_values[0] # Default class 0 for multiclass
985
 
986
+ shap.dependence_plot(feature_idx, shap_values_for_dep, X_test_for_shap, interaction_index='auto', show=False)
987
+ dep_title = f"SHAP Dependence: {feature_name} ({best_model_name})"
988
+ if isinstance(shap_values, list): dep_title += f" (Class Index {class_idx_dep})"
989
+ plt.title(dep_title)
990
  plt.tight_layout()
991
  dep_path = os.path.join(output_dir, f"shap_dependence_{best_model_name}_{feature_name}.png")
992
  plt.savefig(dep_path, bbox_inches='tight')
 
996
  print(f"Saved dependence plot: {dep_path}")
997
  if wandb_run_explain: wandb.log({f"shap_dependence_{feature_name}": wandb.Image(dep_path)}, commit=False)
998
  else:
999
+ print("Skipping dependence plots: Not enough features.")
1000
  except Exception as dep_e:
1001
  print(f"Could not generate SHAP dependence plots: {dep_e}")
1002
+ import traceback
1003
+ traceback.print_exc()
1004
+ plt.clf()
1005
  else:
1006
  print("Skipping dependence plots due to issue calculating global SHAP values.")
 
1007
  else:
1008
  print("Skipping SHAP plots as SHAP values were not generated.")
1009
 
 
1012
  print("Generating LIME explanation for the first test instance...")
1013
  try:
1014
  predict_fn_lime = None
 
1015
  if hasattr(best_model, 'predict_proba'):
 
1016
  def predict_proba_wrapper(x_np):
 
1017
  x_df = pd.DataFrame(x_np, columns=X_train.columns)
1018
  return best_model.predict_proba(x_df)
1019
  predict_fn_lime = predict_proba_wrapper
1020
  else:
1021
+ print("Warning: Model lacks predict_proba. LIME results might be unreliable.")
 
1022
  num_classes_lime = len(np.unique(y_train))
1023
  predict_fn_lime = lambda x: np.ones((len(x), num_classes_lime)) / num_classes_lime
1024
 
1025
+ # Use class names from label encoder if available
1026
+ class_names_lime = getattr(label_encoder, 'classes_', [str(c) for c in sorted(np.unique(y_train))]) if label_encoder else [str(c) for c in sorted(np.unique(y_train))]
1027
+ # Ensure class names are strings
1028
+ class_names_lime = [str(cn) for cn in class_names_lime]
 
1029
 
1030
  lime_explainer = lime.lime_tabular.LimeTabularExplainer(
1031
+ training_data=X_train.values,
1032
  feature_names=X_train.columns.tolist(),
1033
+ class_names=class_names_lime,
1034
+ mode='classification' if len(class_names_lime) > 1 else 'regression'
1035
  )
1036
 
1037
  instance_idx = 0
1038
+ instance_to_explain = X_test.iloc[instance_idx].values
1039
+ true_class_encoded = y_test[instance_idx] if isinstance(y_test, np.ndarray) else y_test.iloc[instance_idx]
1040
+ # Decode true class if label encoder exists
1041
+ true_class_label = class_names_lime[true_class_encoded] if label_encoder and true_class_encoded < len(class_names_lime) else str(true_class_encoded)
1042
 
1043
  lime_exp = lime_explainer.explain_instance(
1044
  data_row=instance_to_explain,
1045
+ predict_fn=predict_fn_lime,
1046
  num_features=10,
1047
+ num_samples=1000
1048
  )
1049
  print(f"LIME explanation generated for instance {instance_idx}.")
1050
 
1051
  lime_fig = lime_exp.as_pyplot_figure()
1052
+ predicted_class_idx = lime_exp.available_labels()[0]
1053
+ predicted_class_label = class_names_lime[predicted_class_idx] if predicted_class_idx < len(class_names_lime) else f"Index {predicted_class_idx}"
1054
+ lime_fig.suptitle(f"LIME Exp (Inst {instance_idx}, True: {true_class_label}, Pred: {predicted_class_label}, Model: {best_model_name})", y=1.03, fontsize=10)
1055
+ lime_fig.tight_layout(rect=[0, 0, 1, 0.98])
 
1056
  lime_fig.savefig(lime_path, bbox_inches='tight')
1057
  plt.clf()
1058
  print(f"LIME plot saved to {lime_path}")
 
1060
 
1061
  except Exception as lime_e:
1062
  print(f"Error generating LIME explanation: {lime_e}")
1063
+ import traceback
1064
+ traceback.print_exc()
1065
  if wandb_run_explain: wandb.log({"lime_error": str(lime_e)}, commit=False)
1066
  lime_path = None # Indicate failure
1067
 
1068
  # Final status message
1069
+ status_message = f"Explanations generated for {best_model_name}."
1070
+ if not all_shap_paths: status_message += " (SHAP failed/skipped or generated no plots)."
1071
  if not lime_path: status_message += " (LIME failed/skipped)."
1072
 
1073
  # Return paths to the plots and status
 
1074
  valid_lime_path = lime_path if lime_path and os.path.exists(lime_path) else None
1075
+ valid_shap_paths = [p for p in all_shap_paths if p and os.path.exists(p)] # Filter out non-existent paths
1076
+ return valid_shap_paths, valid_lime_path, status_message
1077
 
1078
  except Exception as e:
1079
  print(f"An error occurred during explainability: {e}")
1080
  import traceback
1081
+ traceback.print_exc()
1082
  status_message = f"Error during explanation: {e}"
1083
  if wandb_run_explain: wandb_run_explain.finish(exit_code=1)
1084
  return [], None, status_message # Return empty list/None for paths on error
1085
  finally:
1086
  plt.close('all') # Close all matplotlib figures
1087
+ if wandb_run_explain and wandb.run and wandb.run.id == wandb_run_explain.id:
1088
  try:
1089
+ wandb.log({}, commit=True) # Commit final logs for explain run
1090
  wandb_run_explain.finish()
1091
  print(f"WandB run '{run_name}' finished.")
1092
  except Exception as finish_e:
 
1094
  wandb_run_explain = None # Reset
1095
 
1096
 
1097
+ # --- Gradio Interface ---
1098
+ print("--- Setting up Gradio Interface ---")
 
1099
  with gr.Blocks(theme=gr.themes.Soft(), title="AI Data Analysis & Model Comparison") as demo:
1100
  gr.Markdown(
1101
  """
 
1117
  with gr.Row():
1118
  with gr.Column(scale=1):
1119
  agent_notes = gr.Textbox(label="Optional: Specific requests for the AI Agent", placeholder="e.g., 'Focus on correlations with column X'")
1120
+ agent_btn = gr.Button("Run AI Analysis", variant="secondary", interactive=(agent is not None)) # Disable if agent failed init
1121
  with gr.Column(scale=2):
1122
  insights_output = gr.HTML(label="AI Agent Analysis Report")
1123
  with gr.Row():
 
1132
  optuna_trials_slider = gr.Slider(minimum=5, maximum=50, value=10, step=5, label="Optuna Trials per Model")
1133
  compare_btn = gr.Button("Train & Compare Models", variant="primary")
1134
  with gr.Column(scale=2):
1135
+ comparison_output = gr.DataFrame(label="Model Comparison Results (Sorted by Test F1 Score)", interactive=False)
1136
 
1137
  # --- Row 4: Model Explainability ---
1138
  with gr.Accordion("💡 Step 4: Explain Best Model (SHAP & LIME)", open=False):
 
1141
  explain_status = gr.Textbox(label="Explanation Status", interactive=False)
1142
  with gr.Row():
1143
  # Use Gallery for SHAP as there can be multiple plots
1144
+ shap_gallery = gr.Gallery(label="SHAP Plots (Summary + Top Feature Dependence)", elem_id="shap-gallery", height=450, object_fit="contain", columns=1, preview=True) # Better display
1145
  lime_img = gr.Image(label="LIME Explanation (for first test instance)", type="filepath", interactive=False)
1146
 
1147
 
1148
  # --- Connect Components ---
1149
+ print("Connecting Gradio components...")
1150
+ # Link file upload to function
1151
  file_input.change(
1152
  fn=upload_file,
1153
  inputs=file_input,
1154
  outputs=df_output
1155
  )
1156
 
1157
+ # Link AI agent button
1158
  agent_btn.click(
1159
  fn=analyze_data,
1160
  inputs=[file_input, agent_notes],
1161
  outputs=[insights_output, visual_output]
1162
  )
1163
 
1164
+ # Link model comparison button
1165
  compare_btn.click(
1166
  fn=train_and_compare_models,
1167
  inputs=[tune_rf_checkbox, tune_gb_checkbox, optuna_trials_slider],
1168
  outputs=[comparison_output]
1169
  )
1170
 
1171
+ # Link explain button
1172
  explain_btn.click(
1173
  fn=explainability,
1174
  inputs=[], # Uses global best model details
1175
  outputs=[shap_gallery, lime_img, explain_status] # Output list of SHAP plots, one LIME plot, and status
1176
  )
1177
+ print("Gradio components connected.")
1178
 
1179
  # --- Launch the App ---
1180
  if __name__ == "__main__":
1181
+ print("--- Cleaning up temporary directories/files ---")
1182
  # Clean up temporary files/dirs from previous runs before launching
1183
+ temp_dirs = ['./figures', './explainability_plots', './saved_models', './__pycache__']
1184
+ temp_files = [] # Don't delete pngs automatically if they are inside the cleaned dirs
1185
 
1186
  for d in temp_dirs:
1187
  if os.path.exists(d):
 
1198
  except Exception as e:
1199
  print(f"Warning: Could not clean up file {f}: {e}")
1200
 
1201
+ print("--- Launching Gradio App ---")
1202
+ demo.launch(debug=False, share=False) # Set debug=True for detailed Gradio errors if needed
1203
+ print("--- Gradio App Closed ---")