import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image

# Constants
RANDOM_STATE = 42
MIN_ROWS = 10
MIN_COLS = 2
MAX_FEATURES_TO_SHOW = 10

# Shared state between analyze_file() (writer) and predict_interactive() (reader).
# 'X_columns' are the one-hot-encoded feature names the model was trained on;
# 'input_columns' are the original (pre-dummies) feature columns the UI collects.
global_data = {
    'model': None,
    'scaler': None,
    'X_columns': None,
    'input_columns': None,
    'y_type': None,
    'uniques': None,
}


def _read_table(file):
    """Read an uploaded CSV/XLSX file into a DataFrame.

    Returns None for unsupported extensions; lets read errors propagate so the
    caller can report them.
    """
    if file.name.endswith('.csv'):
        return pd.read_csv(file.name)
    if file.name.endswith('.xlsx'):
        return pd.read_excel(file.name)
    return None


def _fig_to_image():
    """Render the current matplotlib figure into a PIL Image and close it."""
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()
    buf.seek(0)
    return Image.open(buf)


def update_dropdown(file):
    """Populate the label-column dropdown with the uploaded file's columns."""
    if file is None:
        return gr.update(choices=[], value=None)
    try:
        df = _read_table(file)
        if df is None:
            return gr.update(choices=[], value=None)
        return gr.update(choices=list(df.columns), value=None)
    except Exception as e:
        print(f"Error in update_dropdown: {e}")  # Debug logging
        return gr.update(choices=[], value=None)


def analyze_file(file, label_col, n_clusters):
    """Train a Random Forest on `label_col` and cluster the feature space.

    Returns a 6-tuple: (report text, prediction plot, feature-importance plot,
    KMeans plot, Agglomerative plot, cluster-differentiating-features plot).
    On validation failure the text slot carries the message and all images
    are None. Side effect: stores the trained model/metadata in `global_data`.
    """
    no_imgs = (None, None, None, None, None)
    if file is None:
        return ("Please upload a file.",) + no_imgs
    try:
        df = _read_table(file)
        if df is None:
            return ("Unsupported file type. Please upload a CSV or XLSX file.",) + no_imgs
    except Exception as e:
        print(f"Error reading file: {e}")  # Debug logging
        return (f"Error reading file: {e}",) + no_imgs

    if df.empty:
        return ("File is empty.",) + no_imgs
    if label_col not in df.columns:
        return (f"Label column '{label_col}' not found.",) + no_imgs
    df = df.dropna()
    if df.shape[0] < MIN_ROWS:
        return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.",) + no_imgs
    if df.shape[1] < MIN_COLS:
        return ("Need at least one feature and one label column.",) + no_imgs

    y = df[label_col]
    X = df.drop(columns=[label_col])
    X_processed = pd.get_dummies(X)
    if X_processed.shape[1] == 0:
        return ("No valid features after preprocessing.",) + no_imgs

    # The scaler is used for clustering/PCA only; the Random Forests are
    # trained on the raw one-hot features (trees are scale-invariant).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_processed)
    # Gradio sliders may deliver floats even with step=1.
    n_clusters = int(n_clusters)

    results_text = ""
    model_img = fi_img = kmeans_img = agg_img = diff_img = None
    model = None

    # --- Supervised model: regression for numeric labels, else classification ---
    try:
        if pd.api.types.is_numeric_dtype(y):
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
            model = RandomForestRegressor(random_state=RANDOM_STATE)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results_text += (
                "Regression Results:\n"
                f"- MSE: {mse:.3f}\n"
                f"- R²: {r2:.3f}\n"
                "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
            )

            # 2x2 grid: top-3 features vs predicted, plus true vs predicted.
            fi = pd.Series(model.feature_importances_,
                           index=X_processed.columns).sort_values(ascending=False)
            top_features = fi.head(3).index
            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
            axes = axes.flatten()
            for i, feature in enumerate(top_features):
                ax = axes[i]
                ax.scatter(X_test[feature], y_pred, alpha=0.5)
                ax.set_xlabel(feature)
                ax.set_ylabel('Predicted Value')
                ax.set_title(f'{feature} vs Predicted')
            # Hide unused panels when there are fewer than 3 features.
            for i in range(len(top_features), 3):
                axes[i].set_visible(False)
            ax = axes[3]
            ax.scatter(y_test, y_pred, alpha=0.5)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
                    'r--', label='Perfect Prediction')
            ax.set_xlabel('True Value')
            ax.set_ylabel('Predicted Value')
            ax.set_title('True vs Predicted')
            min_val = min(y_test.min(), y_pred.min())
            max_val = max(y_test.max(), y_pred.max())
            ax.set_xlim(min_val, max_val)
            ax.set_ylim(min_val, max_val)
            ax.legend()
            plt.tight_layout()
            model_img = _fig_to_image()
            global_data.update({'model': model, 'scaler': scaler,
                                'X_columns': X_processed.columns,
                                'input_columns': list(X.columns),
                                'y_type': 'regression', 'uniques': None})
        else:
            if len(y.unique()) < 2:
                return ("Label must have at least 2 unique values.",) + no_imgs
            y_encoded, uniques = pd.factorize(y)
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
            model = RandomForestClassifier(random_state=RANDOM_STATE)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            class_names = [str(u) for u in uniques]
            cr = classification_report(y_test, y_pred, target_names=class_names)
            results_text += "Classification Results:\n" + cr + "\n"

            cm = confusion_matrix(y_test, y_pred)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Confusion Matrix')
            model_img = _fig_to_image()
            global_data.update({'model': model, 'scaler': scaler,
                                'X_columns': X_processed.columns,
                                'input_columns': list(X.columns),
                                'y_type': 'classification', 'uniques': uniques})
    except Exception as e:
        results_text += f"\nError during model training: {e}"

    # --- Feature importances (only meaningful when training succeeded) ---
    try:
        if model is None:
            raise RuntimeError("model training failed")
        fi = pd.Series(model.feature_importances_, index=X_processed.columns)
        fi = fi.sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=fi.values, y=fi.index)
        plt.title("Top 10 Feature Importances")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        fi_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Could not compute feature importance: {e}"

    # --- Unsupervised views: KMeans and Agglomerative, shown in PCA 2D ---
    clusters_kmeans = None
    X_pca = None
    explained_var = 0.0
    try:
        kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
        clusters_kmeans = kmeans.fit_predict(X_scaled)
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        X_pca = pca.fit_transform(X_scaled)
        explained_var = sum(pca.explained_variance_ratio_)
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans,
                              cmap="viridis", alpha=0.7)
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
        plt.colorbar(scatter, ticks=range(n_clusters))
        kmeans_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: KMeans clustering failed: {e}"

    try:
        if X_pca is None:
            # Original code reused the KMeans-block PCA; make the dependency explicit.
            raise RuntimeError("PCA projection unavailable (KMeans step failed)")
        agg = AgglomerativeClustering(n_clusters=n_clusters)
        clusters_agg = agg.fit_predict(X_scaled)
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg,
                              cmap="plasma", alpha=0.7)
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
        plt.colorbar(scatter, ticks=range(n_clusters))
        agg_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Agglomerative clustering failed: {e}"

    # --- Which features separate the KMeans clusters best (ANOVA F-scores) ---
    try:
        if clusters_kmeans is None:
            raise RuntimeError("KMeans labels unavailable")
        f_scores, _ = f_classif(X_processed, clusters_kmeans)
        # Constant features yield NaN/inf F-scores; treat them as non-informative.
        f_scores = np.nan_to_num(f_scores, nan=0.0, posinf=0.0)
        f_series = pd.Series(f_scores, index=X_processed.columns)
        f_series = f_series.sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        # Horizontal bars (scores on x, names on y) so the axis labels are
        # correct — the original put feature names on x but labeled it "F-score".
        sns.barplot(x=f_series.values, y=f_series.index)
        plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
        plt.xlabel("F-score")
        plt.ylabel("Feature")
        diff_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Could not compute differentiating features: {e}"

    return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img


def predict_interactive(*args):
    """Predict from the per-feature values entered in the Interactive tab.

    `args` arrive in the order of the original feature columns (one UI
    component per column). Returns a human-readable result string.
    """
    if global_data['model'] is None:
        return "Please analyze a file first to train the model."
    try:
        # Map positional inputs onto the ORIGINAL feature columns. The previous
        # version keyed them by the one-hot-encoded names (X_columns), which
        # mismatched the UI components for any categorical feature.
        input_cols = global_data.get('input_columns') or global_data['X_columns']
        kwargs = {col: args[i] for i, col in enumerate(input_cols) if i < len(args)}

        input_data = pd.DataFrame([kwargs])
        # Re-apply one-hot encoding, then align to the training columns:
        # missing dummy columns become 0, extras are dropped by the reindex.
        X_processed = pd.get_dummies(input_data)
        for col in global_data['X_columns']:
            if col not in X_processed.columns:
                X_processed[col] = 0
        X_processed = X_processed[global_data['X_columns']]

        # BUG FIX: the model was trained on the UNSCALED one-hot features
        # (the StandardScaler is fit only for clustering/PCA), so the input
        # must not be scaler-transformed before prediction.
        prediction = global_data['model'].predict(X_processed)

        if global_data['y_type'] == 'classification':
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."
def create_interactive_inputs(file, label_col):
    """Build one Gradio input component per feature column of the uploaded file.

    Numeric columns get a gr.Number, others a gr.Dropdown of observed values;
    labels include up to three example values. Returns [] on any problem.
    NOTE: components only appear in the UI when this is called inside a Gradio
    render context (see the @gr.render block below).
    """
    if file is None or label_col is None:
        print("No file or label column provided")  # Debug logging
        return []
    try:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif file.name.endswith('.xlsx'):
            df = pd.read_excel(file.name)
        else:
            print("Unsupported file type")  # Debug logging
            return []
        if df.empty or label_col not in df.columns:
            print(f"Empty DataFrame or invalid label column: {label_col}")  # Debug logging
            return []
        X = df.drop(columns=[label_col])
        if X.empty:
            print("No features available after dropping label column")  # Debug logging
            return []
        components = []
        for col in X.columns:
            non_null = X[col].dropna()  # computed once (original called dropna twice)
            examples = non_null.sample(min(3, len(non_null))).tolist()
            label = f"{col} (e.g., {', '.join(map(str, examples))})"
            if pd.api.types.is_numeric_dtype(X[col]):
                components.append(gr.Number(label=label, value=None))
            else:
                components.append(gr.Dropdown(label=label,
                                              choices=non_null.unique().tolist(),
                                              value=None))
        print(f"Generated {len(components)} input components")  # Debug logging
        return components
    except Exception as e:
        print(f"Error in create_interactive_inputs: {e}")  # Debug logging
        return []


with gr.Blocks() as demo:
    gr.Markdown("## Data Analysis Explorer")
    gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, "
                "and clustering. Select a column to predict and the number of clusters!")

    with gr.Row():
        file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
        label_dropdown = gr.Dropdown(label="Select Column to Predict",
                                     choices=[], interactive=True)
        clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3,
                                    label="Number of Clusters")

    file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
    analyze_btn = gr.Button("Analyze")

    with gr.Tabs():
        with gr.TabItem("Prediction Results"):
            gr.Markdown("### Classification or Regression")
            gr.Markdown("""
            - **Regression**: Predicts numbers (e.g., sales). Uses Random Forest.
            - **Classification**: Predicts categories (e.g., yes/no). Uses Random Forest.
            - Rows with missing values are removed. 70% of data trains the model; 30% tests it.
            """)
            results_textbox = gr.Textbox(label="Performance Metrics", lines=10)

        with gr.TabItem("Prediction Plot"):
            gr.Markdown("### Prediction Visualization")
            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted "
                        "values and true vs. predicted. For classification: confusion matrix.")
            model_img_output = gr.Image(label="Prediction Output")

        with gr.TabItem("Feature Importances"):
            gr.Markdown("### Top 10 Key Features")
            gr.Markdown("Shows the most important features for predictions. "
                        "Higher bars mean bigger impact.")
            fi_output = gr.Image(label="Feature Importances")

        with gr.TabItem("KMeans Clustering"):
            gr.Markdown("### KMeans Clustering")
            gr.Markdown("Groups similar data points without using the selected column. "
                        "Colors show clusters in 2D (PCA projection).")
            kmeans_output = gr.Image(label="KMeans Clusters")

        with gr.TabItem("Agglomerative Clustering"):
            gr.Markdown("### Agglomerative Clustering")
            gr.Markdown("Another way to group data hierarchically. "
                        "Compare with KMeans to see differences!")
            agg_output = gr.Image(label="Agglomerative Clusters")

        with gr.TabItem("Cluster Differences"):
            gr.Markdown("### Top 10 Cluster-Differentiating Features")
            gr.Markdown("Shows features that vary most between clusters, "
                        "helping explain the groupings.")
            diff_output = gr.Image(label="Differentiating Features")

        with gr.TabItem("Interactive"):
            gr.Markdown("### Interactive Prediction")
            gr.Markdown("Enter values for each feature to get a prediction "
                        "based on the trained model.")
            prediction_output = gr.Textbox(label="Prediction Result")

            # BUG FIX: the original created components inside a .change() callback,
            # stored them in gr.State, and tried to .render() them in code placed
            # AFTER a `return` statement — so the feature inputs never appeared and
            # the Predict button had no working inputs. @gr.render (Gradio >= 4.40)
            # is the supported mechanism: it re-runs this function whenever the
            # watched inputs change and renders the components it creates.
            @gr.render(inputs=[file_input, label_dropdown])
            def render_feature_inputs(file, label_col):
                components = create_interactive_inputs(file, label_col)
                if not components:
                    gr.Markdown("Upload a file and select a column to predict "
                                "to enable interactive prediction.")
                    return
                predict_btn = gr.Button("Predict")
                # Events wired inside a render context may reference components
                # created here plus any component defined outside it.
                predict_btn.click(fn=predict_interactive,
                                  inputs=components,
                                  outputs=prediction_output)

    analyze_btn.click(fn=analyze_file,
                      inputs=[file_input, label_dropdown, clusters_slider],
                      outputs=[results_textbox, model_img_output, fi_output,
                               kmeans_output, agg_output, diff_output])

demo.launch(debug=True)  # Enable debug mode for more detailed error logging