import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, load_wine, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import time
# Module-level cache holding the most recently trained model's parameters
_current_model_params = None
def _get_current_model():
return _current_model_params
def _set_current_model(params):
global _current_model_params
_current_model_params = params
def load_data(file_obj=None, dataset_choice="Iris"):
"""Load multi-class classification datasets"""
if file_obj is not None:
if file_obj.name.endswith(".csv"):
encodings = ["utf-8", "latin-1", "iso-8859-1", "cp1252"]
for encoding in encodings:
try:
return pd.read_csv(file_obj.name, encoding=encoding)
except UnicodeDecodeError:
continue
return pd.read_csv(file_obj.name, encoding="utf-8", errors="replace")
        elif file_obj.name.lower().endswith((".xlsx", ".xls")):
return pd.read_excel(file_obj.name)
else:
raise ValueError("Unsupported format. Upload CSV or Excel files.")
datasets = {
"Iris": lambda: _sklearn_to_df(load_iris()),
"Wine": lambda: _sklearn_to_df(load_wine()),
"Synthetic (3 classes)": lambda: _synthetic_multiclass(n_classes=3),
"Synthetic (5 classes)": lambda: _synthetic_multiclass(n_classes=5),
}
if dataset_choice not in datasets:
# Fallback if choice is invalid
return datasets["Iris"]()
return datasets[dataset_choice]()
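# Usage sketch: file_obj is assumed to expose a .name path (e.g. a
# Gradio-style upload wrapper); built-in choices match the keys above.
#   df = load_data(dataset_choice="Wine")
#   df = load_data(file_obj=uploaded_file)  # uploaded_file.name -> "data.csv"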
def _sklearn_to_df(data):
"""Convert sklearn dataset to DataFrame"""
    feature_names = getattr(data, "feature_names", None)
    if feature_names is None:
        # Fall back to generated names (passing columns=None would
        # silently produce integer column labels)
        feature_names = [f"feature_{i}" for i in range(data.data.shape[1])]
    df = pd.DataFrame(data.data, columns=feature_names)
    df["target"] = data.target
    return df
def _synthetic_multiclass(n_classes=3):
"""Generate synthetic multi-class classification dataset"""
X, y = make_classification(n_samples=1000, n_features=10, n_informative=8,
n_redundant=2, n_classes=n_classes, random_state=42)
df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
df["target"] = y
return df
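# Shape check (illustrative): 1000 rows, 10 feature columns plus "target":
#   _synthetic_multiclass(n_classes=5).shape  # -> (1000, 11)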
def create_input_components(df, target_col):
"""Create input components for feature values"""
feature_cols = [c for c in df.columns if c != target_col]
components = []
for col in feature_cols:
data = df[col]
val = pd.to_numeric(data, errors="coerce").dropna().mean()
val = 0.0 if pd.isna(val) else float(val)
components.append(
{
"name": col,
"type": "number",
"value": round(val, 3),
"minimum": None,
"maximum": None,
}
)
return components
def one_hot_encode(y, num_classes):
"""Convert integer labels to one-hot encoded vectors"""
return np.eye(num_classes)[y]
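# Worked example: integer labels select rows of the identity matrix, so
#   one_hot_encode(np.array([0, 2, 1]), 3)
#   -> [[1., 0., 0.],
#       [0., 0., 1.],
#       [0., 1., 0.]]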
def preprocess_data(df, target_col, new_point_dict):
"""Preprocess data for softmax regression"""
feature_cols = [c for c in df.columns if c != target_col]
X = df[feature_cols].copy()
y = df[target_col].copy()
# Convert to numeric
for col in feature_cols:
X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0.0)
    # Ensure target is numeric, then remap labels to 0..K-1 so one-hot
    # indexing with np.eye(K)[y] is safe even for non-contiguous labels
    y = pd.to_numeric(y, errors="coerce").fillna(0).astype(int)
    classes, y = np.unique(y.values, return_inverse=True)
    num_classes = len(classes)
if num_classes < 2:
raise ValueError(f"Target must have at least 2 classes. Found {num_classes}.")
# Prepare new point
new_point = []
for col in feature_cols:
if col in new_point_dict:
try:
new_point.append(float(new_point_dict[col]))
except Exception:
new_point.append(0.0)
else:
new_point.append(0.0)
new_point = np.array(new_point, dtype=float).reshape(1, -1)
    return X.values, y, num_classes, new_point, feature_cols
def add_bias(X):
"""Add bias column to feature matrix"""
return np.c_[np.ones(X.shape[0]), X]
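# Example: a (2, 3) matrix becomes (2, 4) with a leading column of ones,
# so the first row of Theta acts as the per-class intercept:
#   add_bias(np.zeros((2, 3)))  # -> [[1., 0., 0., 0.], [1., 0., 0., 0.]]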
def softmax(Z):
"""Softmax activation function: exp(z_k) / sum(exp(z_j))"""
# Shift Z for numerical stability to avoid overflow with exp()
Z_shifted = Z - np.max(Z, axis=1, keepdims=True)
exp_Z = np.exp(Z_shifted)
return exp_Z / np.sum(exp_Z, axis=1, keepdims=True)
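# Sanity check: each row sums to 1 and larger logits get larger probabilities:
#   softmax(np.array([[1.0, 2.0, 3.0]]))
#   -> [[0.09003057, 0.24472847, 0.66524096]]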
def predict_proba(X, Theta):
"""Make probability predictions: Y_hat = softmax(X @ Theta)"""
Z = X.dot(Theta)
return softmax(Z)
def predict_class(X, Theta):
"""Make class predictions using argmax"""
proba = predict_proba(X, Theta)
return np.argmax(proba, axis=1)
def compute_loss(Y_hat, Y_one_hot):
    """Categorical cross-entropy: mean over samples of -sum_k y_k * log(y_hat_k)"""
    eps = 1e-15
    Y_hat = np.clip(Y_hat, eps, 1 - eps)  # keep log() finite at the boundaries
    return -np.mean(np.sum(Y_one_hot * np.log(Y_hat), axis=1))
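# Reference point: a uniform prediction over K classes scores exactly ln(K):
#   compute_loss(np.full((1, 3), 1/3), np.array([[1., 0., 0.]]))  # -> ~1.0986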
def compute_gradient(Y_hat, Y_one_hot, X):
"""Compute gradient: X.T @ (Y_hat - Y_one_hot) / N"""
N = X.shape[0]
return X.T.dot(Y_hat - Y_one_hot) / N
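# Why this is the whole gradient: for softmax outputs with cross-entropy
# loss, dL/dZ simplifies to (Y_hat - Y) per sample, so the chain rule gives
# dL/dTheta = X.T @ (Y_hat - Y) / N -- the softmax Jacobian cancels against
# the log in the loss and never needs to be formed explicitly.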
def update_theta(Theta, gradient, lr):
"""Update parameters using gradient descent"""
return Theta - lr * gradient
def compute_accuracy(y_true, y_pred):
"""Compute classification accuracy"""
return np.mean(y_true == y_pred)
def normalize_features(X_train, X_val=None, X_test=None):
"""Normalize features using standardization (zero mean, unit variance)"""
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
std[std == 0] = 1
X_train_norm = (X_train - mean) / std
X_val_norm = (X_val - mean) / std if X_val is not None else None
X_test_norm = (X_test - mean) / std if X_test is not None else None
return X_train_norm, X_val_norm, X_test_norm, mean, std
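# Example (verifiable by hand): a constant column centers to zero, and the
# std-zero guard above keeps the division well-defined:
#   normalize_features(np.array([[1., 5.], [1., 7.]]))[0]
#   -> [[0., -1.], [0., 1.]]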
def train_softmax_regression_with_validation(X_train, y_train, X_val, y_val, num_classes, epochs, learning_rate, batch_size=None):
"""
Train softmax regression with mini-batch gradient descent
Returns:
Theta, train_losses, val_losses, train_accuracies, val_accuracies, X_mean, X_std
"""
X_train_norm, X_val_norm, _, X_mean, X_std = normalize_features(X_train, X_val)
X_train_bias = add_bias(X_train_norm)
X_val_bias = add_bias(X_val_norm)
# Initialize Theta: (n_features + 1) x num_classes
np.random.seed(42)
Theta = np.random.randn(X_train_bias.shape[1], num_classes) * 0.01
# One-hot encode targets
Y_train_one_hot = one_hot_encode(y_train, num_classes)
Y_val_one_hot = one_hot_encode(y_val, num_classes)
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
n_samples = X_train_bias.shape[0]
if batch_size is None or batch_size == "Full Batch" or int(batch_size) >= n_samples:
actual_batch_size = n_samples
else:
actual_batch_size = int(batch_size)
for epoch in range(epochs):
# Shuffle training data
indices = np.random.permutation(n_samples)
X_train_shuffled = X_train_bias[indices]
Y_train_one_hot_shuffled = Y_train_one_hot[indices]
for i in range(0, n_samples, actual_batch_size):
X_batch = X_train_shuffled[i:i+actual_batch_size]
Y_batch = Y_train_one_hot_shuffled[i:i+actual_batch_size]
Y_batch_hat = predict_proba(X_batch, Theta)
gradient = compute_gradient(Y_batch_hat, Y_batch, X_batch)
Theta = update_theta(Theta, gradient, learning_rate)
# Compute metrics
Y_train_hat = predict_proba(X_train_bias, Theta)
train_loss = compute_loss(Y_train_hat, Y_train_one_hot)
train_losses.append(train_loss)
y_train_pred = predict_class(X_train_bias, Theta)
train_acc = compute_accuracy(y_train, y_train_pred)
train_accuracies.append(train_acc)
Y_val_hat = predict_proba(X_val_bias, Theta)
val_loss = compute_loss(Y_val_hat, Y_val_one_hot)
val_losses.append(val_loss)
y_val_pred = predict_class(X_val_bias, Theta)
val_acc = compute_accuracy(y_val, y_val_pred)
val_accuracies.append(val_acc)
return Theta, train_losses, val_losses, train_accuracies, val_accuracies, X_mean, X_std, y_val, y_val_pred
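# Usage sketch (hypothetical arrays, just to show the expected shapes):
#   X_tr, y_tr = np.random.randn(120, 4), np.random.randint(0, 3, 120)
#   X_va, y_va = np.random.randn(30, 4), np.random.randint(0, 3, 30)
#   (Theta, tr_loss, va_loss, tr_acc, va_acc,
#    mu, sd, y_v, y_p) = train_softmax_regression_with_validation(
#       X_tr, y_tr, X_va, y_va, num_classes=3, epochs=50,
#       learning_rate=0.1, batch_size=32)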
def create_confusion_matrix_chart(y_true, y_pred, num_classes):
"""Create confusion matrix visualization using plotly"""
cm = confusion_matrix(y_true, y_pred)
labels = [f"Class {i}" for i in range(num_classes)]
fig = px.imshow(cm,
labels=dict(x="Predicted Label", y="True Label", color="Count"),
x=labels,
y=labels,
text_auto=True,
color_continuous_scale='Blues')
fig.update_layout(
title="Confusion Matrix (Validation Set)",
plot_bgcolor="white",
height=400,
margin=dict(l=40, r=40, t=80, b=40)
)
return fig
def run_softmax_regression_and_visualize(df, target_col, new_point_dict,
epochs, learning_rate, batch_size_str="Full Batch",
train_test_split_ratio=0.8):
"""Run softmax regression training and generate visualizations"""
    if epochs < 1:
        return None, None, "Number of epochs must be ≥ 1."
    if learning_rate <= 0:
        return None, None, "Learning rate must be > 0."
    X, y, num_classes, new_point, feature_cols = preprocess_data(df, target_col, new_point_dict)
    test_size = 1.0 - train_test_split_ratio
    # Stratified splitting needs at least one sample per class on each side;
    # for this demo we assume the datasets are large enough for that to hold.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
start_time = time.time()
Theta, train_losses, val_losses, train_accuracies, val_accuracies, X_mean, X_std, y_val_final, y_val_pred_final = train_softmax_regression_with_validation(
X_train, y_train, X_val, y_val, num_classes, epochs, learning_rate, batch_size_str
)
training_time = time.time() - start_time
_set_current_model({
"Theta": Theta,
"feature_cols": feature_cols,
"X_mean": X_mean,
"X_std": X_std,
"num_classes": num_classes
})
# Make prediction for new point
new_point_norm = (new_point - X_mean) / X_std
new_point_bias = add_bias(new_point_norm)
prediction_proba = predict_proba(new_point_bias, Theta)[0]
prediction_class = np.argmax(prediction_proba)
final_train_loss = train_losses[-1]
final_val_loss = val_losses[-1]
final_train_acc = train_accuracies[-1]
final_val_acc = val_accuracies[-1]
train_loss_fig = create_training_loss_chart(train_losses, train_accuracies)
val_loss_fig = create_validation_loss_chart(val_losses, val_accuracies)
# confusion_fig = create_confusion_matrix_chart(y_val_final, y_val_pred_final, num_classes)
results_display = create_results_display(
Theta, prediction_proba, prediction_class, feature_cols, epochs, learning_rate, num_classes,
split_info={
"train_size": len(X_train),
"val_size": len(X_val),
"train_ratio": train_test_split_ratio,
"val_ratio": 1.0 - train_test_split_ratio,
"train_loss": final_train_loss,
"val_loss": final_val_loss,
"train_acc": final_train_acc,
"val_acc": final_val_acc,
"batch_size": batch_size_str,
"training_time": training_time
}
)
return train_loss_fig, val_loss_fig, results_display
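# End-to-end sketch (hypothetical call; assumes `df` was produced by
# load_data and uses its "target" column):
#   fig_train, fig_val, html = run_softmax_regression_and_visualize(
#       df, target_col="target", new_point_dict={"feature_0": 0.5},
#       epochs=100, learning_rate=0.1)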
def create_training_loss_chart(train_losses, train_accuracies):
"""Create training loss and accuracy visualization"""
    if not train_losses:
return None
epochs = list(range(1, len(train_losses) + 1))
valid_losses = [loss if not (np.isinf(loss) or np.isnan(loss)) else None for loss in train_losses]
fig = make_subplots(
rows=2, cols=1,
subplot_titles=("Training Loss (Categorical Cross-Entropy)", "Training Accuracy"),
vertical_spacing=0.15,
row_heights=[0.5, 0.5]
)
fig.add_trace(
go.Scatter(
x=epochs,
y=valid_losses,
mode='lines+markers',
name='Training Loss',
line=dict(color='#1976D2', width=3),
marker=dict(size=6),
showlegend=True
),
row=1, col=1
)
if train_accuracies and len(train_accuracies) == len(train_losses):
valid_accuracies = [acc * 100 if not (np.isinf(acc) or np.isnan(acc)) else None for acc in train_accuracies]
fig.add_trace(
go.Scatter(
x=epochs,
y=valid_accuracies,
mode='lines+markers',
name='Training Accuracy',
line=dict(color='#42A5F5', width=3),
marker=dict(size=6),
showlegend=True
),
row=2, col=1
)
fig.update_xaxes(title_text="Epoch", row=1, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Loss", row=1, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(title_text="Epoch", row=2, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Accuracy (%)", row=2, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray', range=[0, 100])
fig.update_layout(
title="Training Metrics Over Epochs",
plot_bgcolor="white",
height=600,
margin=dict(l=40, r=40, t=80, b=40)
)
return fig
def create_validation_loss_chart(val_losses, val_accuracies):
"""Create validation loss and accuracy visualization"""
    if not val_losses:
return None
epochs = list(range(1, len(val_losses) + 1))
valid_losses = [loss if not (np.isinf(loss) or np.isnan(loss)) else None for loss in val_losses]
fig = make_subplots(
rows=2, cols=1,
subplot_titles=("Validation Loss (Categorical Cross-Entropy)", "Validation Accuracy"),
vertical_spacing=0.15,
row_heights=[0.5, 0.5]
)
fig.add_trace(
go.Scatter(
x=epochs,
y=valid_losses,
mode='lines+markers',
name='Validation Loss',
line=dict(color='#7B1FA2', width=3),
marker=dict(size=6),
showlegend=True
),
row=1, col=1
)
if val_accuracies and len(val_accuracies) == len(val_losses):
valid_accuracies = [acc * 100 if not (np.isinf(acc) or np.isnan(acc)) else None for acc in val_accuracies]
fig.add_trace(
go.Scatter(
x=epochs,
y=valid_accuracies,
mode='lines+markers',
name='Validation Accuracy',
line=dict(color='#BA68C8', width=3),
marker=dict(size=6),
showlegend=True
),
row=2, col=1
)
fig.update_xaxes(title_text="Epoch", row=1, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Loss", row=1, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(title_text="Epoch", row=2, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Accuracy (%)", row=2, col=1, showgrid=True, gridwidth=1, gridcolor='lightgray', range=[0, 100])
fig.update_layout(
title="Validation Metrics Over Epochs",
plot_bgcolor="white",
height=600,
margin=dict(l=40, r=40, t=80, b=40)
)
return fig
def create_results_display(Theta, prediction_proba, prediction_class, feature_cols, epochs, learning_rate, num_classes, split_info):
"""Create HTML display showing model results"""
    # Theta is usually too large to print in full for multi-class models, so show its shape
theta_shape_str = f"{Theta.shape[0]} x {Theta.shape[1]}"
# Format predicted probabilities for each class
proba_str = "
".join([f"• Class {i}: {p:.4f} ({p*100:.2f}%)" for i, p in enumerate(prediction_proba)])
html_content = f"""
{theta_shape_str} (Features+Bias x Classes)