"""
MLP Model Evaluation Script for AI vs Human Music Detection
==========================================================

This script evaluates the performance of the trained MLP classifier on test data.
It gives a complete performance report showing how well the model can distinguish
between AI-generated and human-composed music.

What this script does:
- Loads our saved/trained MLP model
- Tests it on held-out test data (music the model has never seen)
- Calculates accuracy, precision, recall, and F1-score
- Reports confusion statistics (true positives, true negatives, false positives, false negatives)
- Displays sample predictions with probabilities for transparency

Quick Start:
---------------------------
# Basic evaluation with default model path
python evaluate.py

# Evaluate a specific model
python evaluate.py --model "models/fusion/mlp_multimodal.pth"

# From code
from evaluate import evaluate_model
results = evaluate_model("models/fusion/mlp_multimodal.pth")

Performance Metrics Explained:
------------------------------
- Accuracy: Overall correctness (how many songs classified correctly)
- Precision: Of songs predicted as human, how many actually were human
- Recall: Of all human songs, how many did we correctly identify  
- F1-Score: Balance between precision and recall (harmonic mean)
- Confusion stats: 
    TP = Human songs correctly identified  
    TN = AI songs correctly identified  
    FP = AI songs incorrectly labeled as human  
    FN = Human songs incorrectly labeled as AI  

Expected Output:
----------------
Loading model from: models/fusion/mlp_multimodal.pth
Loaded dataset: (50000, 684), Labels: 50000
Test set size: (10000, 684)
Evaluating model on test set...

Sample predictions:
True: 1, Pred: 1, Prob: 0.8234  # Correctly identified human song
True: 0, Pred: 0, Prob: 0.1456  # Correctly identified AI song
True: 1, Pred: 0, Prob: 0.4123  # Missed a human song (false negative)

=== Evaluation Results ===
Test Accuracy: 87.54%
Test Loss: 0.3412
Precision: 0.8832
Recall: 0.8654
F1-Score: 0.8742
"""

import argparse
import logging
import numpy as np
from pathlib import Path

from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ
from sklearn.model_selection import train_test_split

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def evaluate_model(model_path: str = "models/fusion/mlp_multimodal.pth"):
    logger.info(f"Loading model from: {model_path}")
    
    # Check if dataset exists
    if not Path(DATASET_NPZ).exists():
        raise FileNotFoundError(f"Dataset not found at {DATASET_NPZ}. Run train.py first.")
    
    # Load the full dataset
    loaded_data = np.load(DATASET_NPZ)
    X = loaded_data["X"]
    Y = loaded_data["Y"]
    
    logger.info(f"Loaded dataset: {X.shape}, Labels: {len(Y)}")
    
    # Split data (same as training)
    from src.utils.dataset import dataset_scaler
    data = dataset_scaler(X, Y)
    X_test, y_test = data["test"]
    
    logger.info(f"Test set size: {X_test.shape}")
    
    # Load configuration
    config = load_config("config/model_config.yml")
    
    # Build model architecture (needed for loading weights)
    mlp_classifier = build_mlp(input_dim=X_test.shape[1], config=config)
    
    # Load trained model
    mlp_classifier.load_model(model_path)
    
    # Evaluate on test set
    logger.info("Evaluating model on test set...")
    test_results = mlp_classifier.evaluate(X_test, y_test)
    
    # Get predictions for detailed analysis
    probabilities, predictions = mlp_classifier.predict(X_test)

    # Show a few sample predictions
    for i in range(10): 
        print(f"True: {y_test[i]}, Pred: {predictions[i]}, Prob: {probabilities[i]:.4f} "
              f"(Probability of predicted class)")
    
    logger.info("=== Evaluation Results ===")
    logger.info(f"Test Accuracy: {test_results['test_accuracy']:.2f}%")
    logger.info(f"Test Loss: {test_results['test_loss']:.4f}")
    
    # Additional statistics
    true_positives = np.sum((y_test == 1) & (predictions == 1))
    true_negatives = np.sum((y_test == 0) & (predictions == 0))
    false_positives = np.sum((y_test == 0) & (predictions == 1))
    false_negatives = np.sum((y_test == 1) & (predictions == 0))
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    logger.info(f"Precision: {precision:.4f}")
    logger.info(f"Recall: {recall:.4f}")
    logger.info(f"F1-Score: {f1_score:.4f}")
    
    # Include all metrics in return dict
    return {
        "test_accuracy": test_results["test_accuracy"],
        "test_loss": test_results["test_loss"],
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "true_positives": int(true_positives),
        "true_negatives": int(true_negatives),
        "false_positives": int(false_positives),
        "false_negatives": int(false_negatives)
    }


def main():
    """Main evaluation function."""
    parser = argparse.ArgumentParser(description='Evaluate Bach-or-Bot MLP classifier')
    parser.add_argument('--model', default='models/fusion/mlp_multimodal.pth',
                       help='Path to trained model')
    args = parser.parse_args()
    
    try:
        results = evaluate_model(args.model)
        logger.info("Evaluation completed successfully!")
    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
        raise


if __name__ == "__main__":
    main()