"""
MLP Model Evaluation Script for AI vs Human Music Detection
==========================================================

This script evaluates the trained MLP classifier on held-out test data and
produces a complete performance report showing how well the model
distinguishes AI-generated from human-composed music.

What this script does:
- Loads the saved, trained MLP model
- Tests it on held-out test data (music the model has never seen)
- Calculates accuracy, precision, recall, and F1-score
- Reports confusion statistics (true positives, true negatives, false positives, false negatives)
- Displays sample predictions with probabilities for transparency

Quick Start:
------------
# Basic evaluation with default model path
python evaluate.py

# Evaluate a specific model
python evaluate.py --model "models/fusion/mlp_multimodal.pth"

# From code
from evaluate import evaluate_model
results = evaluate_model("models/fusion/mlp_multimodal.pth")

Performance Metrics Explained:
------------------------------
- Accuracy: Overall correctness (fraction of songs classified correctly)
- Precision: Of the songs predicted as human, how many were actually human
- Recall: Of all human songs, how many we correctly identified
- F1-Score: Harmonic mean of precision and recall, balancing the two
- Confusion stats: 
    TP = Human songs correctly identified  
    TN = AI songs correctly identified  
    FP = AI songs incorrectly labeled as human  
    FN = Human songs incorrectly labeled as AI  
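
The exact formulas used by this script (see the metric computation below):
    Precision = TP / (TP + FP)
    Recall    = TP / (TP + FN)
    F1-Score  = 2 * (Precision * Recall) / (Precision + Recall)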

Expected Output:
----------------
Loading model from: models/fusion/mlp_multimodal.pth
Loaded dataset: (50000, 684), Labels: 50000
Test set size: (10000, 684)
Evaluating model on test set...

Sample predictions:
True: 1, Pred: 1, Prob: 0.8234  # Correctly identified human song
True: 0, Pred: 0, Prob: 0.1456  # Correctly identified AI song
True: 1, Pred: 0, Prob: 0.4123  # Missed a human song (false negative)

=== Evaluation Results ===
Test Accuracy: 87.54%
Test Loss: 0.3412
Precision: 0.8832
Recall: 0.8654
F1-Score: 0.8742
"""

import argparse
import logging
import numpy as np
from pathlib import Path

from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ
from src.utils.dataset import dataset_scaler

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def evaluate_model(model_path: str = "models/fusion/mlp_multimodal.pth"):
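    """Evaluate a trained MLP classifier on the held-out test split.

    Args:
        model_path: Path to the saved model weights (.pth file).

    Returns:
        dict with test accuracy, test loss, precision, recall, F1-score,
        and confusion-matrix counts (TP/TN/FP/FN).
    """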
    logger.info(f"Loading model from: {model_path}")
    
    # Check if dataset exists
    if not Path(DATASET_NPZ).exists():
        raise FileNotFoundError(f"Dataset not found at {DATASET_NPZ}. Run train.py first.")
    
    # Load the full dataset
    loaded_data = np.load(DATASET_NPZ)
    X = loaded_data["X"]
    Y = loaded_data["Y"]
    
    logger.info(f"Loaded dataset: {X.shape}, Labels: {len(Y)}")
    
    # Split and scale the data exactly as during training
    data = dataset_scaler(X, Y)
    X_test, y_test = data["test"]
    
    logger.info(f"Test set size: {X_test.shape}")
    
    # Load configuration
    config = load_config("config/model_config.yml")
    
    # Build model architecture (needed for loading weights)
    mlp_classifier = build_mlp(input_dim=X_test.shape[1], config=config)
    
    # Load trained model
    mlp_classifier.load_model(model_path)
    
    # Evaluate on test set
    logger.info("Evaluating model on test set...")
    test_results = mlp_classifier.evaluate(X_test, y_test)
    
    # Get per-sample probabilities and hard 0/1 predictions for detailed analysis
    probabilities, predictions = mlp_classifier.predict(X_test)

    # Show a few sample predictions; per the Expected Output above, the
    # probability is the model's confidence that the song is human (class 1)
    print("\nSample predictions:")
    for i in range(min(10, len(y_test))):
        print(f"True: {y_test[i]}, Pred: {predictions[i]}, Prob: {probabilities[i]:.4f} "
              f"(probability of the human class)")
    
    logger.info("=== Evaluation Results ===")
    logger.info(f"Test Accuracy: {test_results['test_accuracy']:.2f}%")
    logger.info(f"Test Loss: {test_results['test_loss']:.4f}")
    
    # Confusion-matrix counts (class 1 = human, class 0 = AI-generated)
    true_positives = np.sum((y_test == 1) & (predictions == 1))
    true_negatives = np.sum((y_test == 0) & (predictions == 0))
    false_positives = np.sum((y_test == 0) & (predictions == 1))
    false_negatives = np.sum((y_test == 1) & (predictions == 0))
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    logger.info(f"Precision: {precision:.4f}")
    logger.info(f"Recall: {recall:.4f}")
    logger.info(f"F1-Score: {f1_score:.4f}")
    
    # Include all metrics in return dict
    return {
        "test_accuracy": test_results["test_accuracy"],
        "test_loss": test_results["test_loss"],
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "true_positives": int(true_positives),
        "true_negatives": int(true_negatives),
        "false_positives": int(false_positives),
        "false_negatives": int(false_negatives)
    }


def main():
    """Main evaluation function."""
    parser = argparse.ArgumentParser(description='Evaluate Bach-or-Bot MLP classifier')
    parser.add_argument('--model', default='models/fusion/mlp_multimodal.pth',
                        help='Path to trained model')
    args = parser.parse_args()
    
    try:
        evaluate_model(args.model)
        logger.info("Evaluation completed successfully!")
    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
        raise


if __name__ == "__main__":
    main()