"""
MLP Model Evaluation Script for AI vs Human Music Detection
==========================================================
This script evaluates the performance of the trained MLP classifier on test data.
It gives a complete performance report showing how well the model can distinguish
between AI-generated and human-composed music.
What this script does:
- Loads our saved/trained MLP model
- Tests it on held-out test data (music the model has never seen)
- Calculates accuracy, precision, recall, and F1-score
- Reports confusion statistics (true positives, true negatives, false positives, false negatives)
- Displays sample predictions with probabilities for transparency
Quick Start:
---------------------------
# Basic evaluation with default model path
python evaluate.py
# Evaluate a specific model
python evaluate.py --model "models/fusion/mlp_multimodal.pth"
# From code
from evaluate import evaluate_model
results = evaluate_model("models/fusion/mlp_multimodal.pth")
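# The returned dict exposes test_accuracy, test_loss, precision, recall,
# f1_score, and the raw confusion counts, e.g.:
print(f"F1-Score: {results['f1_score']:.4f}")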
Performance Metrics Explained:
------------------------------
- Accuracy: Overall correctness (fraction of songs classified correctly)
- Precision: Of the songs predicted as human, how many actually were human
- Recall: Of all human songs, how many we correctly identified
- F1-Score: Harmonic mean of precision and recall, balancing the two
- Confusion stats (see the worked example below):
    TP = Human songs correctly identified
    TN = AI songs correctly identified
    FP = AI songs incorrectly labeled as human
    FN = Human songs incorrectly labeled as AI
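
Worked example (illustrative counts only, not real results): with TP=90,
TN=80, FP=10, FN=20 over 200 songs:
    accuracy  = (TP + TN) / total   = 170 / 200 = 0.850
    precision = TP / (TP + FP)      =  90 / 100 = 0.900
    recall    = TP / (TP + FN)      =  90 / 110 ≈ 0.818
    F1        = 2 * P * R / (P + R) ≈ 0.857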

Expected Output:
----------------
Loading model from: models/fusion/mlp_multimodal.pth
Loaded dataset: (50000, 684), Labels: 50000
Test set size: (10000, 684)
Evaluating model on test set...
Sample predictions:
    True: 1, Pred: 1, Prob: 0.8234   # Correctly identified human song
    True: 0, Pred: 0, Prob: 0.1456   # Correctly identified AI song
    True: 1, Pred: 0, Prob: 0.4123   # Missed a human song (false negative)
(Prob is the model's probability that the song is human, i.e. class 1.)
=== Evaluation Results ===
Test Accuracy: 87.54%
Test Loss: 0.3412
Precision: 0.8832
Recall: 0.8654
F1-Score: 0.8742
"""
import argparse
import logging

import numpy as np
from pathlib import Path

from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def evaluate_model(model_path: str = "models/fusion/mlp_multimodal.pth"):
    """Evaluate a trained MLP classifier on the held-out test split."""
    logger.info(f"Loading model from: {model_path}")

    # Check if the dataset exists
    if not Path(DATASET_NPZ).exists():
        raise FileNotFoundError(f"Dataset not found at {DATASET_NPZ}. Run train.py first.")

    # Load the full dataset
    loaded_data = np.load(DATASET_NPZ)
    X = loaded_data["X"]
    Y = loaded_data["Y"]
    logger.info(f"Loaded dataset: {X.shape}, Labels: {len(Y)}")

    # Split and scale the data (same procedure as training)
    from src.utils.dataset import dataset_scaler
    data = dataset_scaler(X, Y)
    X_test, y_test = data["test"]
    logger.info(f"Test set size: {X_test.shape}")

    # Load configuration
    config = load_config("config/model_config.yml")

    # Build the model architecture (needed before loading weights)
    mlp_classifier = build_mlp(input_dim=X_test.shape[1], config=config)

    # Load the trained weights
    mlp_classifier.load_model(model_path)

    # Evaluate on the test set
    logger.info("Evaluating model on test set...")
    test_results = mlp_classifier.evaluate(X_test, y_test)

    # Get per-sample predictions for detailed analysis
    probabilities, predictions = mlp_classifier.predict(X_test)

    # Show a few sample predictions (Prob = model probability of class 1, i.e. human)
    print("Sample predictions:")
    for i in range(min(10, len(y_test))):
        print(f"True: {y_test[i]}, Pred: {predictions[i]}, Prob: {probabilities[i]:.4f}")

    logger.info("=== Evaluation Results ===")
    logger.info(f"Test Accuracy: {test_results['test_accuracy']:.2f}%")
    logger.info(f"Test Loss: {test_results['test_loss']:.4f}")
    # Confusion statistics (class 1 = human, class 0 = AI)
    true_positives = np.sum((y_test == 1) & (predictions == 1))
    true_negatives = np.sum((y_test == 0) & (predictions == 0))
    false_positives = np.sum((y_test == 0) & (predictions == 1))
    false_negatives = np.sum((y_test == 1) & (predictions == 0))

    # Derived metrics, guarding against division by zero
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
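    # Cross-check (assumes scikit-learn is installed): for binary 0/1 labels,
    # sklearn.metrics.confusion_matrix yields the same counts; its ravel()
    # order is (tn, fp, fn, tp):
    #   from sklearn.metrics import confusion_matrix
    #   tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()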
logger.info(f"Precision: {precision:.4f}")
logger.info(f"Recall: {recall:.4f}")
logger.info(f"F1-Score: {f1_score:.4f}")
# Include all metrics in return dict
return {
"test_accuracy": test_results["test_accuracy"],
"test_loss": test_results["test_loss"],
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"true_positives": int(true_positives),
"true_negatives": int(true_negatives),
"false_positives": int(false_positives),
"false_negatives": int(false_negatives)
}
def main():
    """Parse command-line arguments and run the evaluation."""
    parser = argparse.ArgumentParser(description='Evaluate Bach-or-Bot MLP classifier')
    parser.add_argument('--model', default='models/fusion/mlp_multimodal.pth',
                        help='Path to trained model')
    args = parser.parse_args()

    try:
        evaluate_model(args.model)
        logger.info("Evaluation completed successfully!")
    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
        raise


if __name__ == "__main__":
    main()