# voice-detection-api/trainer/evaluate_model.py
"""
Evaluation Engine — Comprehensive model evaluation with industry-standard metrics.
Produces: EER, AUC-ROC, per-language breakdown, calibration curves, confusion matrix.
"""
import os
import json
import yaml
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import logging
import time
from pathlib import Path
from torch.utils.data import DataLoader
from sklearn.metrics import (
    roc_curve, auc, precision_recall_fscore_support,
    confusion_matrix, accuracy_score, classification_report,
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)
def compute_eer(labels, scores):
    """Compute Equal Error Rate."""
    fpr, tpr, thresholds = roc_curve(labels, scores)
    fnr = 1 - tpr
    eer_idx = np.nanargmin(np.abs(fpr - fnr))
    eer = (fpr[eer_idx] + fnr[eer_idx]) / 2
    eer_threshold = thresholds[eer_idx]
    return eer, eer_threshold, fpr, tpr
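# Quick sanity check (hypothetical inputs, not part of the pipeline): EER is the
# error rate at the threshold where false accepts and false rejects balance, so
# perfectly separated scores give 0, e.g.
#   compute_eer(np.array([0, 0, 1, 1]), np.array([0.1, 0.2, 0.8, 0.9]))[0]  # -> 0.0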
def compute_calibration(labels, probs, n_bins=10):
    """Compute Expected Calibration Error (ECE) and reliability diagram data."""
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_data = []
    for i in range(n_bins):
        lo, hi = bin_boundaries[i], bin_boundaries[i + 1]
        # Make the last bin right-inclusive so probabilities of exactly 1.0 are counted.
        if i == n_bins - 1:
            mask = (probs >= lo) & (probs <= hi)
        else:
            mask = (probs >= lo) & (probs < hi)
        if mask.sum() == 0:
            continue
        bin_acc = labels[mask].mean()
        bin_conf = probs[mask].mean()
        bin_count = mask.sum()
        bin_data.append({
            "bin_center": (lo + hi) / 2,
            "accuracy": float(bin_acc),
            "confidence": float(bin_conf),
            "count": int(bin_count),
        })
    # ECE: count-weighted average gap between per-bin accuracy and confidence.
    total = len(labels)
    ece = sum(
        (b["count"] / total) * abs(b["accuracy"] - b["confidence"])
        for b in bin_data
    )
    return ece, bin_data
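# Worked example (hypothetical numbers): if 20% of the test set lands in the
# 0.9-1.0 bin with mean P(AI_GENERATED) of 0.95 but only 80% of those clips are
# actually AI-generated, that bin alone contributes 0.2 * |0.80 - 0.95| = 0.03 to ECE.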
def evaluate_model(cfg: dict):
    """Full model evaluation pipeline."""
    from train import AudioDataset, build_model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir = cfg["paths"]["output_dir"]
    metadata_dir = os.path.join(output_dir, "metadata")
    eval_dir = os.path.join(output_dir, "evaluation")
    os.makedirs(eval_dir, exist_ok=True)
    # Load test set
    test_csv = os.path.join(metadata_dir, "test.csv")
    if not os.path.exists(test_csv):
        logger.error("❌ test.csv not found. Run prepare_data.py first.")
        return
    test_dataset = AudioDataset(test_csv, cfg, augment=False)
    test_loader = DataLoader(
        test_dataset, batch_size=cfg["training"]["batch_size"],
        shuffle=False, num_workers=0
    )
    # Load model
    model, _ = build_model(cfg, device)
    model_path = os.path.join(output_dir, "best_model", "model.pt")
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, map_location=device))
        logger.info(f"✅ Loaded model from {model_path}")
    else:
        logger.warning("⚠️ No saved model found. Using randomly initialized model.")
    model.eval()
    # ============ Collect Predictions ============
    all_labels = []
    all_probs = []
    all_preds = []
    latencies = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch["input_values"].to(device)
            labels = batch["labels"]
            start = time.time()
            logits = model(inputs)
            if device == "cuda":
                # CUDA kernels launch asynchronously; synchronize so the timing
                # reflects the actual forward pass, not just the kernel launch.
                torch.cuda.synchronize()
            latency = (time.time() - start) / inputs.size(0)
            probs = F.softmax(logits, dim=-1)[:, 1].cpu().numpy()  # P(AI_GENERATED)
            preds = logits.argmax(dim=-1).cpu().numpy()
            all_labels.extend(labels.numpy())
            all_probs.extend(probs)
            all_preds.extend(preds)
            latencies.append(latency)
    all_labels = np.array(all_labels)
    all_probs = np.array(all_probs)
    all_preds = np.array(all_preds)
    # ============ Compute Metrics ============
    report = {}
    # 1. EER
    eer, eer_threshold, fpr, tpr = compute_eer(all_labels, all_probs)
    report["eer"] = round(float(eer), 4)
    report["eer_threshold"] = round(float(eer_threshold), 4)
    # 2. AUC-ROC
    auc_roc = auc(fpr, tpr)
    report["auc_roc"] = round(float(auc_roc), 4)
    # 3. Accuracy, Precision, Recall, F1
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", pos_label=1
    )
    report["accuracy"] = round(float(accuracy), 4)
    report["precision"] = round(float(precision), 4)
    report["recall"] = round(float(recall), 4)
    report["f1"] = round(float(f1), 4)
    # 4. Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds)
    report["confusion_matrix"] = cm.tolist()
    # 5. Calibration
    if cfg["evaluation"]["calibration_curve"]:
        ece, bin_data = compute_calibration(all_labels, all_probs)
        report["ece"] = round(float(ece), 4)
        report["calibration_bins"] = bin_data
    # 6. Latency
    if cfg["evaluation"]["latency_benchmark"]:
        avg_latency = np.mean(latencies)
        report["avg_latency_ms"] = round(float(avg_latency * 1000), 2)
    # 7. Per-language breakdown (if language column exists).
    # Slicing by CSV row masks assumes predictions follow test.csv row order
    # (the DataLoader above uses shuffle=False).
    if cfg["evaluation"]["per_language"]:
        test_df = pd.read_csv(test_csv)
        if "language" in test_df.columns:
            languages = test_df["language"].unique()
            lang_report = {}
            for lang in languages:
                mask = test_df["language"] == lang
                if mask.sum() < 2:
                    continue
                l_labels = all_labels[mask.values]
                l_probs = all_probs[mask.values]
                l_preds = all_preds[mask.values]
                l_eer, _, _, _ = compute_eer(l_labels, l_probs)
                l_acc = accuracy_score(l_labels, l_preds)
                lang_report[lang] = {
                    "samples": int(mask.sum()),
                    "eer": round(float(l_eer), 4),
                    "accuracy": round(float(l_acc), 4),
                }
            report["per_language"] = lang_report
    # ============ Print Report ============
    print("\n" + "=" * 60)
    print(" 📊 MODEL EVALUATION REPORT")
    print("=" * 60)
    print(f" Samples tested    : {len(all_labels)}")
    print(f" EER               : {report['eer']:.4f} (threshold={report['eer_threshold']:.4f})")
    print(f" AUC-ROC           : {report['auc_roc']:.4f}")
    print(f" Accuracy          : {report['accuracy']:.4f}")
    print(f" Precision         : {report['precision']:.4f}")
    print(f" Recall            : {report['recall']:.4f}")
    print(f" F1 Score          : {report['f1']:.4f}")
    if "ece" in report:
        print(f" ECE (Calibration) : {report['ece']:.4f}")
    if "avg_latency_ms" in report:
        print(f" Avg Latency       : {report['avg_latency_ms']:.1f} ms/sample")
    print()
    print(" Confusion Matrix:")
    print(f" {'':12s} Pred HUMAN Pred AI")
    print(f" {'True HUMAN':12s} {cm[0][0]:6d} {cm[0][1]:6d}")
    print(f" {'True AI':12s} {cm[1][0]:6d} {cm[1][1]:6d}")
    if "per_language" in report:
        print("\n Per-Language Breakdown:")
        for lang, metrics in report["per_language"].items():
            print(f" {lang:10s}: EER={metrics['eer']:.4f} Acc={metrics['accuracy']:.4f} "
                  f"(n={metrics['samples']})")
    print("=" * 60 + "\n")
    # ============ Save Report ============
    report_path = os.path.join(eval_dir, "evaluation_report.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    logger.info(f"📋 Full report saved to {report_path}")
    # Save classification report
    cls_report = classification_report(
        all_labels, all_preds,
        target_names=["HUMAN", "AI_GENERATED"],
        output_dict=True
    )
    with open(os.path.join(eval_dir, "classification_report.json"), "w") as f:
        json.dump(cls_report, f, indent=2)
    return report
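# Downstream sketch (hypothetical consumer; the path depends on paths.output_dir in
# the config): a serving layer could reuse the EER operating point as its default
# decision threshold for P(AI_GENERATED), e.g.
#   with open("outputs/evaluation/evaluation_report.json") as f:
#       default_threshold = json.load(f)["eer_threshold"]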
def main():
    parser = argparse.ArgumentParser(description="Evaluate trained deepfake detection model")
    parser.add_argument("--config", type=str, default="config.yaml")
    args = parser.parse_args()
    with open(args.config, "r") as f:
        cfg = yaml.safe_load(f)
    evaluate_model(cfg)
if __name__ == "__main__":
    main()