# Source: Hugging Face Spaces — Pandaisop/voice-detection-api (Space status when captured: Sleeping)
# File size: 8,351 Bytes
# Revision: c5c9261

"""
Evaluation Engine — Comprehensive model evaluation with industry-standard metrics.
Produces: EER, AUC-ROC, per-language breakdown, calibration curves, confusion matrix.
"""
import os
import json
import yaml
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import logging
import time
from pathlib import Path
from torch.utils.data import DataLoader
from sklearn.metrics import (
    roc_curve, auc, precision_recall_fscore_support,
    confusion_matrix, accuracy_score, classification_report,
)

# Configure root logging at import time; each record renders as
# "timestamp | LEVEL | message" and INFO is the minimum level.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)


def compute_eer(labels, scores):
    """Estimate the Equal Error Rate (EER) from binary labels and scores.

    The ROC curve is computed with ``roc_curve``; the operating point where
    the false-positive rate and the false-negative rate (``1 - tpr``) are
    closest is selected, and the EER is reported as the mean of the two
    rates at that point.

    Returns:
        A tuple ``(eer, eer_threshold, fpr, tpr)``: the EER estimate, the
        score threshold at the selected ROC point, and the full
        false-positive / true-positive rate arrays from ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(labels, scores)
    false_neg_rate = 1 - true_pos_rate

    # NaN-aware argmin: NaN entries in the rate gap are ignored when picking
    # the crossing point.
    crossing = np.nanargmin(np.abs(false_pos_rate - false_neg_rate))

    equal_error = (false_pos_rate[crossing] + false_neg_rate[crossing]) / 2
    return equal_error, cutoffs[crossing], false_pos_rate, true_pos_rate


def compute_calibration(labels, probs, n_bins=10):
    """Compute Expected Calibration Error (ECE) and reliability diagram data.

    The interval [0, 1] is split into ``n_bins`` equal-width bins. For every
    non-empty bin the mean label ("accuracy", i.e. the observed positive rate)
    is compared with the mean predicted probability ("confidence"). ECE is the
    count-weighted average of the absolute gap between the two.

    Args:
        labels: Array-like of binary ground-truth labels (0/1).
        probs: Array-like of predicted probabilities for the positive class,
            same length as ``labels``.
        n_bins: Number of equal-width bins over [0, 1].

    Returns:
        A tuple ``(ece, bin_data)`` where ``bin_data`` is a list with one dict
        per non-empty bin holding ``bin_center``, ``accuracy``, ``confidence``
        and ``count``.
    """
    # Accept plain sequences as well as NumPy arrays.
    labels = np.asarray(labels)
    probs = np.asarray(probs, dtype=float)

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    last_bin = n_bins - 1
    bin_data = []

    for i in range(n_bins):
        lo, hi = bin_boundaries[i], bin_boundaries[i + 1]
        if i == last_bin:
            # Close the final bin on the right so that probabilities equal to
            # exactly 1.0 (common with saturated softmax outputs) are counted
            # instead of silently falling outside every bin.
            mask = (probs >= lo) & (probs <= hi)
        else:
            mask = (probs >= lo) & (probs < hi)

        bin_count = int(mask.sum())
        if bin_count == 0:
            continue

        bin_data.append({
            "bin_center": float((lo + hi) / 2),
            "accuracy": float(labels[mask].mean()),
            "confidence": float(probs[mask].mean()),
            "count": bin_count,
        })

    # ECE: each bin's |accuracy - confidence| weighted by its share of samples.
    total = len(labels)
    ece = sum(
        (b["count"] / total) * abs(b["accuracy"] - b["confidence"])
        for b in bin_data
    )
    return ece, bin_data


def _format_metric(value):
    """Render a metric with four decimals, or ``n/a`` when it is undefined."""
    return "n/a" if value is None else f"{value:.4f}"


def _collect_predictions(model, loader, device):
    """Run ``model`` over ``loader`` and gather labels, scores and timing.

    Returns:
        ``(labels, probs, preds, seconds_per_sample)``. The first three are
        NumPy arrays in loader order; ``probs`` is the softmax probability of
        class index 1 and ``preds`` the argmax class. ``seconds_per_sample``
        is the total forward-pass time divided by the number of samples, or
        ``None`` when the loader yielded nothing.
    """
    gathered_labels, gathered_probs, gathered_preds = [], [], []
    forward_seconds = 0.0
    n_samples = 0
    # CUDA kernels are launched asynchronously, so the clock must only be read
    # after the device has finished its work; otherwise only launch overhead
    # would be measured.
    on_cuda = str(device).startswith("cuda")

    with torch.no_grad():
        for batch in loader:
            inputs = batch["input_values"].to(device)
            labels = batch["labels"]

            if on_cuda:
                torch.cuda.synchronize()
            start = time.perf_counter()
            logits = model(inputs)
            if on_cuda:
                torch.cuda.synchronize()
            forward_seconds += time.perf_counter() - start
            n_samples += inputs.size(0)

            probs = F.softmax(logits, dim=-1)[:, 1].cpu().numpy()  # P(AI_GENERATED)
            preds = logits.argmax(dim=-1).cpu().numpy()

            gathered_labels.extend(labels.cpu().numpy())
            gathered_probs.extend(probs)
            gathered_preds.extend(preds)

    # Weight by sample count so a short final batch does not skew the average.
    seconds_per_sample = forward_seconds / n_samples if n_samples else None
    return (
        np.array(gathered_labels),
        np.array(gathered_probs),
        np.array(gathered_preds),
        seconds_per_sample,
    )


def _per_language_metrics(language_col, labels, probs, preds):
    """Compute sample count, EER and accuracy for each language value.

    Languages with fewer than two samples are skipped. EER is reported as
    ``None`` for a language whose samples all share one label, because the ROC
    curve (and therefore the EER) is undefined in that case.
    """
    breakdown = {}
    for lang in language_col.unique():
        mask = (language_col == lang).values
        n_lang = int(mask.sum())
        if n_lang < 2:
            continue

        l_labels = labels[mask]
        l_probs = probs[mask]
        l_preds = preds[mask]

        if np.unique(l_labels).size >= 2:
            l_eer, _, _, _ = compute_eer(l_labels, l_probs)
            l_eer = round(float(l_eer), 4)
        else:
            l_eer = None

        # str() keeps the key JSON-serialisable and printable even when the
        # column holds non-string values.
        breakdown[str(lang)] = {
            "samples": n_lang,
            "eer": l_eer,
            "accuracy": round(float(accuracy_score(l_labels, l_preds)), 4),
        }
    return breakdown


def _print_report(report, cm, n_samples):
    """Print the human-readable evaluation summary to stdout."""
    print("\n" + "=" * 60)
    print("  📊  MODEL EVALUATION REPORT")
    print("=" * 60)
    print(f"  Samples tested    : {n_samples}")
    print(f"  EER               : {_format_metric(report['eer'])}  "
          f"(threshold={_format_metric(report['eer_threshold'])})")
    print(f"  AUC-ROC           : {_format_metric(report['auc_roc'])}")
    print(f"  Accuracy          : {report['accuracy']:.4f}")
    print(f"  Precision         : {report['precision']:.4f}")
    print(f"  Recall            : {report['recall']:.4f}")
    print(f"  F1 Score          : {report['f1']:.4f}")
    if "ece" in report:
        print(f"  ECE (Calibration) : {report['ece']:.4f}")
    if "avg_latency_ms" in report:
        print(f"  Avg Latency       : {report['avg_latency_ms']:.1f} ms/sample")
    print()
    print("  Confusion Matrix:")
    print(f"    {'':12s} Pred HUMAN  Pred AI")
    print(f"    {'True HUMAN':12s}   {cm[0][0]:6d}    {cm[0][1]:6d}")
    print(f"    {'True AI':12s}   {cm[1][0]:6d}    {cm[1][1]:6d}")

    if "per_language" in report:
        print("\n  Per-Language Breakdown:")
        for lang, metrics in report["per_language"].items():
            print(f"    {lang:10s}: EER={_format_metric(metrics['eer'])}  "
                  f"Acc={metrics['accuracy']:.4f}  (n={metrics['samples']})")

    print("=" * 60 + "\n")


def evaluate_model(cfg: dict):
    """Full model evaluation pipeline.

    Loads ``<output_dir>/metadata/test.csv``, runs the model stored at
    ``<output_dir>/best_model/model.pt`` (or a freshly built model when no
    checkpoint exists), computes the metrics, prints a summary and writes
    ``evaluation_report.json`` and ``classification_report.json`` to
    ``<output_dir>/evaluation``.

    Args:
        cfg: Configuration dict. Reads ``cfg["paths"]["output_dir"]``,
            ``cfg["training"]["batch_size"]`` and the boolean switches
            ``calibration_curve``, ``latency_benchmark`` and ``per_language``
            under ``cfg["evaluation"]``. It is also passed through to
            ``AudioDataset`` and ``build_model``.

    Returns:
        The report dict, or ``None`` when ``test.csv`` is missing or the test
        set is empty. ``eer``, ``eer_threshold`` and ``auc_roc`` are ``None``
        when the test set contains a single class.
    """
    from train import AudioDataset, build_model

    device = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir = cfg["paths"]["output_dir"]
    metadata_dir = os.path.join(output_dir, "metadata")
    eval_dir = os.path.join(output_dir, "evaluation")
    os.makedirs(eval_dir, exist_ok=True)

    # Load test set
    test_csv = os.path.join(metadata_dir, "test.csv")
    if not os.path.exists(test_csv):
        logger.error("❌ test.csv not found. Run prepare_data.py first.")
        return

    test_dataset = AudioDataset(test_csv, cfg, augment=False)
    test_loader = DataLoader(
        test_dataset, batch_size=cfg["training"]["batch_size"],
        shuffle=False, num_workers=0
    )

    # Load model
    model, _ = build_model(cfg, device)
    model_path = os.path.join(output_dir, "best_model", "model.pt")
    if os.path.exists(model_path):
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(model_path, map_location=device))
        logger.info(f"✅ Loaded model from {model_path}")
    else:
        logger.warning("⚠️  No saved model found. Using randomly initialized model.")

    model.eval()

    # ============ Collect Predictions ============
    all_labels, all_probs, all_preds, seconds_per_sample = _collect_predictions(
        model, test_loader, device
    )

    if all_labels.size == 0:
        logger.error("❌ Test set is empty. Nothing to evaluate.")
        return

    # ============ Compute Metrics ============
    report = {}

    # 1-2. EER and AUC-ROC are only defined when both classes are present.
    if np.unique(all_labels).size >= 2:
        eer, eer_threshold, fpr, tpr = compute_eer(all_labels, all_probs)
        report["eer"] = round(float(eer), 4)
        report["eer_threshold"] = round(float(eer_threshold), 4)
        report["auc_roc"] = round(float(auc(fpr, tpr)), 4)
    else:
        logger.warning("⚠️  Test set contains a single class; EER and AUC-ROC are undefined.")
        report["eer"] = None
        report["eer_threshold"] = None
        report["auc_roc"] = None

    # 3. Accuracy, Precision, Recall, F1
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", pos_label=1
    )
    report["accuracy"] = round(float(accuracy), 4)
    report["precision"] = round(float(precision), 4)
    report["recall"] = round(float(recall), 4)
    report["f1"] = round(float(f1), 4)

    # 4. Confusion Matrix — pin the label set so the matrix is always 2x2,
    # even if one class never appears in labels or predictions.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
    report["confusion_matrix"] = cm.tolist()

    # 5. Calibration
    if cfg["evaluation"]["calibration_curve"]:
        ece, bin_data = compute_calibration(all_labels, all_probs)
        report["ece"] = round(float(ece), 4)
        report["calibration_bins"] = bin_data

    # 6. Latency
    if cfg["evaluation"]["latency_benchmark"]:
        report["avg_latency_ms"] = round(float(seconds_per_sample * 1000), 2)

    # 7. Per-language breakdown (if language column exists)
    if cfg["evaluation"]["per_language"]:
        test_df = pd.read_csv(test_csv)
        if "language" in test_df.columns:
            # The breakdown indexes predictions by CSV row, which assumes the
            # dataset yields one sample per row in file order — verify against
            # AudioDataset. A length mismatch proves the assumption is broken.
            if len(test_df) != len(all_labels):
                logger.warning(
                    "⚠️  test.csv has %d rows but %d predictions were collected; "
                    "skipping per-language breakdown.",
                    len(test_df), len(all_labels),
                )
            else:
                report["per_language"] = _per_language_metrics(
                    test_df["language"], all_labels, all_probs, all_preds
                )

    # ============ Print Report ============
    _print_report(report, cm, len(all_labels))

    # ============ Save Report ============
    report_path = os.path.join(eval_dir, "evaluation_report.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    logger.info(f"📋 Full report saved to {report_path}")

    # Save classification report (label set pinned to match target_names).
    cls_report = classification_report(
        all_labels, all_preds,
        labels=[0, 1],
        target_names=["HUMAN", "AI_GENERATED"],
        output_dict=True
    )
    with open(os.path.join(eval_dir, "classification_report.json"), "w", encoding="utf-8") as f:
        json.dump(cls_report, f, indent=2)

    return report


def main():
    """Command-line entry point.

    Parses ``--config`` (default ``config.yaml``), loads that YAML file with
    ``yaml.safe_load`` and hands the resulting configuration to
    ``evaluate_model``.
    """
    cli = argparse.ArgumentParser(description="Evaluate trained deepfake detection model")
    cli.add_argument("--config", type=str, default="config.yaml")
    config_path = cli.parse_args().config

    with open(config_path, "r") as config_file:
        settings = yaml.safe_load(config_file)

    evaluate_model(settings)


if __name__ == "__main__":
    main()