""" VoiceDetector - Forensic Deepfake Audio Detection Using original AASIST model (EER: 0.83% on ASVspoof 2019 LA) """ import os import sys import time import gradio as gr import numpy as np import torch import librosa import librosa.display import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from PIL import Image import io # Import original AASIST model from aasist_model import Model as AASISTModel # ============================================ # Detector Class # ============================================ class AASISTDetector: def __init__(self): self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.sample_rate = 16000 self.max_length = 64600 # ~4 seconds # Original AASIST config self.model_config = { "architecture": "AASIST", "nb_samp": 64600, "first_conv": 128, "filts": [70, [1, 32], [32, 32], [32, 64], [64, 64]], "gat_dims": [64, 32], "pool_ratios": [0.5, 0.7, 0.5, 0.5], "temperatures": [2.0, 2.0, 100.0, 100.0] } self.model = AASISTModel(self.model_config).to(self.device) self._load_weights() self.model.eval() print(f"[AASIST] Loaded on {self.device}") print(f"[AASIST] Parameters: {sum(p.numel() for p in self.model.parameters()):,}") def _load_weights(self): weights_path = os.path.join(os.path.dirname(__file__), "AASIST.pth") if not os.path.exists(weights_path): print(f"[AASIST] ERROR: Weights not found at {weights_path}") return checkpoint = torch.load(weights_path, map_location=self.device, weights_only=False) self.model.load_state_dict(checkpoint, strict=False) print(f"[AASIST] Weights loaded from {weights_path}") def analyze(self, audio_path): start_time = time.time() # Load audio audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True) original_duration = len(audio) / self.sample_rate # Normalize if np.max(np.abs(audio)) > 0: audio = audio / np.max(np.abs(audio)) # Multi-segment analysis for better detection # Analyze multiple segments and use weighted voting segment_results = [] if len(audio) <= self.max_length: # Short audio: analyze as single segment padded = np.pad(audio, (0, self.max_length - len(audio)), mode='constant') segment_results.append(self._analyze_segment(padded)) else: # Long audio: analyze multiple overlapping segments # Sample from beginning, middle, and end for comprehensive coverage step = self.max_length // 2 # 50% overlap for i in range(0, len(audio) - self.max_length + 1, step): segment = audio[i:i + self.max_length] segment_results.append(self._analyze_segment(segment)) # Also analyze the last segment if we haven't covered the end if len(audio) - self.max_length > (len(segment_results) - 1) * step: segment = audio[-self.max_length:] segment_results.append(self._analyze_segment(segment)) # Aggregate results with balanced approach all_genuine = [r[0] for r in segment_results] all_deepfake = [r[1] for r in segment_results] max_deepfake = max(all_deepfake) avg_deepfake = np.mean(all_deepfake) avg_genuine = np.mean(all_genuine) # Count how many segments are deepfake vs genuine n_deepfake_segs = sum(1 for d in all_deepfake if d > 0.6) n_genuine_segs = sum(1 for g in all_genuine if g > 0.6) total_segs = len(segment_results) # Majority voting with average as tiebreaker # If majority of segments agree, use that if n_deepfake_segs > total_segs * 0.5: # More than half segments are deepfake prob_deepfake = 0.6 * max_deepfake + 0.4 * avg_deepfake prob_genuine = 1.0 - prob_deepfake elif n_genuine_segs > total_segs * 0.5: # More than half segments are genuine prob_genuine = avg_genuine prob_deepfake = avg_deepfake else: # Mixed results - use weighted average prob_deepfake = 0.5 * max_deepfake + 0.5 * avg_deepfake prob_genuine = 1.0 - prob_deepfake # Prediction thresholds if prob_deepfake >= 0.60: prediction = "DEEPFAKE" confidence = prob_deepfake elif prob_genuine >= 0.60: prediction = "GENUINO" confidence = prob_genuine else: prediction = "SOSPECHOSO" confidence = max(prob_genuine, prob_deepfake) return { 'prediction': prediction, 'confidence': confidence * 100, 'prob_genuine': prob_genuine * 100, 'prob_deepfake': prob_deepfake * 100, 'processing_time_ms': (time.time() - start_time) * 1000, 'duration': original_duration, 'segments_analyzed': len(segment_results), 'max_deepfake_segment': max_deepfake * 100, 'avg_deepfake': avg_deepfake * 100 } def _analyze_segment(self, audio_segment): """Analyze a single audio segment and return (prob_genuine, prob_deepfake)""" audio_tensor = torch.FloatTensor(audio_segment).unsqueeze(0).to(self.device) with torch.no_grad(): _, output = self.model(audio_tensor) probs = torch.softmax(output, dim=1) prob_genuine = probs[0, 0].item() prob_deepfake = probs[0, 1].item() return (prob_genuine, prob_deepfake) # ============================================ # Visualization # ============================================ def create_spectrogram(audio_path): try: y, sr = librosa.load(audio_path, sr=16000) fig, axes = plt.subplots(2, 2, figsize=(12, 8)) fig.suptitle('Analisis Espectral', fontsize=14, fontweight='bold') librosa.display.waveshow(y, sr=sr, ax=axes[0, 0], color='#2E86AB') axes[0, 0].set_title('Forma de Onda') S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) S_dB = librosa.power_to_db(S, ref=np.max) img = librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', ax=axes[0, 1], cmap='magma') axes[0, 1].set_title('Espectrograma Mel') fig.colorbar(img, ax=axes[0, 1], format='%+2.0f dB') mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) img2 = librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=axes[1, 0], cmap='coolwarm') axes[1, 0].set_title('MFCC') fig.colorbar(img2, ax=axes[1, 0]) centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0] t = librosa.frames_to_time(range(len(centroid)), sr=sr) axes[1, 1].plot(t, centroid, color='#E94F37', linewidth=1.5) axes[1, 1].fill_between(t, centroid, alpha=0.3, color='#E94F37') axes[1, 1].set_title('Centroide Espectral') axes[1, 1].set_xlabel('Tiempo (s)') axes[1, 1].set_ylabel('Hz') plt.tight_layout() buf = io.BytesIO() plt.savefig(buf, format='png', dpi=100, bbox_inches='tight') buf.seek(0) img = Image.open(buf) plt.close(fig) return img except Exception as e: print(f"Error creating spectrogram: {e}") return None def create_confidence_chart(prob_genuine, prob_deepfake): fig, ax = plt.subplots(figsize=(8, 3)) categories = ['GENUINO', 'DEEPFAKE'] values = [prob_genuine, prob_deepfake] colors = ['#28A745' if prob_genuine > prob_deepfake else '#6C757D', '#DC3545' if prob_deepfake > prob_genuine else '#6C757D'] bars = ax.barh(categories, values, color=colors, height=0.5) for bar, val in zip(bars, values): ax.text(val + 1, bar.get_y() + bar.get_height()/2, f'{val:.1f}%', va='center', fontweight='bold') ax.set_xlim(0, 105) ax.set_xlabel('Probabilidad (%)') ax.set_title('Distribucion de Confianza', fontweight='bold') ax.axvline(x=50, color='gray', linestyle='--', alpha=0.5) ax.axvline(x=60, color='orange', linestyle='--', alpha=0.7, label='Threshold (60%)') ax.legend(loc='lower right') plt.tight_layout() buf = io.BytesIO() plt.savefig(buf, format='png', dpi=100, bbox_inches='tight') buf.seek(0) img = Image.open(buf) plt.close(fig) return img # ============================================ # Main # ============================================ DETECTOR = None def analyze_audio(audio_file): global DETECTOR if audio_file is None: return "Esperando audio...", "", None, None if DETECTOR is None: DETECTOR = AASISTDetector() try: audio_path = audio_file if isinstance(audio_file, str) else audio_file.name result = DETECTOR.analyze(audio_path) prediction = result['prediction'] confidence = result['confidence'] if prediction == "DEEPFAKE": pred_display = f"## 🔴 DEEPFAKE DETECTADO\n### Confianza: {confidence:.1f}%" elif prediction == "GENUINO": pred_display = f"## 🟢 AUDIO GENUINO\n### Confianza: {confidence:.1f}%" else: pred_display = f"## 🟡 SOSPECHOSO\n### Confianza: {confidence:.1f}%" summary = f""" ### Resultados | Metrica | Valor | |---------|-------| | **Veredicto** | {prediction} | | **Confianza** | {confidence:.1f}% | | **Prob. Genuino** | {result['prob_genuine']:.1f}% | | **Prob. Deepfake** | {result['prob_deepfake']:.1f}% | | **Segmentos analizados** | {result.get('segments_analyzed', 1)} | | **Max Deepfake (segmento)** | {result.get('max_deepfake_segment', result['prob_deepfake']):.1f}% | | **Tiempo** | {result['processing_time_ms']:.0f}ms | | **Duracion** | {result['duration']:.1f}s | **Modelo:** AASIST (Multi-segment analysis) """ spectrogram = create_spectrogram(audio_path) confidence_chart = create_confidence_chart(result['prob_genuine'], result['prob_deepfake']) return pred_display, summary, spectrogram, confidence_chart except Exception as e: import traceback error_msg = f"Error: {str(e)}\n{traceback.format_exc()}" print(error_msg) return f"Error: {str(e)}", "", None, None # ============================================ # Gradio Interface # ============================================ with gr.Blocks(title="VoiceDetector") as app: gr.HTML("""
AASIST | EER: 0.83%