# --- Hugging Face Spaces page header captured by extraction; not source code ---
# Spaces:
# Runtime error
# Runtime error
"""
VoiceDetector - Forensic Deepfake Audio Detection
Using original AASIST model (EER: 0.83% on ASVspoof 2019 LA)
"""
| import os | |
| import sys | |
| import time | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import librosa | |
| import librosa.display | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| from PIL import Image | |
| import io | |
| # Import original AASIST model | |
| from aasist_model import Model as AASISTModel | |
# ============================================
# Detector Class
# ============================================
class AASISTDetector:
    """Deepfake-audio detector wrapping the original AASIST architecture.

    Pretrained weights are loaded from an ``AASIST.pth`` file located next
    to this script.  Input audio is resampled to 16 kHz mono and scored in
    ~4-second windows; longer clips are scored over several overlapping
    windows whose per-window probabilities are combined by majority voting.
    """

    def __init__(self):
        # Use the GPU when torch can see one, otherwise fall back to CPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.sample_rate = 16000  # model consumes 16 kHz raw waveforms
        self.max_length = 64600  # ~4 seconds
        # Original AASIST config
        self.model_config = {
            "architecture": "AASIST",
            "nb_samp": 64600,
            "first_conv": 128,
            "filts": [70, [1, 32], [32, 32], [32, 64], [64, 64]],
            "gat_dims": [64, 32],
            "pool_ratios": [0.5, 0.7, 0.5, 0.5],
            "temperatures": [2.0, 2.0, 100.0, 100.0]
        }
        self.model = AASISTModel(self.model_config).to(self.device)
        self._load_weights()
        # Inference only: freezes dropout / batch-norm statistics.
        self.model.eval()
        print(f"[AASIST] Loaded on {self.device}")
        print(f"[AASIST] Parameters: {sum(p.numel() for p in self.model.parameters()):,}")

    def _load_weights(self):
        """Load pretrained weights from ``AASIST.pth`` beside this file.

        If the file is missing, only an error is printed and the model keeps
        its random initialization — every prediction would then be noise.
        """
        weights_path = os.path.join(os.path.dirname(__file__), "AASIST.pth")
        if not os.path.exists(weights_path):
            print(f"[AASIST] ERROR: Weights not found at {weights_path}")
            return
        # NOTE(review): weights_only=False unpickles arbitrary objects; this
        # is acceptable only because the checkpoint ships with the app and is
        # never user-supplied.
        checkpoint = torch.load(weights_path, map_location=self.device, weights_only=False)
        # strict=False tolerates missing/unexpected keys, so a partially
        # incompatible checkpoint would load silently — TODO confirm intended.
        self.model.load_state_dict(checkpoint, strict=False)
        print(f"[AASIST] Weights loaded from {weights_path}")

    def analyze(self, audio_path):
        """Score an audio file and return a verdict dictionary.

        Args:
            audio_path: path to any audio file librosa can decode.

        Returns:
            dict with keys: ``prediction`` ("GENUINO" / "DEEPFAKE" /
            "SOSPECHOSO"), ``confidence``, ``prob_genuine``,
            ``prob_deepfake`` (percentages), ``processing_time_ms``,
            ``duration`` (seconds), ``segments_analyzed``,
            ``max_deepfake_segment`` and ``avg_deepfake``.
        """
        start_time = time.time()
        # Load audio (resampled to 16 kHz, mixed down to mono).
        audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
        original_duration = len(audio) / self.sample_rate
        # Peak-normalize so recording level does not influence the model.
        if np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio))
        # Multi-segment analysis for better detection
        # Analyze multiple segments and use weighted voting
        segment_results = []
        if len(audio) <= self.max_length:
            # Short audio: zero-pad to the model's fixed input length.
            padded = np.pad(audio, (0, self.max_length - len(audio)), mode='constant')
            segment_results.append(self._analyze_segment(padded))
        else:
            # Long audio: analyze multiple overlapping segments
            # Sample from beginning, middle, and end for comprehensive coverage
            step = self.max_length // 2  # 50% overlap
            for i in range(0, len(audio) - self.max_length + 1, step):
                segment = audio[i:i + self.max_length]
                segment_results.append(self._analyze_segment(segment))
            # Also analyze the last segment if we haven't covered the end
            # (i.e. the stride loop stopped short of the clip's tail).
            if len(audio) - self.max_length > (len(segment_results) - 1) * step:
                segment = audio[-self.max_length:]
                segment_results.append(self._analyze_segment(segment))
        # Aggregate per-segment probabilities.
        all_genuine = [r[0] for r in segment_results]
        all_deepfake = [r[1] for r in segment_results]
        max_deepfake = max(all_deepfake)
        avg_deepfake = np.mean(all_deepfake)
        avg_genuine = np.mean(all_genuine)
        # Count segments that are confidently (>0.6) deepfake vs genuine.
        n_deepfake_segs = sum(1 for d in all_deepfake if d > 0.6)
        n_genuine_segs = sum(1 for g in all_genuine if g > 0.6)
        total_segs = len(segment_results)
        # Majority voting with average as tiebreaker
        # If majority of segments agree, use that
        if n_deepfake_segs > total_segs * 0.5:
            # Deepfake majority: lean toward the worst (highest) segment.
            prob_deepfake = 0.6 * max_deepfake + 0.4 * avg_deepfake
            prob_genuine = 1.0 - prob_deepfake
        elif n_genuine_segs > total_segs * 0.5:
            # Genuine majority: plain segment averages.
            prob_genuine = avg_genuine
            prob_deepfake = avg_deepfake
        else:
            # Mixed results - use weighted average
            prob_deepfake = 0.5 * max_deepfake + 0.5 * avg_deepfake
            prob_genuine = 1.0 - prob_deepfake
        # Final verdict: 60% threshold either way, otherwise "suspicious".
        if prob_deepfake >= 0.60:
            prediction = "DEEPFAKE"
            confidence = prob_deepfake
        elif prob_genuine >= 0.60:
            prediction = "GENUINO"
            confidence = prob_genuine
        else:
            prediction = "SOSPECHOSO"
            confidence = max(prob_genuine, prob_deepfake)
        return {
            'prediction': prediction,
            'confidence': confidence * 100,
            'prob_genuine': prob_genuine * 100,
            'prob_deepfake': prob_deepfake * 100,
            'processing_time_ms': (time.time() - start_time) * 1000,
            'duration': original_duration,
            'segments_analyzed': len(segment_results),
            'max_deepfake_segment': max_deepfake * 100,
            'avg_deepfake': avg_deepfake * 100
        }

    def _analyze_segment(self, audio_segment):
        """Analyze a single audio segment and return (prob_genuine, prob_deepfake).

        Softmaxes the model's two output logits; this code treats index 0 as
        genuine and index 1 as deepfake.  NOTE(review): the official AASIST
        repo scores bonafide at index 1 — verify the convention matches the
        bundled checkpoint.
        """
        audio_tensor = torch.FloatTensor(audio_segment).unsqueeze(0).to(self.device)
        with torch.no_grad():
            # Model returns (embedding, logits); only the logits are used.
            _, output = self.model(audio_tensor)
            probs = torch.softmax(output, dim=1)
            prob_genuine = probs[0, 0].item()
            prob_deepfake = probs[0, 1].item()
        return (prob_genuine, prob_deepfake)
# ============================================
# Visualization
# ============================================
def create_spectrogram(audio_path):
    """Render a 2x2 panel of spectral views for the given audio file.

    Panels: waveform, mel spectrogram (dB), MFCC matrix, and spectral
    centroid over time.  Returns a PIL image, or None if anything fails.
    """
    try:
        signal, rate = librosa.load(audio_path, sr=16000)

        fig, panels = plt.subplots(2, 2, figsize=(12, 8))
        fig.suptitle('Analisis Espectral', fontsize=14, fontweight='bold')

        # Top-left: raw waveform.
        librosa.display.waveshow(signal, sr=rate, ax=panels[0, 0], color='#2E86AB')
        panels[0, 0].set_title('Forma de Onda')

        # Top-right: mel spectrogram converted to decibels.
        mel = librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_img = librosa.display.specshow(mel_db, sr=rate, x_axis='time',
                                           y_axis='mel', ax=panels[0, 1], cmap='magma')
        panels[0, 1].set_title('Espectrograma Mel')
        fig.colorbar(mel_img, ax=panels[0, 1], format='%+2.0f dB')

        # Bottom-left: MFCC coefficients.
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
        mfcc_img = librosa.display.specshow(coeffs, sr=rate, x_axis='time',
                                            ax=panels[1, 0], cmap='coolwarm')
        panels[1, 0].set_title('MFCC')
        fig.colorbar(mfcc_img, ax=panels[1, 0])

        # Bottom-right: spectral centroid trajectory.
        centroid = librosa.feature.spectral_centroid(y=signal, sr=rate)[0]
        times = librosa.frames_to_time(range(len(centroid)), sr=rate)
        panels[1, 1].plot(times, centroid, color='#E94F37', linewidth=1.5)
        panels[1, 1].fill_between(times, centroid, alpha=0.3, color='#E94F37')
        panels[1, 1].set_title('Centroide Espectral')
        panels[1, 1].set_xlabel('Tiempo (s)')
        panels[1, 1].set_ylabel('Hz')

        plt.tight_layout()

        # Serialize the figure into an in-memory PNG and hand it to PIL.
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        buffer.seek(0)
        image = Image.open(buffer)
        plt.close(fig)
        return image
    except Exception as e:
        print(f"Error creating spectrogram: {e}")
        return None
def create_confidence_chart(prob_genuine, prob_deepfake):
    """Draw a horizontal bar chart comparing genuine vs. deepfake probability.

    The winning class keeps its signature color (green for genuine, red for
    deepfake); the losing class is drawn in neutral gray.  Dashed guides mark
    the 50% midpoint and the 60% decision threshold.  Returns a PIL image.
    """
    fig, ax = plt.subplots(figsize=(8, 3))

    labels = ['GENUINO', 'DEEPFAKE']
    scores = [prob_genuine, prob_deepfake]
    neutral = '#6C757D'
    bar_colors = [
        '#28A745' if prob_genuine > prob_deepfake else neutral,
        '#DC3545' if prob_deepfake > prob_genuine else neutral,
    ]

    bars = ax.barh(labels, scores, color=bar_colors, height=0.5)
    # Annotate each bar with its percentage just past the bar's end.
    for rect, score in zip(bars, scores):
        ax.text(score + 1, rect.get_y() + rect.get_height() / 2,
                f'{score:.1f}%', va='center', fontweight='bold')

    ax.set_xlim(0, 105)
    ax.set_xlabel('Probabilidad (%)')
    ax.set_title('Distribucion de Confianza', fontweight='bold')
    ax.axvline(x=50, color='gray', linestyle='--', alpha=0.5)
    ax.axvline(x=60, color='orange', linestyle='--', alpha=0.7, label='Threshold (60%)')
    ax.legend(loc='lower right')

    plt.tight_layout()

    # Serialize the figure into an in-memory PNG and hand it to PIL.
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
    buffer.seek(0)
    image = Image.open(buffer)
    plt.close(fig)
    return image
# ============================================
# Main
# ============================================
# Lazily-constructed module-level detector (model load is expensive, so it
# is deferred until the first analysis request).
DETECTOR = None


def analyze_audio(audio_file):
    """Gradio callback: run deepfake analysis on an uploaded audio file.

    Args:
        audio_file: filepath string (or tempfile-like object) from the
            gr.Audio component; None when the input was cleared.

    Returns:
        Tuple of (prediction markdown, summary markdown, spectrogram image,
        confidence-chart image).  Image slots are None on error or no input.
    """
    global DETECTOR
    if audio_file is None:
        return "Esperando audio...", "", None, None
    if DETECTOR is None:
        DETECTOR = AASISTDetector()
    try:
        # Gradio may hand us a plain path or an object with a .name path.
        audio_path = audio_file if isinstance(audio_file, str) else audio_file.name
        result = DETECTOR.analyze(audio_path)
        prediction = result['prediction']
        confidence = result['confidence']
        # FIX: the verdict emojis were mojibake (mis-encoded UTF-8); restored
        # to the circles matching the legend colors in the UI.
        if prediction == "DEEPFAKE":
            pred_display = f"## 🔴 DEEPFAKE DETECTADO\n### Confianza: {confidence:.1f}%"
        elif prediction == "GENUINO":
            pred_display = f"## 🟢 AUDIO GENUINO\n### Confianza: {confidence:.1f}%"
        else:
            pred_display = f"## 🟡 SOSPECHOSO\n### Confianza: {confidence:.1f}%"
        summary = f"""
### Resultados
| Metrica | Valor |
|---------|-------|
| **Veredicto** | {prediction} |
| **Confianza** | {confidence:.1f}% |
| **Prob. Genuino** | {result['prob_genuine']:.1f}% |
| **Prob. Deepfake** | {result['prob_deepfake']:.1f}% |
| **Segmentos analizados** | {result.get('segments_analyzed', 1)} |
| **Max Deepfake (segmento)** | {result.get('max_deepfake_segment', result['prob_deepfake']):.1f}% |
| **Tiempo** | {result['processing_time_ms']:.0f}ms |
| **Duracion** | {result['duration']:.1f}s |
**Modelo:** AASIST (Multi-segment analysis)
"""
        spectrogram = create_spectrogram(audio_path)
        confidence_chart = create_confidence_chart(result['prob_genuine'], result['prob_deepfake'])
        return pred_display, summary, spectrogram, confidence_chart
    except Exception as e:
        # Log the full traceback server-side; show only the message in the UI.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return f"Error: {str(e)}", "", None, None
# ============================================
# Gradio Interface
# ============================================
# Gradio UI: audio input + analyze button on the left, verdict and summary on
# the right, spectral plots underneath.
# FIX: header/button/legend emojis were mojibake (mis-encoded UTF-8);
# restored from the surviving byte patterns (🔍 header, 🔬 button, colored
# circles matching the verdict markdown).
with gr.Blocks(title="VoiceDetector") as app:
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
                color: white; border-radius: 10px; margin-bottom: 20px;">
        <h1>🔍 VoiceDetector</h1>
        <h3>Deteccion de Deepfakes de Audio</h3>
        <p>AASIST | EER: 0.83%</p>
    </div>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Cargar Audio", type="filepath", sources=["upload", "microphone"])
            analyze_btn = gr.Button("🔬 Analizar", variant="primary", size="lg")
            gr.Markdown("""
            **Formatos:** WAV, MP3, FLAC, OGG
            **Resultado:**
            - 🟢 GENUINO: Audio real
            - 🔴 DEEPFAKE: Audio IA
            - 🟡 SOSPECHOSO: Revisar
            """)
        with gr.Column(scale=2):
            prediction_output = gr.Markdown(value="*Esperando audio...*")
            summary_output = gr.Markdown()
    with gr.Row():
        spectrogram_output = gr.Image(label="Analisis Espectral")
        confidence_output = gr.Image(label="Confianza")
    # Run analysis on explicit click and automatically when audio changes.
    analyze_btn.click(analyze_audio, inputs=audio_input,
                      outputs=[prediction_output, summary_output, spectrogram_output, confidence_output])
    audio_input.change(analyze_audio, inputs=audio_input,
                       outputs=[prediction_output, summary_output, spectrogram_output, confidence_output])

if __name__ == "__main__":
    # Bind to 0.0.0.0:7860 so the app is reachable from outside the container.
    app.launch(server_name="0.0.0.0", server_port=7860)