# app.py — DeepFake AI Forensics (Audio-Only, Fixed Label Mapping) import os import subprocess import tempfile import warnings warnings.filterwarnings('ignore') import torch import numpy as np import librosa import librosa.display import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import gradio as gr from transformers import AutoFeatureExtractor, AutoModelForAudioClassification # ========================================== # 1. MODEL LOADING # ========================================== MODEL_NAME = "Hemgg/Deepfake-audio-detection" print("[+] Loading AI forensic model...") extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME) model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME) model.eval() device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) print(f"[+] Model loaded on {device}") # ========================================== # 2. AUDIO PREPROCESSING (ROBUST) # ========================================== def normalize_audio(file_path): """ Converts ANY audio to standard 16kHz mono WAV via FFmpeg. Fixes WhatsApp voice notes (Opus/OGG disguised as MP3), corrupt headers, and exotic codecs. """ out = tempfile.mktemp(suffix=".wav") cmd = [ "ffmpeg", "-y", "-i", file_path, "-vn", # no video "-acodec", "pcm_s16le", # 16-bit PCM "-ar", "16000", # 16 kHz "-ac", "1", # mono "-af", "loudnorm=I=-16:TP=-1.5:LRA=11", # normalize levels out ] result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.returncode != 0: err = result.stderr.decode('utf-8', errors='ignore')[:300] raise RuntimeError(f"FFmpeg could not decode this file. It may be corrupted or use an unsupported codec.\nDetails: {err}") return out def convert_to_audio(file_path): ext = os.path.splitext(file_path)[1].lower().lstrip('.') # Supported audio formats (including WhatsApp Opus) audio_exts = ["wav", "mp3", "flac", "m4a", "ogg", "opus", "aac", "wma", "oga"] if ext in audio_exts: print(f"[+] Audio detected ({ext}) → normalizing via ffmpeg...") return normalize_audio(file_path) # Unknown extension? Try ffmpeg anyway as last resort print(f"[+] Unknown format ({ext}) → attempting ffmpeg decode...") try: return normalize_audio(file_path) except Exception: raise ValueError(f"Unsupported file format: {ext}. Please upload MP3, WAV, M4A, OGG, OPUS, or FLAC.") def load_audio(path): audio, _ = librosa.load(path, sr=16000) audio = librosa.util.normalize(audio) return audio # ========================================== # 3. INFERENCE & DSP # ========================================== def predict(audio): inputs = extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True).to(device) with torch.no_grad(): logits = model(**inputs).logits probs = torch.softmax(logits, dim=-1)[0] # CORRECTED: config.json says id2label: 0="AIVoice", 1="HumanVoice" ai_p = float(probs[0]) human_p = float(probs[1]) return ai_p, human_p def audio_features(audio): mfcc = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13) return { "mfcc_var": float(np.mean(np.var(mfcc, axis=1))), "energy": float(np.mean(audio ** 2)), "zcr": float(np.mean(librosa.feature.zero_crossing_rate(audio))), "spectral_centroid": float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=16000))), } def analyze(file_path): audio_path = convert_to_audio(file_path) audio = load_audio(audio_path) ai_p, human_p = predict(audio) feats = audio_features(audio) # Calibrated ensemble: neural model is primary (80%), DSP is secondary (20%) # DSP anomaly score — lower variance in MFCC and unnatural spectral centroid can indicate AI dsp_score = min(1.0, max(0.0, (feats["mfcc_var"] / 800.0) * 0.5 + (1.0 - min(feats["zcr"] * 5, 1.0)) * 0.3 + (feats["energy"] * 1.5) * 0.2 )) # Weighted fusion: trust the neural model more, use DSP as a soft modifier ai_score = np.clip((ai_p * 0.80 + dsp_score * 0.20), 0.0, 1.0) # Also compute human confidence for display human_score = 1.0 - ai_score if ai_score < 0.40: verdict = "HUMAN VOICE" level = "LOW RISK" color = "#059669" icon = "🧑" glow = "rgba(5,150,105,0.18)" elif ai_score < 0.60: verdict = "UNCERTAIN / MIXED" level = "MEDIUM RISK" color = "#d97706" icon = "⚠️" glow = "rgba(217,119,6,0.18)" else: verdict = "AI / SYNTHETIC VOICE" level = "HIGH RISK" color = "#dc2626" icon = "🤖" glow = "rgba(220,38,38,0.18)" confidence = int(max(ai_score, human_score) * 100) return verdict, level, confidence, ai_score, human_score, feats, audio_path, color, icon, glow # ========================================== # 4. VISUALIZATION # ========================================== def generate_audio_plots(audio_path): y, sr = librosa.load(audio_path, sr=16000, duration=10) fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 7)) fig.patch.set_facecolor('#f0f5ff') ax1.set_facecolor('#ffffff') librosa.display.waveshow(y, sr=sr, ax=ax1, color='#2563eb', alpha=0.85) ax1.set_title('Waveform Analysis', color='#1e293b', fontsize=13, fontweight='bold', pad=12) ax1.tick_params(colors='#64748b') for spine in ax1.spines.values(): spine.set_color('#cbd5e1') ax2.set_facecolor('#ffffff') mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) mel_db = librosa.power_to_db(mel, ref=np.max) img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='viridis') cbar = plt.colorbar(img, ax=ax2, format='%+2.0f dB') cbar.ax.yaxis.set_tick_params(color='#64748b') plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#64748b') ax2.set_title('Mel Spectrogram', color='#1e293b', fontsize=13, fontweight='bold', pad=12) ax2.tick_params(colors='#64748b') ax2.yaxis.label.set_color('#64748b') ax2.xaxis.label.set_color('#64748b') for spine in ax2.spines.values(): spine.set_color('#cbd5e1') plt.tight_layout() plot_path = '/tmp/audio_analysis.png' plt.savefig(plot_path, facecolor='#f0f5ff', bbox_inches='tight', dpi=150) plt.close() return plot_path # ========================================== # 5. HTML BUILDERS # ========================================== def confidence_circle(percentage, color): radius = 50 circumference = 2 * 3.14159 * radius offset = circumference - (percentage / 100) * circumference return f"""
Neural + DSP ensemble detection for synthetic voice identification.
Audio-only analysis
.mp3 to .ogg or .opus before uploading.
Hemgg/Deepfake-audio-detection
Wav2Vec 2.0 base model running on GPU/CPU with HuggingFace Transformers.
MFCC variance + signal energy + zero-crossing rate + spectral centroid fused with neural output (80/20 weighting).
FFmpeg extracts and normalizes audio from any format — including WhatsApp voice notes with disguised extensions.