# app.py — DeepFake AI Forensics (Audio-Only, Fixed Label Mapping)
"""Gradio front end for audio deepfake detection.

Pipeline: an uploaded file is normalized to 16 kHz mono WAV with FFmpeg,
scored by a HuggingFace audio-classification model, blended with a small
hand-written DSP score, and rendered as a plot plus an HTML report.

Importing this module loads the model (module-level side effect).
"""
import html
import math
import os
import subprocess
import tempfile
import warnings

warnings.filterwarnings('ignore')

import torch
import numpy as np
import librosa
import librosa.display
import matplotlib

# The backend must be selected before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import gradio as gr
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# ==========================================
# 1. MODEL LOADING
# ==========================================
MODEL_NAME = "Hemgg/Deepfake-audio-detection"

print("[+] Loading AI forensic model...")
extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"[+] Model loaded on {device}")


# ==========================================
# 2. AUDIO PREPROCESSING (ROBUST)
# ==========================================
# Extensions that are announced as "audio" in the log; every other extension
# is still handed to FFmpeg as a last resort.
_AUDIO_EXTS = ("wav", "mp3", "flac", "m4a", "ogg", "opus", "aac", "wma", "oga")


def _remove_quietly(path):
    """Best-effort delete of a temporary file; a missing file is not an error."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never mask the real result or the real exception.
        pass


def normalize_audio(file_path):
    """
    Converts ANY audio to standard 16kHz mono WAV via FFmpeg.
    Fixes WhatsApp voice notes (Opus/OGG disguised as MP3),
    corrupt headers, and exotic codecs.

    Returns the path of a newly created temporary WAV file; the caller owns
    it and is responsible for deleting it.

    Raises:
        RuntimeError: FFmpeg exited with a non-zero status.
        OSError: the ffmpeg executable could not be started.
    """
    # mkstemp creates the file atomically (mktemp only returns a name, which
    # is race-prone and deprecated). FFmpeg overwrites it because of "-y".
    fd, out = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg", "-y",
        # Without these, stderr starts with the version banner and the
        # 300-character excerpt below never reaches the actual error.
        "-hide_banner", "-loglevel", "error",
        "-i", file_path,
        "-vn",                                   # no video
        "-acodec", "pcm_s16le",                  # 16-bit PCM
        "-ar", "16000",                          # 16 kHz
        "-ac", "1",                              # mono
        "-af", "loudnorm=I=-16:TP=-1.5:LRA=11",  # normalize levels
        out,
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        _remove_quietly(out)
        raise
    if result.returncode != 0:
        _remove_quietly(out)
        err = result.stderr.decode('utf-8', errors='ignore')[:300]
        raise RuntimeError(
            "FFmpeg could not decode this file. "
            f"It may be corrupted or use an unsupported codec.\nDetails: {err}"
        )
    return out


def convert_to_audio(file_path):
    """Normalize *file_path* through FFmpeg and return the temporary WAV path.

    Known audio extensions propagate FFmpeg errors unchanged. For any other
    extension a failure is reported as ``ValueError`` ("Unsupported file
    format"), with the underlying error chained as its cause.
    """
    ext = os.path.splitext(file_path)[1].lower().lstrip('.')

    # Supported audio formats (including WhatsApp Opus)
    if ext in _AUDIO_EXTS:
        print(f"[+] Audio detected ({ext}) → normalizing via ffmpeg...")
        return normalize_audio(file_path)

    # Unknown extension? Try ffmpeg anyway as last resort
    print(f"[+] Unknown format ({ext}) → attempting ffmpeg decode...")
    try:
        return normalize_audio(file_path)
    except Exception as err:
        raise ValueError(
            f"Unsupported file format: {ext}. Please upload MP3, WAV, M4A, OGG, OPUS, or FLAC."
        ) from err


def load_audio(path):
    """Load *path* at 16 kHz with librosa and peak-normalize the samples.

    Raises:
        ValueError: the decoded file contains no samples.
    """
    audio, _ = librosa.load(path, sr=16000)
    if audio.size == 0:
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError("The uploaded file contains no audio samples.")
    audio = librosa.util.normalize(audio)
    return audio


# ==========================================
# 3. INFERENCE & DSP
# ==========================================
def predict(audio):
    """Run the classifier on *audio* and return ``(ai_p, human_p)`` as floats."""
    inputs = extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)[0]

    # CORRECTED: config.json says id2label: 0="AIVoice", 1="HumanVoice"
    ai_p = float(probs[0])
    human_p = float(probs[1])
    return ai_p, human_p


def audio_features(audio):
    """Return the four scalar DSP descriptors shown in the report."""
    mfcc = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13)
    return {
        "mfcc_var": float(np.mean(np.var(mfcc, axis=1))),
        "energy": float(np.mean(audio ** 2)),
        "zcr": float(np.mean(librosa.feature.zero_crossing_rate(audio))),
        "spectral_centroid": float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=16000))),
    }


def analyze(file_path):
    """Score one uploaded file.

    Returns the 10-tuple ``(verdict, level, confidence, ai_score,
    human_score, feats, audio_path, color, icon, glow)``. ``audio_path`` is
    the temporary normalized WAV; the caller must delete it. If scoring
    fails, the temporary file is removed here before the error propagates.
    """
    audio_path = convert_to_audio(file_path)
    try:
        audio = load_audio(audio_path)
        ai_p, _human_p = predict(audio)
        feats = audio_features(audio)
    except Exception:
        _remove_quietly(audio_path)
        raise

    # Calibrated ensemble: neural model is primary (80%), DSP is secondary (20%)
    # DSP anomaly score — lower variance in MFCC and unnatural spectral centroid can indicate AI
    # NOTE(review): the formula below *raises* the score as MFCC variance
    # grows and never reads spectral_centroid, which does not match the
    # comment above — confirm the intended direction before changing it.
    dsp_score = min(1.0, max(0.0,
        (feats["mfcc_var"] / 800.0) * 0.5 +
        (1.0 - min(feats["zcr"] * 5, 1.0)) * 0.3 +
        (feats["energy"] * 1.5) * 0.2
    ))

    # Weighted fusion: trust the neural model more, use DSP as a soft modifier
    ai_score = float(np.clip((ai_p * 0.80 + dsp_score * 0.20), 0.0, 1.0))

    # Also compute human confidence for display
    human_score = 1.0 - ai_score

    if ai_score < 0.40:
        verdict = "HUMAN VOICE"
        level = "LOW RISK"
        color = "#059669"
        icon = "🧑"
        glow = "rgba(5,150,105,0.18)"
    elif ai_score < 0.60:
        verdict = "UNCERTAIN / MIXED"
        level = "MEDIUM RISK"
        color = "#d97706"
        icon = "⚠️"
        glow = "rgba(217,119,6,0.18)"
    else:
        verdict = "AI / SYNTHETIC VOICE"
        level = "HIGH RISK"
        color = "#dc2626"
        icon = "🤖"
        glow = "rgba(220,38,38,0.18)"

    confidence = int(max(ai_score, human_score) * 100)
    return verdict, level, confidence, ai_score, human_score, feats, audio_path, color, icon, glow


# ==========================================
# 4. VISUALIZATION
# ==========================================
def generate_audio_plots(audio_path):
    """Render waveform + mel spectrogram of the first 10 s and return the PNG path."""
    y, sr = librosa.load(audio_path, sr=16000, duration=10)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 7))
    try:
        fig.patch.set_facecolor('#f0f5ff')

        ax1.set_facecolor('#ffffff')
        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#2563eb', alpha=0.85)
        ax1.set_title('Waveform Analysis', color='#1e293b', fontsize=13, fontweight='bold', pad=12)
        ax1.tick_params(colors='#64748b')
        for spine in ax1.spines.values():
            spine.set_color('#cbd5e1')

        ax2.set_facecolor('#ffffff')
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='viridis')
        cbar = fig.colorbar(img, ax=ax2, format='%+2.0f dB')
        cbar.ax.yaxis.set_tick_params(color='#64748b')
        plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#64748b')
        ax2.set_title('Mel Spectrogram', color='#1e293b', fontsize=13, fontweight='bold', pad=12)
        ax2.tick_params(colors='#64748b')
        ax2.yaxis.label.set_color('#64748b')
        ax2.xaxis.label.set_color('#64748b')
        for spine in ax2.spines.values():
            spine.set_color('#cbd5e1')

        fig.tight_layout()
        plot_path = '/tmp/audio_analysis.png'
        fig.savefig(plot_path, facecolor='#f0f5ff', bbox_inches='tight', dpi=150)
    finally:
        # Close this specific figure even when plotting fails, so figures do
        # not accumulate in pyplot's registry across requests.
        plt.close(fig)
    return plot_path


# ==========================================
# 5. HTML BUILDERS
# ==========================================
def confidence_circle(percentage, color):
    """Return the confidence-ring fragment for *percentage* (0-100)."""
    # NOTE(review): in this copy of the file the returned text references
    # neither the ring geometry nor `color`; presumably the SVG markup that
    # consumed them was lost — confirm against the deployed version.
    radius = 50
    circumference = 2 * math.pi * radius
    offset = circumference - (percentage / 100) * circumference
    return f"""
{percentage}%
Confidence
"""


def probability_bar(label, percentage, color, icon):
    """Return one probability row; *percentage* is shown with one decimal."""
    return f"""
{icon} {label} {percentage:.1f}%
"""


EQUALIZER_HTML = """
"""


# ==========================================
# 6. GRADIO HANDLERS
# ==========================================
def detect_audio(audio_file):
    """Analyze an uploaded file and build the UI payload.

    Returns a 5-tuple ``(plot_path | None, report_html, score_text,
    accent_color, equalizer_html)``. Never raises: failures are rendered as
    an error report. The temporary normalized WAV is always deleted.
    """
    if audio_file is None:
        return (
            None,
            '\n❌ No audio file provided\n',
            "Waiting...",
            "#64748b",
            EQUALIZER_HTML + '\nUpload audio to begin forensic analysis\n',
        )

    audio_path = None
    try:
        (verdict, level, confidence, ai_score, human_score,
         feats, audio_path, color, icon, _glow) = analyze(audio_file)
        plot_path = generate_audio_plots(audio_path)

        ai_pct = ai_score * 100
        human_pct = human_score * 100

        status_emoji = "🟢" if ai_pct < 40 else "🟡" if ai_pct < 60 else "🔴"
        status_text = "LIKELY REAL" if ai_pct < 40 else "SUSPICIOUS" if ai_pct < 60 else "HIGH RISK"

        circle = confidence_circle(confidence, color)
        ai_bar = probability_bar("AI / Synthetic", ai_pct, "#dc2626", "🤖")
        human_bar = probability_bar("Human / Real", human_pct, "#059669", "🧑")

        result_html = f"""
{icon}
Final Verdict
{verdict}
{circle}
Risk Level
{level}
Confidence
{confidence}%
Status
{status_emoji} {status_text}
📊 Probability Breakdown
{ai_bar} {human_bar}
🔬 DSP Forensic Signatures
MFCC Variance
{feats['mfcc_var']:.4f}
Signal Energy
{feats['energy']:.6f}
Zero Crossing
{feats['zcr']:.4f}
Spectral Centroid
{feats['spectral_centroid']:.1f} Hz
Interpretation Guide:
● 0–40% Very likely genuine human voice  |  ● 40–60% Mixed signal, manual review advised  |  ● 60–100% Strong synthetic / AI indicators detected
"""
        return plot_path, result_html, f"{ai_pct:.1f}%", color, ""

    except Exception as e:
        err_msg = str(e)
        # The message can contain FFmpeg output, which includes the uploaded
        # file's name; escape it before it is placed into HTML.
        safe_msg = html.escape(err_msg)
        if "FFmpeg" in err_msg:
            err_html = (
                "\n❌ File Decode Error\n\n"
                f"{safe_msg}\n\n"
                "WhatsApp voice notes are often .opus or .ogg files disguised as .mp3. "
                "Try renaming the file to .ogg or exporting it differently.\n"
            )
        else:
            err_html = f"\n❌ Analysis Error: {safe_msg}\n"
        return None, err_html, "Error", "#dc2626", ""
    finally:
        # The normalized WAV is only needed for scoring and plotting.
        _remove_quietly(audio_path)


def _detect_audio_for_ui(audio_file):
    """Adapt detect_audio's 5-tuple to the four components the UI updates.

    The accent colour has no component of its own; sending it to the score
    textbox (as the duplicated output used to) overwrote the score.
    """
    plot_path, report_html, score_text, _color, waves_html = detect_audio(audio_file)
    return plot_path, report_html, score_text, waves_html


# ==========================================
# 7. GRADIO UI — ENHANCED AUDIO-ONLY
# ==========================================
# Adjacent literals keep the stylesheet text unchanged while keeping each
# rule on its own source line.
CUSTOM_CSS = (
    " @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap'); "
    ".gradio-container { max-width: 1250px !important; margin: auto !important; font-family: 'Inter', sans-serif !important; background: #e8f0fe !important; } "
    "body { background: #e8f0fe !important; } "
    "/* Upload zones */ "
    ".upload-container { background: #ffffff !important; border: 2px dashed #93c5fd !important; border-radius: 16px !important; transition: all 0.3s ease !important; } "
    ".upload-container:hover { border-color: #2563eb !important; background: #eff6ff !important; box-shadow: 0 0 30px rgba(37, 99, 235, 0.12) !important; } "
    "/* Buttons */ "
    "button.primary { background: linear-gradient(135deg, #2563eb 0%, #7c3aed 100%) !important; border: none !important; border-radius: 12px !important; font-weight: 700 !important; letter-spacing: 0.5px !important; padding: 14px 32px !important; box-shadow: 0 4px 24px rgba(37, 99, 235, 0.25) !important; transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; } "
    "button.primary:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 32px rgba(37, 99, 235, 0.4) !important; } "
    "/* Equalizer Animation */ "
    "@keyframes eq-bounce { 0%, 100% { transform: scaleY(0.25); opacity: 0.5; } 50% { transform: scaleY(1); opacity: 1; } } "
    "/* Scrollbar */ "
    "::-webkit-scrollbar { width: 8px; } "
    "::-webkit-scrollbar-track { background: #dbeafe; } "
    "::-webkit-scrollbar-thumb { background: #93c5fd; border-radius: 4px; } "
    "::-webkit-scrollbar-thumb:hover { background: #2563eb; } "
    "/* Format badges */ "
    ".format-badge { display: inline-block; background: #eff6ff; border: 1px solid #bfdbfe; color: #2563eb; padding: 4px 12px; border-radius: 20px; font-size: 0.75em; font-weight: 600; letter-spacing: 0.5px; } "
    "/* Text inputs */ "
    "input, textarea { color: #1e293b !important; \n"
    "background: #ffffff !important; border: 1px solid #bfdbfe !important; } "
    "/* Audio player styling */ "
    "audio { border-radius: 12px !important; width: 100% !important; } "
    "/* Result cards */ "
    ".result-card { background: #ffffff; border-radius: 16px; padding: 24px; border: 1px solid #bfdbfe; box-shadow: 0 2px 8px rgba(0,0,0,0.04); } "
)


def build_ui():
    """Assemble and return the Gradio Blocks app (not yet launched)."""
    with gr.Blocks(
        title="DeepFake AI Forensics — Audio Detector",
        theme=gr.themes.Base(
            primary_hue="blue",
            neutral_hue="slate",
        ),
        css=CUSTOM_CSS,
    ) as demo:

        # Header
        gr.HTML("""

🔍 DeepFake AI Forensics

Neural + DSP ensemble detection for synthetic voice identification.
Audio-only analysis

""")

        with gr.Row():
            # ═══════════════════════════════════════════
            # LEFT COLUMN — Upload
            # ═══════════════════════════════════════════
            with gr.Column(scale=1, min_width=360):
                gr.Markdown("### 📤 Upload Audio File", elem_classes="section-title")

                gr.HTML("""
MP3 WAV M4A FLAC OGG OPUS AAC
📎 Maximum file size: 50 MB
💡 WhatsApp voice notes: If your file fails to upload, try renaming it from .mp3 to .ogg or .opus before uploading.
""")

                audio_input = gr.Audio(
                    label="",
                    type="filepath",
                    elem_classes="upload-container"
                )

                audio_waves = gr.HTML(value=EQUALIZER_HTML + '\nAudio waveform ready for analysis\n')

                audio_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")

                audio_score_text = gr.Textbox(
                    label="",
                    value="--%",
                    interactive=False
                )

            # ═══════════════════════════════════════════
            # RIGHT COLUMN — Results
            # ═══════════════════════════════════════════
            with gr.Column(scale=2):
                gr.Markdown("### 📊 Forensic Analysis Report", elem_classes="section-title")

                audio_plot = gr.Image(
                    label="",
                    show_label=False,
                    elem_classes="result-image"
                )

                audio_result = gr.HTML(
                    value="""
📊
Results will appear here
Upload an audio file and click analyze to begin
"""
                )

        # One output component per returned value; each component is listed
        # once so the score textbox receives only the score.
        audio_btn.click(
            fn=_detect_audio_for_ui,
            inputs=[audio_input],
            outputs=[audio_plot, audio_result, audio_score_text, audio_waves]
        )

        # ═══════════════════════════════════════════
        # HOW IT WORKS SECTION
        # ═══════════════════════════════════════════
        gr.HTML("""

🧠 Detection Pipeline

🧠

Transformer Classifier

Hemgg/Deepfake-audio-detection Wav2Vec 2.0 base model running on GPU/CPU with HuggingFace Transformers.

📊

DSP Ensemble

MFCC variance + signal energy + zero-crossing rate + spectral centroid fused with neural output (80/20 weighting).

🔧

Universal Decoder

FFmpeg extracts and normalizes audio from any format — including WhatsApp voice notes with disguised extensions.

📈 Score Interpretation

0–40% 🟢 Very likely genuine / human-created
40–60% 🟡 Uncertain / mixed signal — manual review recommended
60–100% 🔴 Strong AI-generated / synthetic voice indicators
⚠️ Important Limitations

No automated detector is 100% accurate. Adversarial AI models may evade detection. Compressed or noisy audio reduces reliability. Always use human expert judgment for critical decisions.
""")

        gr.HTML("""
Neural Audio Forensics • Powered by HuggingFace Transformers & DSP Signal Processing
""")

    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False,
        # Enforce the 50 MB limit that the upload panel advertises.
        max_file_size="50mb",
    )