Spaces:
Running
Running
| # app.py — DeepFake AI Forensics (Audio-Only, Fixed Label Mapping) | |
| import os | |
| import subprocess | |
| import tempfile | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| import torch | |
| import numpy as np | |
| import librosa | |
| import librosa.display | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import gradio as gr | |
| from transformers import AutoFeatureExtractor, AutoModelForAudioClassification | |
| # ========================================== | |
| # 1. MODEL LOADING | |
| # ========================================== | |
# Hugging Face Hub checkpoint that supplies both the feature extractor and
# the audio classifier.
MODEL_NAME = "Hemgg/Deepfake-audio-detection"

print("[+] Loading AI forensic model...")

# Decide the target device first so the classifier can be moved right after
# it is instantiated.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()  # inference only

print(f"[+] Model loaded on {device}")
| # ========================================== | |
| # 2. AUDIO PREPROCESSING (ROBUST) | |
| # ========================================== | |
def normalize_audio(file_path):
    """
    Convert any audio file to a standard 16 kHz mono 16-bit PCM WAV via FFmpeg.

    Intended to cope with WhatsApp voice notes (Opus/OGG disguised as MP3),
    corrupt headers, and exotic codecs by letting FFmpeg sniff the real format.

    Args:
        file_path: Path of the input media file.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        RuntimeError: If the ffmpeg executable cannot be started, or if it
            exits with a non-zero status. No temporary file is left behind
            in either case.
    """
    # mkstemp creates the file atomically with a unique name; tempfile.mktemp
    # only *returns* a name and is deprecated because another process can
    # claim that name before it is used.
    fd, out = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg reopens the path itself; "-y" overwrites the placeholder

    cmd = [
        "ffmpeg", "-y",
        "-i", file_path,
        "-vn",                                    # no video
        "-acodec", "pcm_s16le",                   # 16-bit PCM
        "-ar", "16000",                           # 16 kHz
        "-ac", "1",                               # mono
        "-af", "loudnorm=I=-16:TP=-1.5:LRA=11",   # normalize levels
        out,
    ]
    try:
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError as exc:
            # Typically the ffmpeg binary is missing or not executable.
            raise RuntimeError(
                f"Could not run ffmpeg (is it installed and on PATH?): {exc}"
            ) from exc
        if result.returncode != 0:
            err = result.stderr.decode('utf-8', errors='ignore')[:300]
            raise RuntimeError(f"FFmpeg could not decode this file. It may be corrupted or use an unsupported codec.\nDetails: {err}")
    except BaseException:
        # On any failure the placeholder / partial output is useless: remove
        # it so failed uploads do not accumulate in the temp directory.
        try:
            os.remove(out)
        except OSError:
            pass
        raise
    return out
def convert_to_audio(file_path):
    """
    Normalize an uploaded file to a WAV, based on its extension.

    Files with a known audio extension are passed straight to
    ``normalize_audio`` and any error from it propagates unchanged. Files
    with any other extension are still attempted, but a failure is reported
    as an "unsupported format" error instead.

    Args:
        file_path: Path of the uploaded file.

    Returns:
        Whatever ``normalize_audio`` returns for this file.

    Raises:
        ValueError: If the extension is not recognised and the decode attempt
            fails. The underlying error is attached as ``__cause__``.
    """
    ext = os.path.splitext(file_path)[1].lower().lstrip('.')
    # Supported audio formats (including WhatsApp Opus)
    audio_exts = ("wav", "mp3", "flac", "m4a", "ogg", "opus", "aac", "wma", "oga")
    if ext in audio_exts:
        print(f"[+] Audio detected ({ext}) β normalizing via ffmpeg...")
        return normalize_audio(file_path)

    # Unknown extension? Try ffmpeg anyway as last resort
    print(f"[+] Unknown format ({ext}) β attempting ffmpeg decode...")
    try:
        return normalize_audio(file_path)
    except Exception as exc:
        # Chain explicitly so the decoder's own diagnostics stay reachable
        # from the user-facing error.
        raise ValueError(
            f"Unsupported file format: {ext}. Please upload MP3, WAV, M4A, OGG, OPUS, or FLAC."
        ) from exc
def load_audio(path):
    """
    Read an audio file at 16 kHz and normalize it.

    Args:
        path: Path of the audio file to read.

    Returns:
        The waveform returned by ``librosa.load`` (resampled to 16 kHz)
        after being passed through ``librosa.util.normalize`` with its
        default settings.
    """
    waveform = librosa.load(path, sr=16000)[0]  # the returned sample rate is not needed
    return librosa.util.normalize(waveform)
| # ========================================== | |
| # 3. INFERENCE & DSP | |
| # ========================================== | |
def predict(audio):
    """
    Run the classifier on a 16 kHz waveform.

    Args:
        audio: Waveform samples, handed to the feature extractor with
            ``sampling_rate=16000``.

    Returns:
        ``(ai_p, human_p)``: softmax probabilities of class index 0 and
        class index 1 for the first (only) item in the batch.
    """
    batch = extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    batch = batch.to(device)
    with torch.no_grad():
        output = model(**batch)
        class_probs = torch.softmax(output.logits, dim=-1)[0]
    # Per the original author's note, the checkpoint's config.json maps
    # id2label as 0="AIVoice", 1="HumanVoice".
    return float(class_probs[0]), float(class_probs[1])
def audio_features(audio):
    """
    Compute four scalar DSP descriptors of a 16 kHz waveform.

    Args:
        audio: Waveform samples (the sample rate is taken to be 16 kHz).

    Returns:
        Dict with float values under the keys ``mfcc_var`` (mean over the 13
        MFCC coefficients of each coefficient's variance), ``energy`` (mean
        squared sample), ``zcr`` (mean zero-crossing rate) and
        ``spectral_centroid`` (mean spectral centroid).
    """
    sample_rate = 16000
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    per_coefficient_var = np.var(mfcc, axis=1)
    crossings = librosa.feature.zero_crossing_rate(audio)
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sample_rate)

    features = {}
    features["mfcc_var"] = float(np.mean(per_coefficient_var))
    features["energy"] = float(np.mean(audio ** 2))
    features["zcr"] = float(np.mean(crossings))
    features["spectral_centroid"] = float(np.mean(centroid))
    return features
def analyze(file_path):
    """
    Run the full pipeline on one uploaded file and derive a verdict.

    The file is converted to WAV, loaded, scored by the neural classifier
    and summarised by DSP features. The final AI score is a clipped blend
    of 80% neural AI probability and 20% DSP score.

    Args:
        file_path: Path of the uploaded file.

    Returns:
        Tuple ``(verdict, level, confidence, ai_score, human_score, feats,
        audio_path, color, icon, glow)`` where ``confidence`` is an integer
        percentage, ``human_score`` is ``1 - ai_score``, ``feats`` is the
        DSP feature dict and ``audio_path`` is the converted WAV path.
    """
    audio_path = convert_to_audio(file_path)
    audio = load_audio(audio_path)
    ai_p, _human_p = predict(audio)  # only the AI probability feeds the blend
    feats = audio_features(audio)

    # DSP score: weighted sum of three terms, clamped to [0, 1].
    # NOTE(review): the original comment said lower MFCC variance can
    # indicate AI, yet this term grows with variance — confirm the intent.
    mfcc_term = (feats["mfcc_var"] / 800.0) * 0.5
    zcr_term = (1.0 - min(feats["zcr"] * 5, 1.0)) * 0.3
    energy_term = (feats["energy"] * 1.5) * 0.2
    dsp_score = min(1.0, max(0.0, mfcc_term + zcr_term + energy_term))

    # Weighted fusion: the neural model dominates, DSP is a soft modifier.
    ai_score = np.clip((ai_p * 0.80 + dsp_score * 0.20), 0.0, 1.0)
    human_score = 1.0 - ai_score

    # (exclusive upper bound on ai_score, presentation for that band)
    bands = (
        (0.40, ("HUMAN VOICE", "LOW RISK", "#059669", "π§", "rgba(5,150,105,0.18)")),
        (0.60, ("UNCERTAIN / MIXED", "MEDIUM RISK", "#d97706", "β οΈ", "rgba(217,119,6,0.18)")),
    )
    top_band = ("AI / SYNTHETIC VOICE", "HIGH RISK", "#dc2626", "π€", "rgba(220,38,38,0.18)")
    verdict, level, color, icon, glow = next(
        (style for limit, style in bands if ai_score < limit),
        top_band,
    )

    confidence = int(max(ai_score, human_score) * 100)
    return verdict, level, confidence, ai_score, human_score, feats, audio_path, color, icon, glow
| # ========================================== | |
| # 4. VISUALIZATION | |
| # ========================================== | |
def generate_audio_plots(audio_path):
    """
    Render a waveform and a mel spectrogram of the first 10 s of an audio file.

    Args:
        audio_path: Path of the audio file to plot.

    Returns:
        Path of a newly created PNG file in the system temp directory. Each
        call writes to its own file, so concurrent requests cannot overwrite
        each other's plot.
    """
    y, sr = librosa.load(audio_path, sr=16000, duration=10)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 7))
    plot_path = None
    try:
        fig.patch.set_facecolor('#f0f5ff')

        # Top panel: waveform
        ax1.set_facecolor('#ffffff')
        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#2563eb', alpha=0.85)
        ax1.set_title('Waveform Analysis', color='#1e293b', fontsize=13, fontweight='bold', pad=12)
        ax1.tick_params(colors='#64748b')
        for spine in ax1.spines.values():
            spine.set_color('#cbd5e1')

        # Bottom panel: mel spectrogram in dB relative to the peak
        ax2.set_facecolor('#ffffff')
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='viridis')
        cbar = fig.colorbar(img, ax=ax2, format='%+2.0f dB')
        cbar.ax.yaxis.set_tick_params(color='#64748b')
        plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#64748b')
        ax2.set_title('Mel Spectrogram', color='#1e293b', fontsize=13, fontweight='bold', pad=12)
        ax2.tick_params(colors='#64748b')
        ax2.yaxis.label.set_color('#64748b')
        ax2.xaxis.label.set_color('#64748b')
        for spine in ax2.spines.values():
            spine.set_color('#cbd5e1')

        fig.tight_layout()

        # A unique file per call replaces the old fixed '/tmp/audio_analysis.png',
        # which was shared by every request and assumed a POSIX /tmp.
        fd, plot_path = tempfile.mkstemp(prefix="audio_analysis_", suffix=".png")
        os.close(fd)
        fig.savefig(plot_path, facecolor='#f0f5ff', bbox_inches='tight', dpi=150)
    except BaseException:
        # Do not leave an empty/partial PNG behind if rendering failed.
        if plot_path is not None:
            try:
                os.remove(plot_path)
            except OSError:
                pass
        raise
    finally:
        # Close this specific figure (not "the current one") even on error,
        # so figures do not pile up in a long-running server.
        plt.close(fig)
    return plot_path
| # ========================================== | |
| # 5. HTML BUILDERS | |
| # ========================================== | |
def confidence_circle(percentage, color):
    """
    Build the HTML/SVG snippet for the circular confidence gauge.

    Args:
        percentage: Value shown in the centre and used to size the arc
            (treated as a percentage of the full ring).
        color: CSS colour used for the arc, the glow and the number.

    Returns:
        HTML string containing the gauge.
    """
    ring_radius = 50
    ring_length = 2 * 3.14159 * ring_radius
    # Portion of the ring left undrawn, expressed as a dash offset.
    hidden_length = ring_length - (percentage / 100) * ring_length

    markup = [
        '<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; margin: 10px 0;">',
        f'<div style="position: relative; width: 140px; height: 140px; filter: drop-shadow(0 0 12px {color}30);">',
        '<svg width="140" height="140" viewBox="0 0 120 120" style="transform: rotate(-90deg);">',
        f'<circle cx="60" cy="60" r="{ring_radius}" stroke="#dbeafe" stroke-width="10" fill="none"/>',
        f'<circle cx="60" cy="60" r="{ring_radius}" stroke="{color}" stroke-width="10" fill="none"',
        'stroke-linecap="round"',
        f'stroke-dasharray="{ring_length}"',
        f'stroke-dashoffset="{hidden_length}"',
        'style="transition: stroke-dashoffset 1.2s ease-out;"/>',
        '</svg>',
        '<div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); text-align: center;">',
        f'<div style="font-size: 2em; font-weight: 800; color: {color}; line-height: 1;">{percentage}%</div>',
        '<div style="font-size: 0.65em; color: #64748b; text-transform: uppercase; letter-spacing: 1px;">Confidence</div>',
        '</div>',
        '</div>',
        '</div>',
    ]
    return "\n" + "\n".join(markup) + "\n"
def probability_bar(label, percentage, color, icon):
    """
    Build the HTML for one labelled horizontal probability bar.

    Args:
        label: Text shown to the left of the value.
        percentage: Bar fill in percent; shown with one decimal place.
        color: CSS colour of the value text and the bar fill.
        icon: Short prefix (e.g. an emoji) placed before the label.

    Returns:
        HTML string for the bar.
    """
    header_row = (
        '<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 6px;">\n'
        f'<span style="font-weight: 600; color: #1e293b; font-size: 0.95em;">{icon} {label}</span>\n'
        f'<span style="font-weight: 700; color: {color}; font-size: 1em;">{percentage:.1f}%</span>\n'
        '</div>\n'
    )
    track_row = (
        '<div style="width: 100%; height: 10px; background: #e2e8f0; border-radius: 5px; overflow: hidden;">\n'
        f'<div style="width: {percentage}%; height: 100%; background: linear-gradient(90deg, {color}, {color}aa); border-radius: 5px; transition: width 1s ease-out;"></div>\n'
        '</div>\n'
    )
    return '\n<div style="margin-bottom: 14px;">\n' + header_row + track_row + '</div>\n'
# (height, animation duration, animation delay) for each equalizer bar,
# left to right.
_EQ_BARS = (
    ("40%", "0.8s", "0s"),
    ("70%", "0.9s", "0.1s"),
    ("50%", "0.7s", "0.2s"),
    ("80%", "1.0s", "0.15s"),
    ("60%", "0.85s", "0.05s"),
    ("90%", "0.75s", "0.25s"),
    ("45%", "0.95s", "0.3s"),
    ("65%", "0.8s", "0.12s"),
)

# Decorative animated equalizer; relies on the `eq-bounce` keyframes from the
# app's custom CSS.
EQUALIZER_HTML = (
    '\n<div style="display: flex; align-items: flex-end; justify-content: center; height: 50px; gap: 5px; margin: 16px 0;">\n'
    + "".join(
        f'<div class="eq-bar" style="width: 6px; height: {bar_height}; background: linear-gradient(to top, #4f46e5, #2563eb); border-radius: 3px; animation: eq-bounce {duration} infinite ease-in-out {delay};"></div>\n'
        for bar_height, duration, delay in _EQ_BARS
    )
    + '</div>\n'
)
| # ========================================== | |
| # 6. GRADIO HANDLERS | |
| # ========================================== | |
def detect_audio(audio_file):
    """
    Gradio handler: analyse an uploaded audio file and build the report.

    Args:
        audio_file: Path of the uploaded file, or ``None`` if nothing was
            uploaded.

    Returns:
        5-tuple ``(plot_path, result_html, score_text, color, waves_html)``.
        ``plot_path`` is ``None`` when there is no input or the analysis
        failed; in the failure case ``result_html`` carries the error message.
    """
    # Local import keeps this function self-contained.
    from html import escape

    if audio_file is None:
        return (
            None,
            '<div style="text-align:center;color:#dc2626;padding:30px;">β No audio file provided</div>',
            "Waiting...",
            "#64748b",
            EQUALIZER_HTML + '<div style="text-align:center;color:#64748b;font-size:0.9em;">Upload audio to begin forensic analysis</div>'
        )

    audio_path = None  # set once analyze() has produced the converted WAV
    try:
        (verdict, level, confidence, ai_score, human_score,
         feats, audio_path, color, icon, glow) = analyze(audio_file)
        plot_path = generate_audio_plots(audio_path)

        ai_pct = ai_score * 100
        human_pct = human_score * 100

        # Same 40 / 60 bands that analyze() uses for the verdict.
        if ai_pct < 40:
            status_emoji, status_text = "π’", "LIKELY REAL"
        elif ai_pct < 60:
            status_emoji, status_text = "π‘", "SUSPICIOUS"
        else:
            status_emoji, status_text = "π΄", "HIGH RISK"

        circle = confidence_circle(confidence, color)
        ai_bar = probability_bar("AI / Synthetic", ai_pct, "#dc2626", "π€")
        human_bar = probability_bar("Human / Real", human_pct, "#059669", "π§")

        result_html = f"""
        <div style="background: #ffffff;
        border: 1px solid {color}35; border-radius: 20px; padding: 28px;
        box-shadow: 0 4px 24px {glow}, 0 1px 3px rgba(0,0,0,0.08);">
        <div style="display: flex; align-items: center; gap: 20px; margin-bottom: 24px; flex-wrap: wrap;">
        <div style="font-size: 3em; line-height: 1;">{icon}</div>
        <div style="flex: 1; min-width: 200px;">
        <div style="font-size: 0.8em; color: #64748b; text-transform: uppercase; letter-spacing: 2px; margin-bottom: 4px;">Final Verdict</div>
        <div style="font-size: 1.5em; font-weight: 800; color: {color}; letter-spacing: -0.5px;">{verdict}</div>
        </div>
        <div style="min-width: 140px;">
        {circle}
        </div>
        </div>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 12px; margin-bottom: 24px;">
        <div style="background: #f8fafc; border-radius: 12px; padding: 16px; border-left: 3px solid {color};">
        <div style="font-size: 0.7em; color: #64748b; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px;">Risk Level</div>
        <div style="font-size: 1.2em; font-weight: 700; color: {color};">{level}</div>
        </div>
        <div style="background: #f8fafc; border-radius: 12px; padding: 16px; border-left: 3px solid #2563eb;">
        <div style="font-size: 0.7em; color: #64748b; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px;">Confidence</div>
        <div style="font-size: 1.2em; font-weight: 700; color: #2563eb;">{confidence}%</div>
        </div>
        <div style="background: #f8fafc; border-radius: 12px; padding: 16px; border-left: 3px solid #d97706;">
        <div style="font-size: 0.7em; color: #64748b; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px;">Status</div>
        <div style="font-size: 1em; font-weight: 600; color: #d97706;">{status_emoji} {status_text}</div>
        </div>
        </div>
        <div style="background: #f8fafc; border-radius: 12px; padding: 20px; margin-bottom: 20px;">
        <div style="font-size: 0.75em; color: #64748b; text-transform: uppercase; letter-spacing: 1.5px; margin-bottom: 14px;">π Probability Breakdown</div>
        {ai_bar}
        {human_bar}
        </div>
        <div style="background: #f8fafc; border-radius: 12px; padding: 18px; margin-bottom: 20px;">
        <div style="font-size: 0.75em; color: #64748b; text-transform: uppercase; letter-spacing: 1.5px; margin-bottom: 12px;">π¬ DSP Forensic Signatures</div>
        <div style="display: flex; justify-content: space-around; font-family: 'SF Mono', monospace; font-size: 0.9em; flex-wrap: wrap; gap: 12px;">
        <div style="text-align: center;">
        <div style="color: #94a3b8; font-size: 0.8em;">MFCC Variance</div>
        <div style="color: #1e293b; font-weight: 600;">{feats['mfcc_var']:.4f}</div>
        </div>
        <div style="text-align: center;">
        <div style="color: #94a3b8; font-size: 0.8em;">Signal Energy</div>
        <div style="color: #1e293b; font-weight: 600;">{feats['energy']:.6f}</div>
        </div>
        <div style="text-align: center;">
        <div style="color: #94a3b8; font-size: 0.8em;">Zero Crossing</div>
        <div style="color: #1e293b; font-weight: 600;">{feats['zcr']:.4f}</div>
        </div>
        <div style="text-align: center;">
        <div style="color: #94a3b8; font-size: 0.8em;">Spectral Centroid</div>
        <div style="color: #1e293b; font-weight: 600;">{feats['spectral_centroid']:.1f} Hz</div>
        </div>
        </div>
        </div>
        <div style="font-size: 0.8em; color: #64748b; border-top: 1px solid #e2e8f0; padding-top: 14px; line-height: 1.6;">
        <strong style="color: #475569;">Interpretation Guide:</strong><br>
        <span style="color: #059669;">β 0β40%</span> Very likely genuine human voice |
        <span style="color: #d97706;">β 40β60%</span> Mixed signal, manual review advised |
        <span style="color: #dc2626;">β 60β100%</span> Strong synthetic / AI indicators detected
        </div>
        </div>
        """
        return plot_path, result_html, f"{ai_pct:.1f}%", color, ""
    except Exception as e:
        raw_msg = str(e)
        # The message can contain ffmpeg stderr, which echoes the uploaded
        # file's name: escape it before embedding it in HTML, and keep the
        # line break between the summary and the details visible.
        err_msg = escape(raw_msg).replace("\n", "<br>")
        if "FFmpeg" in raw_msg:
            err_html = f"<div style='color:#dc2626;padding:30px;'><strong>β File Decode Error</strong><br><br>{err_msg}<br><br><span style='color:#475569;font-size:0.9em;'>WhatsApp voice notes are often .opus or .ogg files disguised as .mp3. Try renaming the file to .ogg or exporting it differently.</span></div>"
        else:
            err_html = f"<div style='color:#dc2626;padding:30px;'>β Analysis Error: {err_msg}</div>"
        return None, err_html, "Error", "#dc2626", ""
    finally:
        # The converted WAV is only needed for inference and plotting;
        # delete it so temp files do not accumulate with every request.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
| # ========================================== | |
| # 7. GRADIO UI — ENHANCED AUDIO-ONLY | |
| # ========================================== | |
# Stylesheet handed to gr.Blocks(css=...). Besides theming the page it
# defines the `eq-bounce` keyframes used by the equalizer bars and the
# `.format-badge` / `.upload-container` classes referenced by the UI markup.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap');

.gradio-container {
    max-width: 1250px !important;
    margin: auto !important;
    font-family: 'Inter', sans-serif !important;
    background: #e8f0fe !important;
}
body { background: #e8f0fe !important; }

/* Upload zones */
.upload-container {
    background: #ffffff !important;
    border: 2px dashed #93c5fd !important;
    border-radius: 16px !important;
    transition: all 0.3s ease !important;
}
.upload-container:hover {
    border-color: #2563eb !important;
    background: #eff6ff !important;
    box-shadow: 0 0 30px rgba(37, 99, 235, 0.12) !important;
}

/* Buttons */
button.primary {
    background: linear-gradient(135deg, #2563eb 0%, #7c3aed 100%) !important;
    border: none !important;
    border-radius: 12px !important;
    font-weight: 700 !important;
    letter-spacing: 0.5px !important;
    padding: 14px 32px !important;
    box-shadow: 0 4px 24px rgba(37, 99, 235, 0.25) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 32px rgba(37, 99, 235, 0.4) !important;
}

/* Equalizer Animation */
@keyframes eq-bounce {
    0%, 100% { transform: scaleY(0.25); opacity: 0.5; }
    50% { transform: scaleY(1); opacity: 1; }
}

/* Scrollbar */
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: #dbeafe; }
::-webkit-scrollbar-thumb { background: #93c5fd; border-radius: 4px; }
::-webkit-scrollbar-thumb:hover { background: #2563eb; }

/* Format badges */
.format-badge {
    display: inline-block;
    background: #eff6ff;
    border: 1px solid #bfdbfe;
    color: #2563eb;
    padding: 4px 12px;
    border-radius: 20px;
    font-size: 0.75em;
    font-weight: 600;
    letter-spacing: 0.5px;
}

/* Text inputs */
input, textarea {
    color: #1e293b !important;
    background: #ffffff !important;
    border: 1px solid #bfdbfe !important;
}

/* Audio player styling */
audio {
    border-radius: 12px !important;
    width: 100% !important;
}

/* Result cards */
.result-card {
    background: #ffffff;
    border-radius: 16px;
    padding: 24px;
    border: 1px solid #bfdbfe;
    box-shadow: 0 2px 8px rgba(0,0,0,0.04);
}
"""
def build_ui():
    """
    Assemble the Gradio Blocks interface.

    Layout: a header, a two-column row (upload controls on the left, plot
    and HTML report on the right), a static "Detection Pipeline" explainer
    and a footer. The analyze button is wired to ``detect_audio``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(
        title="DeepFake AI Forensics β Audio Detector",
        theme=gr.themes.Base(
            primary_hue="blue",
            neutral_hue="slate",
        ),
        css=CUSTOM_CSS,
    ) as demo:
        # Header
        gr.HTML("""
        <div style="text-align: center; padding: 40px 20px 10px 20px;">
        <div style="display: inline-block; position: relative;">
        <div style="position: absolute; top: -30px; left: 50%; transform: translateX(-50%); width: 280px; height: 280px;
        background: radial-gradient(circle, rgba(37,99,235,0.12) 0%, transparent 70%); border-radius: 50%; pointer-events: none;"></div>
        <h1 style="font-size: 2.8em; font-weight: 800; margin: 0;
        background: linear-gradient(135deg, #1e40af 0%, #2563eb 40%, #7c3aed 100%);
        -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;
        letter-spacing: -1.5px; position: relative;">
        π DeepFake AI Forensics
        </h1>
        </div>
        <p style="font-size: 1.05em; color: #475569; margin-top: 14px; max-width: 560px; margin-left: auto; margin-right: auto; line-height: 1.6;">
        Neural + DSP ensemble detection for synthetic voice identification.
        <br><span style="color: #2563eb; font-weight: 600;">Audio-only analysis</span>
        </p>
        </div>
        """)
        with gr.Row():
            # ---------------- LEFT COLUMN: Upload ----------------
            with gr.Column(scale=1, min_width=360):
                gr.Markdown("### π€ Upload Audio File", elem_classes="section-title")
                gr.HTML("""
                <div style="margin-bottom: 12px; display: flex; flex-wrap: wrap; gap: 6px;">
                <span class="format-badge">MP3</span>
                <span class="format-badge">WAV</span>
                <span class="format-badge">M4A</span>
                <span class="format-badge">FLAC</span>
                <span class="format-badge">OGG</span>
                <span class="format-badge">OPUS</span>
                <span class="format-badge">AAC</span>
                </div>
                <div style="font-size: 0.8em; color: #475569; margin-bottom: 16px; display: flex; align-items: center; gap: 6px;">
                <span style="font-size: 1.2em;">π</span>
                <span>Maximum file size: <strong style="color: #1e40af;">50 MB</strong></span>
                </div>
                <div style="font-size: 0.75em; color: #94a3b8; background: #eff6ff; border-radius: 8px; padding: 10px 12px; margin-bottom: 12px; line-height: 1.5;">
                π‘ <strong>WhatsApp voice notes:</strong> If your file fails to upload, try renaming it from <code>.mp3</code> to <code>.ogg</code> or <code>.opus</code> before uploading.
                </div>
                """)
                audio_input = gr.Audio(
                    label="",
                    type="filepath",
                    elem_classes="upload-container"
                )
                audio_waves = gr.HTML(value=EQUALIZER_HTML + '<div style="text-align:center;color:#64748b;font-size:0.85em;">Audio waveform ready for analysis</div>')
                audio_btn = gr.Button("π Analyze Audio", variant="primary", size="lg")
                audio_score_text = gr.Textbox(
                    label="",
                    value="--%",
                    interactive=False
                )
            # ---------------- RIGHT COLUMN: Results ----------------
            with gr.Column(scale=2):
                gr.Markdown("### π Forensic Analysis Report", elem_classes="section-title")
                audio_plot = gr.Image(
                    label="",
                    show_label=False,
                    elem_classes="result-image"
                )
                audio_result = gr.HTML(
                    value="""
                    <div style="background: #ffffff; border: 2px dashed #bfdbfe; border-radius: 20px; padding: 50px 30px; text-align: center; margin-top: 8px;">
                    <div style="font-size: 3em; margin-bottom: 16px;">π</div>
                    <div style="color: #64748b; font-size: 1.1em; font-weight: 600;">Results will appear here</div>
                    <div style="color: #94a3b8; font-size: 0.9em; margin-top: 8px;">Upload an audio file and click analyze to begin</div>
                    </div>
                    """
                )

        def _run_detection(audio_file):
            # detect_audio returns (plot, report, score, color, waves). The
            # colour has no component of its own; it used to be routed to the
            # score textbox as a second output, replacing the percentage
            # with a hex code. Drop it so each output is written once.
            plot, report_html, score_text, _color, waves_html = detect_audio(audio_file)
            return plot, report_html, score_text, waves_html

        audio_btn.click(
            fn=_run_detection,
            inputs=[audio_input],
            outputs=[audio_plot, audio_result, audio_score_text, audio_waves]
        )
        # ---------------- HOW IT WORKS SECTION ----------------
        gr.HTML("""
        <div style="max-width: 900px; margin: 40px auto 0 auto; padding: 20px 0 40px 0;">
        <h2 style="color: #1e293b; font-size: 1.7em; margin-bottom: 28px; text-align: center; font-weight: 700;">π§ Detection Pipeline</h2>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); gap: 18px; margin-bottom: 36px;">
        <div style="background: #ffffff; border: 1px solid #bfdbfe; border-radius: 16px; padding: 24px; box-shadow: 0 2px 8px rgba(0,0,0,0.04);">
        <div style="font-size: 2em; margin-bottom: 10px;">π§ </div>
        <h3 style="color: #1e40af; margin: 0 0 6px 0; font-size: 1.1em;">Transformer Classifier</h3>
        <p style="color: #475569; font-size: 0.9em; line-height: 1.5; margin: 0;">
        <code style="background: #eff6ff; padding: 2px 6px; border-radius: 4px; color: #2563eb;">Hemgg/Deepfake-audio-detection</code>
        Wav2Vec 2.0 base model running on GPU/CPU with HuggingFace Transformers.
        </p>
        </div>
        <div style="background: #ffffff; border: 1px solid #bfdbfe; border-radius: 16px; padding: 24px; box-shadow: 0 2px 8px rgba(0,0,0,0.04);">
        <div style="font-size: 2em; margin-bottom: 10px;">π</div>
        <h3 style="color: #2563eb; margin: 0 0 6px 0; font-size: 1.1em;">DSP Ensemble</h3>
        <p style="color: #475569; font-size: 0.9em; line-height: 1.5; margin: 0;">
        MFCC variance + signal energy + zero-crossing rate + spectral centroid fused with neural output (80/20 weighting).
        </p>
        </div>
        <div style="background: #ffffff; border: 1px solid #bfdbfe; border-radius: 16px; padding: 24px; box-shadow: 0 2px 8px rgba(0,0,0,0.04);">
        <div style="font-size: 2em; margin-bottom: 10px;">π§</div>
        <h3 style="color: #7c3aed; margin: 0 0 6px 0; font-size: 1.1em;">Universal Decoder</h3>
        <p style="color: #475569; font-size: 0.9em; line-height: 1.5; margin: 0;">
        FFmpeg extracts and normalizes audio from any format β including WhatsApp voice notes with disguised extensions.
        </p>
        </div>
        </div>
        <h3 style="color: #1e293b; margin-bottom: 16px; font-size: 1.2em;">π Score Interpretation</h3>
        <div style="background: #ffffff; border-radius: 14px; padding: 20px; border: 1px solid #bfdbfe; margin-bottom: 28px; box-shadow: 0 2px 8px rgba(0,0,0,0.04);">
        <div style="display: flex; align-items: center; margin-bottom: 10px; padding: 10px 14px; background: #f0fdf4; border-radius: 10px; border-left: 4px solid #059669;">
        <span style="color: #059669; font-weight: 700; min-width: 70px; font-size: 0.95em;">0β40%</span>
        <span style="color: #475569; margin-left: 12px; font-size: 0.9em;">π’ Very likely genuine / human-created</span>
        </div>
        <div style="display: flex; align-items: center; margin-bottom: 10px; padding: 10px 14px; background: #fffbeb; border-radius: 10px; border-left: 4px solid #d97706;">
        <span style="color: #d97706; font-weight: 700; min-width: 70px; font-size: 0.95em;">40β60%</span>
        <span style="color: #475569; margin-left: 12px; font-size: 0.9em;">π‘ Uncertain / mixed signal β manual review recommended</span>
        </div>
        <div style="display: flex; align-items: center; padding: 10px 14px; background: #fef2f2; border-radius: 10px; border-left: 4px solid #dc2626;">
        <span style="color: #dc2626; font-weight: 700; min-width: 70px; font-size: 0.95em;">60β100%</span>
        <span style="color: #475569; margin-left: 12px; font-size: 0.9em;">π΄ Strong AI-generated / synthetic voice indicators</span>
        </div>
        </div>
        <div style="background: #fff7ed; border: 1px solid #fed7aa; border-radius: 14px; padding: 20px; color: #9a3412; font-size: 0.88em; line-height: 1.6;">
        <strong style="color: #c2410c;">β οΈ Important Limitations</strong><br><br>
        No automated detector is 100% accurate. Adversarial AI models may evade detection.
        Compressed or noisy audio reduces reliability. Always use human expert judgment for critical decisions.
        </div>
        </div>
        """)
        gr.HTML("""
        <div style="text-align: center; padding: 30px 20px; color: #94a3b8; font-size: 0.82em; border-top: 1px solid #bfdbfe; margin-top: 10px;">
        Neural Audio Forensics β’ Powered by HuggingFace Transformers & DSP Signal Processing
        </div>
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces,
    # port 7860, without a public share link and with SSR disabled.
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False,
        # The upload panel advertises a 50 MB maximum; enforce it so larger
        # uploads are rejected instead of being accepted and decoded.
        max_file_size="50mb",
    )