"""
app.py — Multimodal Deepfake Detection — Gradio Frontend
=========================================================
Supports:
  • Audio-only detection  (upload WAV / FLAC / MP3)
  • Video-only detection  (upload MP4 — runs GenConViT in-process via importlib)
  • Multimodal fusion     (upload video with audio track)

Hosting:
  HuggingFace Spaces (recommended) — set HF_SPACE=1 to auto-detect.
  Local: python app.py

Model weights are downloaded at startup from HuggingFace Hub.
"""
|
|
import html
import json
import os
import subprocess
import sys
import tempfile
import textwrap
import time
from pathlib import Path
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
def _patch_gradio_client():
    """Make gradio_client's JSON-schema converter tolerate non-dict schemas.

    Replaces ``gradio_client.utils._json_schema_to_python_type`` with a
    wrapper that answers ``"any"`` whenever the schema is not a dict and
    otherwise defers to the original converter.

    The patch is best-effort: if gradio_client cannot be imported, or the
    private attribute is missing, the function returns without changing
    anything and without raising.
    """
    try:
        from gradio_client import utils as client_utils

        stock_converter = client_utils._json_schema_to_python_type

        def tolerant_converter(schema, defs=None):
            # NOTE(review): presumably guards against bare-boolean schemas
            # reaching the stock converter; confirm against the gradio_client
            # version in use.
            return stock_converter(schema, defs) if isinstance(schema, dict) else "any"

        client_utils._json_schema_to_python_type = tolerant_converter
    except Exception:
        # Deliberately silent: the app still starts when the patch cannot be applied.
        return None
|
|
# Apply the schema-converter patch first, then import gradio.
# NOTE(review): the call is placed ahead of the gradio import, presumably on
# purpose; confirm before reordering these two statements.
_patch_gradio_client()


import gradio as gr
|
|
|
|
| |
| |
| |
# Directory layout, derived from the location of this file:
#   <PROJ_DIR>/<app dir>/app.py
#   <PROJ_DIR>/audio_detection, <PROJ_DIR>/fusion, <PROJ_DIR>/video_detection
APP_DIR = Path(__file__).parent.resolve()
PROJ_DIR = APP_DIR.parent
AUDIO_DIR = PROJ_DIR.joinpath("audio_detection")
FUSION_DIR = PROJ_DIR.joinpath("fusion")
VIDEO_DIR = PROJ_DIR.joinpath("video_detection")
GENCONVIT_WEIGHT_DIR = VIDEO_DIR.joinpath("GenConViT", "weight")

# Put the fusion and audio packages at the front of the import path, fusion
# first, so `import fusion` / `import inference` resolve to them.
sys.path[:0] = [str(FUSION_DIR), str(AUDIO_DIR)]
|
|
|
|
| |
| |
| |
# Weight file name -> download URL for the two GenConViT checkpoints
# (ED and VAE variants), in the order they are fetched.
_GENCONVIT_WEIGHTS = {
    weight_file: f"https://huggingface.co/Deressa/GenConViT/resolve/main/{weight_file}"
    for weight_file in (
        "genconvit_ed_inference.pth",
        "genconvit_vae_inference.pth",
    )
}
|
|
def _ensure_genconvit_weights(weight_dir=None, weights=None):
    """Download GenConViT pretrained weights if they are not already present.

    Args:
        weight_dir: Directory that should contain the weight files. Defaults
            to ``GENCONVIT_WEIGHT_DIR``. Created if missing.
        weights: Mapping of file name -> download URL. Defaults to
            ``_GENCONVIT_WEIGHTS``.

    Each file is fetched to ``<name>.part`` and renamed into place only after
    the transfer finished, so an interrupted download can never be mistaken
    for a valid checkpoint on the next start. A zero-byte file at the final
    path is treated as missing for the same reason.

    Failures are reported on stdout and never raised: the app is expected to
    start even when the video weights are unavailable.
    """
    import urllib.request

    target_dir = Path(GENCONVIT_WEIGHT_DIR if weight_dir is None else weight_dir)
    sources = _GENCONVIT_WEIGHTS if weights is None else weights

    target_dir.mkdir(parents=True, exist_ok=True)
    for fname, url in sources.items():
        dest = target_dir / fname
        if dest.exists() and dest.stat().st_size > 0:
            print(f"[Video] Weight already present: {fname}")
            continue

        # Log lines are ASCII-only so printing cannot fail on narrow consoles.
        print(f"[Video] Downloading {fname} from HuggingFace ...")
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(url, str(partial))
            partial.replace(dest)
            size_mb = dest.stat().st_size // 1_048_576
            print(f"[Video] OK: {fname} downloaded ({size_mb} MB)")
        except Exception as exc:
            # Best effort: drop the incomplete file and keep starting up.
            partial.unlink(missing_ok=True)
            print(f"[Video] FAILED to download {fname}: {exc}")
|
|
# Runs at import time: any missing GenConViT weight file is downloaded before
# the rest of the module (and the UI) is set up.
_ensure_genconvit_weights()
|
|
|
|
| |
| |
| |
# Lazily created singletons. Both start empty and are filled on first use by
# get_audio_detector() and get_fusion() respectively.
_audio_detector = None
_fusion_module = None
|
|
|
|
def get_audio_detector():
    """Return the shared AudioDeepfakeDetector, constructing it on first call.

    The detector is created on "cuda" when `_cuda_available()` reports a GPU
    and on "cpu" otherwise, then kept in the module-level `_audio_detector`
    so later calls reuse the same instance.
    """
    global _audio_detector
    if _audio_detector is not None:
        return _audio_detector

    # Imported lazily so the module can be loaded without the audio stack.
    from inference import AudioDeepfakeDetector

    target_device = "cuda" if _cuda_available() else "cpu"
    _audio_detector = AudioDeepfakeDetector(device=target_device)
    return _audio_detector
|
|
|
|
def get_fusion():
    """Return the shared MultimodalFusion instance, building it on first call.

    The instance uses the "weighted_average" strategy with alpha=0.5 and is
    kept in the module-level `_fusion_module` for reuse.
    """
    global _fusion_module
    if _fusion_module is not None:
        return _fusion_module

    # Imported lazily, like the audio detector.
    from fusion import MultimodalFusion

    _fusion_module = MultimodalFusion(strategy="weighted_average", alpha=0.5)
    return _fusion_module
|
|
|
|
def _cuda_available():
    """Report whether torch is importable and sees a CUDA device.

    Returns False when torch is not installed instead of raising.
    """
    try:
        from torch import cuda

        has_gpu = cuda.is_available()
    except ImportError:
        has_gpu = False
    return has_gpu
|
|
|
|
| |
| |
| |
|
|
def _score_bar(score_fake: float, width: int = 20) -> str:
    """Build a text-based confidence bar for display.

    Args:
        score_fake: Probability that the sample is fake, 0..1 (1 = 100% fake).
            Values outside that range are clamped so the bar always has
            exactly ``width`` cells.
        width: Number of cells in the bar (default 20).

    Returns:
        A string such as ``"[#####.....] 50.0% Fake"`` where the filled and
        empty cells are the FULL BLOCK and LIGHT SHADE characters.
    """
    # Written as escapes so the glyphs survive any re-encoding of this file.
    filled_cell = "\u2588"
    empty_cell = "\u2591"

    clamped = min(max(float(score_fake), 0.0), 1.0)
    filled = round(clamped * width)
    bar = filled_cell * filled + empty_cell * (width - filled)
    return f"[{bar}] {clamped * 100:.1f}% Fake"
|
|
|
|
def run_audio_inference(audio_path: str):
    """Run the audio detector on a local audio file.

    Args:
        audio_path: Path to the audio file, or None.

    Returns:
        Whatever the detector's ``predict`` returns, or None when
        ``audio_path`` is None.

    The detector is obtained and invoked with ``AUDIO_DIR`` as the working
    directory; afterwards the working directory is set to ``PROJ_DIR`` even if
    loading or prediction raises. ``os.chdir`` is process-wide, so concurrent
    calls can observe each other's working directory.
    """
    if audio_path is None:
        return None

    # Resolve first: a relative path would point somewhere else after chdir.
    audio_path = os.path.abspath(audio_path)

    os.chdir(str(AUDIO_DIR))
    try:
        return get_audio_detector().predict(audio_path)
    finally:
        os.chdir(str(PROJ_DIR))
|
|
|
|
# Cached video detector; stays None until the first successful load.
_video_detector = None


def _get_video_detector():
    """Load video_detection/inference.py by file path and build the detector once."""
    global _video_detector
    if _video_detector is None:
        import importlib.util

        # Loaded under its own module name to avoid the sys.modules collision
        # with audio_detection/inference.py.
        spec = importlib.util.spec_from_file_location(
            "video_inference",
            str(PROJ_DIR / "video_detection" / "inference.py"),
        )
        video_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(video_mod)

        # Assigned only after construction succeeded, so a failed load is
        # retried on the next request instead of being cached.
        _video_detector = video_mod.VideoDeepfakeDetector(
            ed_weight="genconvit_ed_inference",
            vae_weight="genconvit_vae_inference",
            num_frames=15,
            fp16=False,
        )
    return _video_detector


def run_video_inference_via_subprocess(video_path: str):
    """Run GenConViT video inference on a local video file.

    Despite the name (kept for existing callers) nothing is spawned: the
    detector module is loaded in-process via importlib, built on the first
    call and reused afterwards so the weights are not reloaded per request.

    Args:
        video_path: Path to the video file.

    Returns:
        The detector's ``predict`` result. If loading or prediction raises,
        a dict with ``"error"`` set to the exception text,
        ``"label": "Unavailable"``, the probability/score fields set to None
        and ``"faces_detected": False``.
    """
    try:
        return _get_video_detector().predict(video_path)
    except Exception as exc:
        return {
            "error": str(exc),
            "label": "Unavailable",
            "prob_fake": None,
            "prob_real": None,
            "normalized_score": None,
            "faces_detected": False,
        }
|
|
|
|
|
|
| def extract_audio_from_video(video_path: str) -> str | None: |
| """Extract 16kHz mono WAV from a video file using ffmpeg.""" |
| out_path = os.path.join(tempfile.gettempdir(), "deepfake_extracted.wav") |
| try: |
| subprocess.run( |
| [ |
| "ffmpeg", "-y", "-i", video_path, |
| "-vn", "-acodec", "pcm_s16le", |
| "-ar", "16000", "-ac", "1", |
| out_path, |
| ], |
| capture_output=True, check=True, |
| ) |
| return out_path |
| except Exception: |
| return None |
|
|
|
|
| |
| |
| |
|
|
def predict_audio(audio_file):
    """Handle an audio submission from the UI.

    Args:
        audio_file: Path of the uploaded/recorded audio, or None.

    Returns:
        A 3-tuple ``(summary_html, raw_json_text, button_update)`` where the
        last item is a ``gr.update`` toggling visibility of the
        "show raw output" button. Errors are reported in the first item
        rather than raised.
    """
    if audio_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        result = run_audio_inference(audio_file)
        elapsed = time.time() - t0

        if result is None:
            return "Model not loaded.", "", gr.update(visible=False)

        label = result.get("label", "Unknown")
        real_score = result.get("real_score", 0.5)
        fake_score = 1 - real_score
        conf = result.get("confidence", abs(real_score - 0.5) * 2)

        is_fake = label.upper() == "FAKE"
        status = "fake" if is_fake else "real"
        verdict = "FAKE AUDIO DETECTED" if is_fake else "GENUINE AUDIO"
        colour = "#ff4b4b" if is_fake else "#21c354"
        # Keep the CSS width inside 0-100% even if the score is slightly off.
        bar_width = min(max(fake_score, 0.0), 1.0) * 100

        summary_html = f"""
        <div class="glass-panel status-{status}">
            <div class="result-title" style="color: {colour};">{verdict}</div>
            <div class="score-container">
                <span class="score-label">Confidence Metrics</span>
                <span class="score-value" style="color: {colour};">{conf:.1%}</span>
            </div>
            <div class="score-container">
                <span class="score-label">P(Fake) vs P(Real)</span>
                <span class="score-value">{fake_score:.4f} | {real_score:.4f}</span>
            </div>
            <div class="progress-track">
                <div class="progress-fill" style="width: {bar_width:.1f}%;"></div>
            </div>
            <div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
                Execution Time: {elapsed:.2f}s
            </div>
        </div>
        """
        # Debug view only: stringify anything JSON cannot encode natively so a
        # successful inference is not reported as an error.
        details = json.dumps(result, indent=2, default=str)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        # Exception text can contain paths or markup; escape before embedding.
        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)
|
|
|
|
def predict_video(video_file):
    """Handle a video submission from the UI.

    Args:
        video_file: Path of the uploaded video, or None.

    Returns:
        A 3-tuple ``(summary_html, raw_json_text, button_update)`` where the
        last item is a ``gr.update`` toggling visibility of the
        "show raw output" button. When the video model reports an error, the
        summary explains that and the raw result is still returned.
    """
    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        result = run_video_inference_via_subprocess(video_file)
        elapsed = time.time() - t0

        if result.get("error"):
            # The error text comes from an exception; escape before embedding.
            return (
                "<b style='color:#ff4b4b'>Video model unavailable locally.</b>"
                f"<br><small>{html.escape(str(result['error']))}</small>"
                "<br><br>To evaluate videos, run the notebook on Colab/Kaggle.",
                json.dumps(result, indent=2, default=str),
                gr.update(visible=True),
            )

        label = result.get("label", "Unknown")
        prob_fake = result.get("prob_fake", 0.5)
        prob_real = result.get("prob_real", 0.5)
        faces = result.get("faces_detected", False)

        is_fake = label.upper() == "FAKE"
        status = "fake" if is_fake else "real"
        verdict = "DEEPFAKE VIDEO DETECTED" if is_fake else "GENUINE VIDEO"
        colour = "#ff4b4b" if is_fake else "#21c354"
        # Keep the CSS width inside 0-100% even if the score is slightly off.
        bar_width = min(max(prob_fake, 0.0), 1.0) * 100

        summary_html = f"""
        <div class="glass-panel status-{status}">
            <div class="result-title" style="color: {colour};">{verdict}</div>
            <div class="score-container">
                <span class="score-label">Facial Extraction Target</span>
                <span class="score-value">{"Face Found" if faces else "No Face"}</span>
            </div>
            <div class="score-container">
                <span class="score-label">P(Fake) vs P(Real)</span>
                <span class="score-value">{prob_fake:.4f} | {prob_real:.4f}</span>
            </div>
            <div class="progress-track">
                <div class="progress-fill" style="width: {bar_width:.1f}%;"></div>
            </div>
            <div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
                Execution Time: {elapsed:.2f}s
            </div>
        </div>
        """
        # Debug view only: stringify anything JSON cannot encode natively.
        details = json.dumps(result, indent=2, default=str)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)
|
|
|
|
def predict_multimodal(video_file):
    """Fuse audio + video scores from a single video file.

    The audio track is extracted with ffmpeg and scored by the audio
    detector (skipped when extraction fails), the frames are scored by the
    video detector, and both results are combined by
    ``MultimodalFusion.from_detector_results`` using the weighted-average
    strategy with alpha=0.5.

    Args:
        video_file: Path of the uploaded video, or None.

    Returns:
        A 3-tuple ``(summary_html, raw_json_text, button_update)`` where the
        last item is a ``gr.update`` toggling visibility of the
        "show raw output" button. Errors are reported in the first item
        rather than raised.
    """
    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)

    def format_score(value):
        """Render a per-modality score with 4 decimals, or "N/A" if absent."""
        try:
            return f"{float(value):.4f}"
        except (TypeError, ValueError):
            return "N/A"

    audio_path = None
    try:
        t0 = time.time()

        audio_path = extract_audio_from_video(video_file)

        audio_result = run_audio_inference(audio_path) if audio_path else None
        video_result = run_video_inference_via_subprocess(video_file)

        from fusion import MultimodalFusion
        _, fused = MultimodalFusion.from_detector_results(
            audio_result, video_result, strategy="weighted_average", alpha=0.5
        )

        elapsed = time.time() - t0
        label = fused["label"]
        score = fused["fused_score"]
        mods = ", ".join(fused["modalities_used"]) or "none"

        # The summary shows 1 - fused_score as the "fake" percentage.
        fake_score = 1 - score
        is_fake = label.upper() == "FAKE"
        status = "fake" if is_fake else "real"
        verdict = "DEEPFAKE DETECTED" if is_fake else "GENUINE MEDIA"
        colour = "#ff4b4b" if is_fake else "#21c354"
        # Keep the CSS width inside 0-100% even if the score is slightly off.
        bar_width = min(max(fake_score, 0.0), 1.0) * 100

        summary_html = f"""
        <div class="glass-panel status-{status}">
            <div class="result-title" style="color: {colour};">{verdict}</div>
            <div class="score-container">
                <span class="score-label">Multimodal Fused Score (Fake)</span>
                <span class="score-value" style="color: {colour};">{fake_score * 100:.2f}%</span>
            </div>
            <div class="progress-track">
                <div class="progress-fill" style="width: {bar_width:.1f}%;"></div>
            </div>

            <div style="display: flex; gap: 16px; margin-top: 16px;">
                <div class="score-container" style="flex: 1;">
                    <span class="score-label">Audio P(Real)</span>
                    <span class="score-value">{format_score(fused.get('audio_score'))}</span>
                </div>
                <div class="score-container" style="flex: 1;">
                    <span class="score-label">Video P(Real)</span>
                    <span class="score-value">{format_score(fused.get('video_score'))}</span>
                </div>
            </div>

            <div style="text-align: right; margin-top: 16px; font-size: 0.8rem; color: #64748b;">
                Active Streams: {mods} | Inference Time: {elapsed:.2f}s
            </div>
        </div>
        """
        # Debug view only: stringify anything JSON cannot encode natively.
        details = json.dumps(fused, indent=2, default=str)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        # Exception text can contain paths or markup; escape before embedding.
        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)
    finally:
        # The extracted WAV is only needed for this request.
        if audio_path:
            try:
                os.remove(audio_path)
            except OSError:
                pass
|
|
|
|
| |
| |
| |
|
|
# Colour/shape overrides applied on top of the Soft base theme. Light and dark
# variants are given the same values.
_THEME_OVERRIDES = {
    "body_background_fill": "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
    "body_background_fill_dark": "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
    "block_background_fill": "rgba(30, 41, 59, 0.7)",
    "block_background_fill_dark": "rgba(30, 41, 59, 0.7)",
    "block_border_color": "rgba(148, 163, 184, 0.2)",
    "block_border_width": "1px",
    "block_label_text_color": "#cbd5e1",
    "block_shadow": "0 8px 32px 0 rgba(0, 0, 0, 0.3)",
    "button_primary_background_fill": "linear-gradient(90deg, #8b5cf6 0%, #3b82f6 100%)",
    "button_primary_background_fill_hover": "linear-gradient(90deg, #7c3aed 0%, #2563eb 100%)",
    "button_primary_text_color": "#ffffff",
    "input_background_fill": "rgba(15, 23, 42, 0.6)",
    "input_border_color": "rgba(99, 102, 241, 0.3)",
    "panel_background_fill": "rgba(30, 41, 59, 0.4)",
}

# Gradio theme used by build_ui().
THEME = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Outfit"), "sans-serif"],
).set(**_THEME_OVERRIDES)
|
|
# Markdown blurb describing the app.
# NOTE(review): nothing in the visible source reads DESCRIPTION (build_ui()
# renders its own hero header) - confirm whether it is still needed.
DESCRIPTION = textwrap.dedent("""
    ## Multimodal Deepfake Detection System
    **Nes2Net (audio)** | **GenConViT (video)** | **Late fusion**

    > Upload an audio clip, a video, or a video with audio to detect deepfakes.
""")
|
|
|
|
# Stylesheet passed to gr.Blocks(css=...) in build_ui(). It defines the classes
# used by the result HTML built in the predict_* handlers (glass-panel,
# status-fake/status-real, result-title, score-*, progress-*), the hero header
# classes and the pulsing "fusion-btn" button style.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&display=swap');

/* Force overriding Gradio's internal container instead of just 'body' */
.gradio-container {
    background-color: #0b0f19 !important;
    background-image: radial-gradient(at 0% 0%, rgba(17, 24, 39, 1) 0, transparent 50%), radial-gradient(at 100% 0%, rgba(30, 27, 75, 1) 0, transparent 50%), radial-gradient(at 50% 100%, rgba(15, 23, 42, 1) 0, transparent 50%) !important;
    background-attachment: fixed !important;
    color: #f8fafc !important;
    font-family: 'Outfit', sans-serif !important;
}

/* Force standard Gradio wrappers to be slightly transparent to see background */
.wrap, .panel, .gap, .form {
    background-color: rgba(15, 23, 42, 0.4) !important;
    border-color: rgba(255, 255, 255, 0.05) !important;
}

.hero-header {
    text-align: center;
    padding: 30px 10px;
    margin-bottom: 30px;
    background: rgba(15, 23, 42, 0.4) !important;
    border-radius: 16px;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    backdrop-filter: blur(20px);
    box-shadow: 0 10px 40px rgba(0,0,0,0.5);
}
.hero-title {
    font-size: 3rem;
    font-weight: 800;
    text-transform: uppercase;
    letter-spacing: 2px;
    background: linear-gradient(to right, #8b5cf6, #3b82f6, #06b6d4) !important;
    -webkit-background-clip: text !important;
    background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    margin-bottom: 10px;
}
.hero-subtitle {
    font-size: 1.1rem;
    color: #94a3b8 !important;
    font-weight: 300;
}

.glass-panel {
    background: linear-gradient(145deg, rgba(30, 41, 59, 0.6) 0%, rgba(15, 23, 42, 0.8) 100%);
    backdrop-filter: blur(24px);
    border: 1px solid rgba(148, 163, 184, 0.1);
    border-radius: 20px;
    padding: 32px;
    box-shadow: 0 15px 35px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.05);
    margin-bottom: 24px;
    transition: transform 0.3s ease, box-shadow 0.3s ease;
}
.glass-panel:hover {
    transform: translateY(-5px);
    box-shadow: 0 20px 40px rgba(0,0,0,0.6), inset 0 1px 0 rgba(255,255,255,0.1);
}

.status-fake { border-top: 4px solid #f43f5e; box-shadow: 0 10px 40px rgba(244, 63, 94, 0.15); }
.status-real { border-top: 4px solid #10b981; box-shadow: 0 10px 40px rgba(16, 185, 129, 0.15); }
.result-title { font-size: 2.2rem; font-weight: 800; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 24px; text-align: center; text-shadow: 0 4px 10px rgba(0,0,0,0.4); }

.score-container { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; background: rgba(0, 0, 0, 0.3); border-radius: 12px; margin-top: 16px; border: 1px solid rgba(255,255,255,0.03); }
.score-label { font-size: 1rem; color: #94a3b8; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; }
.score-value { font-size: 1.5rem; font-weight: 800; color: #f8fafc; }

@keyframes fillout { from { width: 0; opacity: 0; } to { opacity: 1; } }
.progress-track { width: 100%; height: 14px; background: rgba(0, 0, 0, 0.5); border-radius: 7px; overflow: hidden; margin-top: 12px; box-shadow: inset 0 2px 4px rgba(0,0,0,0.5); }
.progress-fill { height: 100%; border-radius: 7px; animation: fillout 1.2s cubic-bezier(0.16, 1, 0.3, 1) forwards; position: relative; }
.status-fake .progress-fill { background: linear-gradient(90deg, #be123c, #f43f5e); }
.status-real .progress-fill { background: linear-gradient(90deg, #047857, #10b981); }

.gradio-container .prose * { padding: 0 !important; }

.fusion-btn {
    background: linear-gradient(90deg, #8b5cf6, #3b82f6) !important;
    border: none !important;
    box-shadow: 0 0 15px rgba(139, 92, 246, 0.5) !important;
    animation: pulseGlow 2s infinite;
}
@keyframes pulseGlow {
    0% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
    50% { box-shadow: 0 0 30px rgba(139, 92, 246, 0.9); }
    100% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
}
"""
|
|
def build_ui():
    """Assemble the Gradio Blocks app and return it without launching.

    The UI has a hero header, three tabs (audio, video, multimodal fusion)
    and a footer. Each tab has an input column with an "analyse" button and
    a result column; the button calls the matching ``predict_*`` handler,
    which fills the result HTML, the raw-JSON textbox and reveals the
    "Show raw output" button. Clicking that button shows the textbox.

    Returns:
        The ``gr.Blocks`` instance.
    """

    def result_panel():
        """Create the result widgets shared by every tab.

        Returns [summary HTML, hidden raw-JSON textbox, hidden reveal button],
        in the order the predict_* handlers return their values.
        """
        summary = gr.HTML(label="Result")
        raw_json = gr.Textbox(
            label="Raw JSON output",
            lines=10,
            visible=False,
            interactive=False,
        )
        reveal_btn = gr.Button("Show raw output", size="sm", visible=False)
        reveal_btn.click(lambda: gr.update(visible=True), outputs=raw_json)
        return [summary, raw_json, reveal_btn]

    with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="Multimodal Deepfake Detection") as demo:

        gr.HTML("""
        <div class="hero-header">
            <div class="hero-title">Multimodal Deepfake Detection</div>
            <div class="hero-subtitle">Industry-grade neural security layer fusing Nes2Net acoustics and GenConViT optics.</div>
        </div>
        """)

        with gr.Tabs():

            with gr.TabItem("Audio Detection"):
                gr.Markdown("### Upload a speech sample to detect AI-synthesised audio.")
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="Upload Audio",
                            type="filepath",
                            sources=["upload", "microphone"],
                        )
                        audio_btn = gr.Button("Analyse Audio", variant="primary")
                    with gr.Column(scale=2):
                        audio_outputs = result_panel()

                audio_btn.click(
                    predict_audio,
                    inputs=audio_input,
                    outputs=audio_outputs,
                )

            with gr.TabItem("Video Detection"):
                gr.Markdown(
                    "### Upload a video to detect face manipulation.\n"
                    "> **Note:** Full GPU inference requires Colab/Kaggle. "
                    "The local model may report 'Unavailable'."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.Video(label="Upload Video")
                        video_btn = gr.Button("Analyse Video", variant="primary")
                    with gr.Column(scale=2):
                        video_outputs = result_panel()

                video_btn.click(
                    predict_video,
                    inputs=video_input,
                    outputs=video_outputs,
                )

            with gr.TabItem("Multimodal Fusion"):
                gr.Markdown(
                    "### Upload a video with audio to get a fused verdict.\n"
                    "Both the audio track and video frames will be analysed "
                    "and combined via weighted-average score fusion."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        mm_input = gr.Video(label="Upload Video (with audio)")
                        # \U0001F680 is the rocket emoji, written as an escape
                        # so it survives re-encoding of this file.
                        mm_btn = gr.Button(
                            "Launch Deep Multimodal Fusion \U0001F680",
                            variant="primary",
                            elem_classes=["fusion-btn"],
                        )
                    with gr.Column(scale=2):
                        mm_outputs = result_panel()

                mm_btn.click(
                    predict_multimodal,
                    inputs=mm_input,
                    outputs=mm_outputs,
                )

        # \u2014 = em dash, \u03b1 = Greek alpha (escaped for the same reason).
        gr.Markdown("""
        ---
        **Model Architecture:**
        Audio \u2014 Wav2Vec 2.0 (XLSR-300M) + Nes2Net (ASVspoof 2021 DF checkpoint)
        Video \u2014 GenConViT (ED + VAE ensemble, GenConViT weights)
        Fusion \u2014 Weighted-average late fusion (\u03b1 = 0.5)

        **B.Tech Project** \u2014 Multimodal Deepfake Detection
        """)

    return demo
|
|
|
|
| |
| |
| |
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on every interface at
    # port 7860, without a public share link and without the API page.
    launch_kwargs = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        "show_api": False,
    }
    demo = build_ui()
    demo.launch(**launch_kwargs)
|
|