""" app.py — Multimodal Deepfake Detection — Gradio Frontend ========================================================= Supports: • Audio-only detection (upload WAV / FLAC / MP3) • Video-only detection (upload MP4 — runs GenConViT via subprocess) • Multimodal fusion (upload video with audio track) Hosting: HuggingFace Spaces (recommended) — set HF_SPACE=1 to auto-detect. Local: python app.py Model weights are downloaded at startup from HuggingFace Hub. """ import os import sys import json import time import tempfile import subprocess import textwrap from pathlib import Path # --------------------------------------------------------------------------- # CRITICAL: Monkey-patch gradio_client BEFORE importing gradio. # Root cause: gradio_client/utils.py:_json_schema_to_python_type() receives # a bool (False) as `schema` when processing Video/Audio component schemas # that contain `"additionalProperties": false`. The function then does # `if "const" in schema` which crashes because booleans are not iterable. # This bug lives in: gradio_client<=0.9.1 (fixed in 0.10.0 / gradio>=5.0). # Since we target gradio 4.44.x for Python 3.10, we patch it in-process. # --------------------------------------------------------------------------- def _patch_gradio_client(): try: import gradio_client.utils as _gc _orig = _gc._json_schema_to_python_type def _safe(schema, defs=None): # Guard: if schema is not a dict (e.g. bool from additionalProperties:false) # return a safe fallback type string instead of crashing. if not isinstance(schema, dict): return "any" return _orig(schema, defs) _gc._json_schema_to_python_type = _safe except Exception: pass # If gradio_client is not yet installed or already patched, skip. 
_patch_gradio_client()

import html

import gradio as gr

# ---------------------------------------------------------------------------
# Project paths
# ---------------------------------------------------------------------------
APP_DIR = Path(__file__).parent.resolve()
PROJ_DIR = APP_DIR.parent
AUDIO_DIR = PROJ_DIR / "audio_detection"
FUSION_DIR = PROJ_DIR / "fusion"
VIDEO_DIR = PROJ_DIR / "video_detection"
GENCONVIT_WEIGHT_DIR = VIDEO_DIR / "GenConViT" / "weight"

# Make `inference` (audio) and `fusion` importable by bare module name.
sys.path.insert(0, str(AUDIO_DIR))
sys.path.insert(0, str(FUSION_DIR))

# ---------------------------------------------------------------------------
# GenConViT weight bootstrap — download at startup if missing
# ---------------------------------------------------------------------------
_GENCONVIT_WEIGHTS = {
    "genconvit_ed_inference.pth": "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_ed_inference.pth",
    "genconvit_vae_inference.pth": "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_vae_inference.pth",
}


def _remove_quietly(path):
    """Delete the file at *path*, ignoring any OS-level failure to do so."""
    try:
        os.remove(path)
    except OSError:
        pass


def _ensure_genconvit_weights():
    """Download GenConViT pretrained weights if they are not already present.

    Each file in ``_GENCONVIT_WEIGHTS`` is fetched into
    ``GENCONVIT_WEIGHT_DIR``. The download goes to a ``.part`` file first and
    is renamed into place only once complete, so an interrupted download is
    never mistaken for a usable weight file on the next start. A failed
    download is reported on stdout and does not raise.
    """
    import urllib.request

    GENCONVIT_WEIGHT_DIR.mkdir(parents=True, exist_ok=True)
    for fname, url in _GENCONVIT_WEIGHTS.items():
        dest = GENCONVIT_WEIGHT_DIR / fname
        if dest.exists():
            print(f"[Video] Weight already present: {fname}")
            continue

        print(f"[Video] Downloading {fname} from HuggingFace …")
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(url, str(partial))
            partial.replace(dest)
            print(f"[Video] ✓ {fname} downloaded ({dest.stat().st_size // 1_048_576} MB)")
        except Exception as exc:
            # Best-effort: the app still starts; video inference reports
            # the problem later through its own error path.
            print(f"[Video] ✗ Failed to download {fname}: {exc}")
            _remove_quietly(str(partial))


_ensure_genconvit_weights()

# ---------------------------------------------------------------------------
# Lazy model loading
# ---------------------------------------------------------------------------
_audio_detector = None
_fusion_module = None


def get_audio_detector():
    """Return the process-wide audio detector, creating it on first use."""
    global _audio_detector
    if _audio_detector is None:
        from inference import AudioDeepfakeDetector

        _audio_detector = AudioDeepfakeDetector(
            device="cuda" if _cuda_available() else "cpu"
        )
    return _audio_detector


def get_fusion():
    """Return the process-wide fusion module, creating it on first use."""
    global _fusion_module
    if _fusion_module is None:
        from fusion import MultimodalFusion

        _fusion_module = MultimodalFusion(strategy="weighted_average", alpha=0.5)
    return _fusion_module


def _cuda_available():
    """Return True when torch is importable and reports a CUDA device."""
    try:
        import torch

        return torch.cuda.is_available()
    except ImportError:
        return False


# ---------------------------------------------------------------------------
# Inference helpers
# ---------------------------------------------------------------------------
def _score_bar(score_fake: float) -> str:
    """Build a 20-cell text confidence bar for display.

    Args:
        score_fake: Fake probability, nominally in 0..1 (1 = 100% fake).

    Returns:
        A string such as ``"[██████████░░░░░░░░░░] 50.0% Fake"``. The bar is
        clamped to 20 cells; the percentage shows the value as given.
    """
    clamped = min(max(score_fake, 0.0), 1.0)
    filled = int(round(clamped * 20))
    bar = "█" * filled + "░" * (20 - filled)
    return f"[{bar}] {score_fake*100:.1f}% Fake"


def run_audio_inference(audio_path: str):
    """Run the audio detector on a local audio file.

    The detector is invoked with ``AUDIO_DIR`` as the working directory; the
    working directory is reset to ``PROJ_DIR`` afterwards, including when
    loading the detector or the prediction raises.

    Args:
        audio_path: Path to the audio file, or None.

    Returns:
        Whatever ``det.predict`` returns, or None when *audio_path* is None.
    """
    if audio_path is None:
        return None
    os.chdir(str(AUDIO_DIR))
    try:
        return get_audio_detector().predict(audio_path)
    finally:
        os.chdir(str(PROJ_DIR))


def run_video_inference_via_subprocess(video_path: str):
    """Run GenConViT video inference.

    Despite the name, no subprocess is spawned: video_detection/inference.py
    is loaded directly via importlib under a unique module name, to avoid the
    sys.modules name collision with audio_detection/inference.py.

    Args:
        video_path: Path to the video file.

    Returns:
        The dict returned by ``VideoDeepfakeDetector.predict``; on any
        failure, a dict with an ``"error"`` message, label ``"Unavailable"``
        and the score fields set to None.
    """
    try:
        import importlib.util

        module_path = VIDEO_DIR / "inference.py"
        # Load the video inference module by absolute path under a unique
        # name. This bypasses sys.modules, where 'inference' is already
        # cached as the audio_detection version.
        spec = importlib.util.spec_from_file_location(
            "video_inference",   # unique module name
            str(module_path),    # absolute file path
        )
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load video inference module from {module_path}")
        video_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(video_mod)

        # NOTE(review): the module and detector are rebuilt on every call;
        # caching them like the audio detector may be worthwhile — confirm
        # the detector is safe to reuse first.
        det = video_mod.VideoDeepfakeDetector(
            ed_weight="genconvit_ed_inference",
            vae_weight="genconvit_vae_inference",
            num_frames=15,
            fp16=False,
        )
        return det.predict(video_path)
    except Exception as exc:
        return {
            "error": str(exc),
            "label": "Unavailable",
            "prob_fake": None,
            "prob_real": None,
            "normalized_score": None,
            "faces_detected": False,
        }


def extract_audio_from_video(video_path: str) -> str | None:
    """Extract 16kHz mono WAV from a video file using ffmpeg.

    The audio is written to a uniquely named temporary file so concurrent
    requests cannot overwrite each other's output. The caller owns the
    returned file and should delete it when done.

    Args:
        video_path: Path to the input video.

    Returns:
        Path of the extracted WAV file, or None if extraction failed (the
        temporary file is removed in that case).
    """
    out_path = None
    try:
        fd, out_path = tempfile.mkstemp(prefix="deepfake_extracted_", suffix=".wav")
        os.close(fd)
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1",
                out_path,
            ],
            capture_output=True,
            check=True,
        )
        return out_path
    except Exception:
        # Best-effort: a missing ffmpeg, an unreadable input or a video
        # without an audio track all simply mean "no audio available".
        if out_path is not None:
            _remove_quietly(out_path)
        return None


# ---------------------------------------------------------------------------
# Gradio callbacks
# ---------------------------------------------------------------------------
# Each callback returns a 3-tuple:
#   (summary shown in the gr.HTML result, raw JSON text, update for the
#    "Show raw output" button).
# Exception text is HTML-escaped because the summary is rendered as HTML.
def predict_audio(audio_file):
    """Called when user submits an audio file."""
    if audio_file is None:
        return "No file uploaded.", "", gr.update(visible=False)

    try:
        t0 = time.time()
        result = run_audio_inference(audio_file)
        elapsed = time.time() - t0

        if result is None:
            return "Model not loaded.", "", gr.update(visible=False)

        label = result.get("label", "Unknown")
        real_score = result.get("real_score", 0.5)
        fake_score = 1 - real_score
        # Fall back to the distance from the 0.5 decision point, scaled to 0..1.
        conf = result.get("confidence", abs(real_score - 0.5) * 2)

        verdict = "FAKE AUDIO DETECTED" if label.upper() == "FAKE" else "GENUINE AUDIO"

        summary_html = f"""
{verdict}
Confidence Metrics {conf:.1%}
P(Fake) vs P(Real) {fake_score:.4f}   |   {real_score:.4f}
Execution Time: {elapsed:.2f}s
"""
        details = json.dumps(result, indent=2)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)


def predict_video(video_file):
    """Called when user submits a video file."""
    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)

    try:
        t0 = time.time()
        result = run_video_inference_via_subprocess(video_file)
        elapsed = time.time() - t0

        if result.get("error"):
            error_text = html.escape(str(result["error"]))
            return (
                "Video model unavailable locally."
                f"<br>{error_text}"
                "<br><br>To evaluate videos, run the notebook on Colab/Kaggle.",
                json.dumps(result, indent=2),
                gr.update(visible=True),
            )

        label = result.get("label", "Unknown")
        prob_fake = result.get("prob_fake", 0.5)
        prob_real = result.get("prob_real", 0.5)
        faces = result.get("faces_detected", False)

        verdict = "DEEPFAKE VIDEO DETECTED" if label.upper() == "FAKE" else "GENUINE VIDEO"
        face_status = "Face Found" if faces else "No Face"

        summary_html = f"""
{verdict}
Facial Extraction Target {face_status}
P(Fake) vs P(Real) {prob_fake:.4f}   |   {prob_real:.4f}
Execution Time: {elapsed:.2f}s
"""
        return summary_html, json.dumps(result, indent=2), gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)


def predict_multimodal(video_file):
    """Fuse audio + video scores from a single video file."""
    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)

    try:
        t0 = time.time()

        # Extract audio; None means the audio modality is skipped.
        audio_path = extract_audio_from_video(video_file)

        # Run both modalities. The extracted WAV is a per-request temp file,
        # so it is removed as soon as audio inference is done with it.
        try:
            audio_result = run_audio_inference(audio_path) if audio_path else None
        finally:
            if audio_path:
                _remove_quietly(audio_path)
        video_result = run_video_inference_via_subprocess(video_file)

        # Fuse
        from fusion import MultimodalFusion

        _, fused = MultimodalFusion.from_detector_results(
            audio_result, video_result, strategy="weighted_average", alpha=0.5
        )
        elapsed = time.time() - t0

        label = fused["label"]
        score = fused["fused_score"]
        mods = ", ".join(fused["modalities_used"]) or "none"
        # The audio/video entries are displayed as P(Real); the fused score
        # is inverted the same way to show a fake percentage.
        fake_score = 1 - score

        verdict = "DEEPFAKE DETECTED" if label.upper() == "FAKE" else "GENUINE MEDIA"

        summary_html = f"""
{verdict}
Multimodal Fused Score (Fake) {fake_score * 100:.2f}%
Audio P(Real) {fused.get('audio_score', 'N/A')}
Video P(Real) {fused.get('video_score', 'N/A')}
Active Streams: {mods} | Inference Time: {elapsed:.2f}s
"""
        return summary_html, json.dumps(fused, indent=2), gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
THEME = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Outfit"), "sans-serif"],
).set(
    body_background_fill="linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
    body_background_fill_dark="linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
    block_background_fill="rgba(30, 41, 59, 0.7)",
    block_background_fill_dark="rgba(30, 41, 59, 0.7)",
    block_border_color="rgba(148, 163, 184, 0.2)",
    block_border_width="1px",
    block_label_text_color="#cbd5e1",
    block_shadow="0 8px 32px 0 rgba(0, 0, 0, 0.3)",
    button_primary_background_fill="linear-gradient(90deg, #8b5cf6 0%, #3b82f6 100%)",
    button_primary_background_fill_hover="linear-gradient(90deg, #7c3aed 0%, #2563eb 100%)",
    button_primary_text_color="#ffffff",
    input_background_fill="rgba(15, 23, 42, 0.6)",
    input_border_color="rgba(99, 102, 241, 0.3)",
    panel_background_fill="rgba(30, 41, 59, 0.4)",
)

DESCRIPTION = textwrap.dedent("""
    ## Multimodal Deepfake Detection System
    **Nes2Net (audio)** | **GenConViT (video)** | **Late fusion**

    > Upload an audio clip, a video, or a video with audio to detect deepfakes.
""")

# NOTE(review): several classes styled below (.hero-*, .glass-panel,
# .status-*, .result-title, .score-*, .progress-*) are not referenced by any
# template string in this file as it stands — confirm whether the result and
# banner templates were meant to carry that markup.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&display=swap');

/* Force overriding Gradio's internal container instead of just 'body' */
.gradio-container {
    background-color: #0b0f19 !important;
    background-image:
        radial-gradient(at 0% 0%, rgba(17, 24, 39, 1) 0, transparent 50%),
        radial-gradient(at 100% 0%, rgba(30, 27, 75, 1) 0, transparent 50%),
        radial-gradient(at 50% 100%, rgba(15, 23, 42, 1) 0, transparent 50%) !important;
    background-attachment: fixed !important;
    color: #f8fafc !important;
    font-family: 'Outfit', sans-serif !important;
}

/* Force standard Gradio wrappers to be slightly transparent to see background */
.wrap, .panel, .gap, .form {
    background-color: rgba(15, 23, 42, 0.4) !important;
    border-color: rgba(255, 255, 255, 0.05) !important;
}

.hero-header {
    text-align: center;
    padding: 30px 10px;
    margin-bottom: 30px;
    background: rgba(15, 23, 42, 0.4) !important;
    border-radius: 16px;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    backdrop-filter: blur(20px);
    box-shadow: 0 10px 40px rgba(0,0,0,0.5);
}

.hero-title {
    font-size: 3rem;
    font-weight: 800;
    text-transform: uppercase;
    letter-spacing: 2px;
    background: linear-gradient(to right, #8b5cf6, #3b82f6, #06b6d4) !important;
    -webkit-background-clip: text !important;
    background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    margin-bottom: 10px;
}

.hero-subtitle {
    font-size: 1.1rem;
    color: #94a3b8 !important;
    font-weight: 300;
}

.glass-panel {
    background: linear-gradient(145deg, rgba(30, 41, 59, 0.6) 0%, rgba(15, 23, 42, 0.8) 100%);
    backdrop-filter: blur(24px);
    border: 1px solid rgba(148, 163, 184, 0.1);
    border-radius: 20px;
    padding: 32px;
    box-shadow: 0 15px 35px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.05);
    margin-bottom: 24px;
    transition: transform 0.3s ease, box-shadow 0.3s ease;
}

.glass-panel:hover {
    transform: translateY(-5px);
    box-shadow: 0 20px 40px rgba(0,0,0,0.6), inset 0 1px 0 rgba(255,255,255,0.1);
}

.status-fake {
    border-top: 4px solid #f43f5e;
    box-shadow: 0 10px 40px rgba(244, 63, 94, 0.15);
}

.status-real {
    border-top: 4px solid #10b981;
    box-shadow: 0 10px 40px rgba(16, 185, 129, 0.15);
}

.result-title {
    font-size: 2.2rem;
    font-weight: 800;
    text-transform: uppercase;
    letter-spacing: 1px;
    margin-bottom: 24px;
    text-align: center;
    text-shadow: 0 4px 10px rgba(0,0,0,0.4);
}

.score-container {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 16px 20px;
    background: rgba(0, 0, 0, 0.3);
    border-radius: 12px;
    margin-top: 16px;
    border: 1px solid rgba(255,255,255,0.03);
}

.score-label {
    font-size: 1rem;
    color: #94a3b8;
    font-weight: 600;
    text-transform: uppercase;
    letter-spacing: 1px;
}

.score-value {
    font-size: 1.5rem;
    font-weight: 800;
    color: #f8fafc;
}

@keyframes fillout {
    from { width: 0; opacity: 0; }
    to { opacity: 1; }
}

.progress-track {
    width: 100%;
    height: 14px;
    background: rgba(0, 0, 0, 0.5);
    border-radius: 7px;
    overflow: hidden;
    margin-top: 12px;
    box-shadow: inset 0 2px 4px rgba(0,0,0,0.5);
}

.progress-fill {
    height: 100%;
    border-radius: 7px;
    animation: fillout 1.2s cubic-bezier(0.16, 1, 0.3, 1) forwards;
    position: relative;
}

.status-fake .progress-fill {
    background: linear-gradient(90deg, #be123c, #f43f5e);
}

.status-real .progress-fill {
    background: linear-gradient(90deg, #047857, #10b981);
}

.gradio-container .prose * {
    padding: 0 !important;
}

.fusion-btn {
    background: linear-gradient(90deg, #8b5cf6, #3b82f6) !important;
    border: none !important;
    box-shadow: 0 0 15px rgba(139, 92, 246, 0.5) !important;
    animation: pulseGlow 2s infinite;
}

@keyframes pulseGlow {
    0% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
    50% { box-shadow: 0 0 30px rgba(139, 92, 246, 0.9); }
    100% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
}
"""


def build_ui():
    """Assemble the three-tab Gradio interface and return the Blocks app.

    Each tab wires an "analyse" button to its ``predict_*`` callback, whose
    outputs fill the result HTML, the hidden raw-JSON textbox and the
    visibility of the "Show raw output" button. That button, when clicked,
    reveals the textbox.
    """
    # Injecting CSS strictly inline to bypass Windows path resolution bugs in Gradio 4
    with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="Multimodal Deepfake Detection") as demo:

        # Premium Hero Banner
        gr.HTML("""
Multimodal Deepfake Detection
Industry-grade neural security layer fusing Nes2Net acoustics and GenConViT optics.
""")

        with gr.Tabs():

            # ── Tab 1: Audio ──────────────────────────────────────────
            with gr.TabItem("Audio Detection"):
                gr.Markdown("### Upload a speech sample to detect AI-synthesised audio.")
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="Upload Audio",
                            type="filepath",
                            sources=["upload", "microphone"],
                        )
                        audio_btn = gr.Button("Analyse Audio", variant="primary")
                    with gr.Column(scale=2):
                        audio_result = gr.HTML(label="Result")
                        audio_details = gr.Textbox(
                            label="Raw JSON output", lines=10,
                            visible=False, interactive=False,
                        )
                        audio_expand_btn = gr.Button("Show raw output", size="sm", visible=False)

                audio_expand_btn.click(
                    lambda: gr.update(visible=True),
                    outputs=audio_details,
                )
                audio_btn.click(
                    predict_audio,
                    inputs=audio_input,
                    outputs=[audio_result, audio_details, audio_expand_btn],
                )

            # ── Tab 2: Video ──────────────────────────────────────────
            with gr.TabItem("Video Detection"):
                gr.Markdown(
                    "### Upload a video to detect face manipulation.\n"
                    "> **Note:** Full GPU inference requires Colab/Kaggle. "
                    "The local model may report 'Unavailable'."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.Video(label="Upload Video")
                        video_btn = gr.Button("Analyse Video", variant="primary")
                    with gr.Column(scale=2):
                        video_result = gr.HTML(label="Result")
                        video_details = gr.Textbox(
                            label="Raw JSON output", lines=10,
                            visible=False, interactive=False,
                        )
                        video_expand = gr.Button("Show raw output", size="sm", visible=False)

                video_expand.click(
                    lambda: gr.update(visible=True),
                    outputs=video_details,
                )
                video_btn.click(
                    predict_video,
                    inputs=video_input,
                    outputs=[video_result, video_details, video_expand],
                )

            # ── Tab 3: Multimodal ────────────────────────────────────
            with gr.TabItem("Multimodal Fusion"):
                gr.Markdown(
                    "### Upload a video with audio to get a fused verdict.\n"
                    "Both the audio track and video frames will be analysed "
                    "and combined via weighted-average score fusion."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        mm_input = gr.Video(label="Upload Video (with audio)")
                        mm_btn = gr.Button(
                            "Launch Deep Multimodal Fusion 🚀",
                            variant="primary",
                            elem_classes=["fusion-btn"],
                        )
                    with gr.Column(scale=2):
                        mm_result = gr.HTML(label="Result")
                        mm_details = gr.Textbox(
                            label="Raw JSON output", lines=10,
                            visible=False, interactive=False,
                        )
                        mm_expand = gr.Button("Show raw output", size="sm", visible=False)

                mm_expand.click(
                    lambda: gr.update(visible=True),
                    outputs=mm_details,
                )
                mm_btn.click(
                    predict_multimodal,
                    inputs=mm_input,
                    outputs=[mm_result, mm_details, mm_expand],
                )

        gr.Markdown("""
        ---
        **Model Architecture:**
        Audio — Wav2Vec 2.0 (XLSR-300M) + Nes2Net (ASVspoof 2021 DF checkpoint)
        Video — GenConViT (ED + VAE ensemble, GenConViT weights)
        Fusion — Weighted-average late fusion (α = 0.5)

        **B.Tech Project** — Multimodal Deepfake Detection
        """)

    return demo


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        show_api=False,  # disables /api endpoint — prevents gradio_client bool-schema crash
    )