"""
app.py — Multimodal Deepfake Detection — Gradio Frontend
=========================================================
Supports:
• Audio-only detection (upload WAV / FLAC / MP3)
• Video-only detection (upload MP4 — runs GenConViT in-process, loaded via importlib)
• Multimodal fusion (upload video with audio track)
Hosting:
HuggingFace Spaces (recommended) — set HF_SPACE=1 to auto-detect.
Local: python app.py
Model weights are downloaded at startup from HuggingFace Hub.
"""
import os
import sys
import json
import time
import tempfile
import subprocess
import textwrap
from pathlib import Path
# ---------------------------------------------------------------------------
# CRITICAL: Monkey-patch gradio_client BEFORE importing gradio.
# Root cause: gradio_client/utils.py:_json_schema_to_python_type() receives
# a bool (False) as `schema` when processing Video/Audio component schemas
# that contain `"additionalProperties": false`. The function then does
# `if "const" in schema` which crashes because booleans are not iterable.
# This bug lives in: gradio_client<=0.9.1 (fixed in 0.10.0 / gradio>=5.0).
# Since we target gradio 4.44.x for Python 3.10, we patch it in-process.
# ---------------------------------------------------------------------------
def _patch_gradio_client():
    """Make gradio_client's schema-to-type conversion tolerate non-dict schemas.

    Replaces ``gradio_client.utils._json_schema_to_python_type`` with a wrapper
    that returns the string ``"any"`` whenever ``schema`` is not a dict (for
    example the bare ``False`` of ``"additionalProperties": false``) and
    otherwise delegates to the original function.

    The patch is best-effort and idempotent: it is skipped silently when
    gradio_client cannot be imported or lacks the target function, and calling
    this function again does not wrap the wrapper a second time.
    """
    try:
        import gradio_client.utils as _gc
        _orig = _gc._json_schema_to_python_type
    except Exception:
        # Deliberately broad: a missing or incompatible gradio_client must not
        # stop the app here; the subsequent `import gradio` reports real
        # installation problems on its own.
        return

    # Already wrapped by an earlier call (e.g. the module was executed twice).
    if getattr(_orig, "_non_dict_schema_guard", False):
        return

    def _safe(schema, defs=None):
        # Guard: if schema is not a dict (e.g. bool from additionalProperties:false)
        # return a safe fallback type string instead of crashing.
        if not isinstance(schema, dict):
            return "any"
        return _orig(schema, defs)

    _safe._non_dict_schema_guard = True
    _gc._json_schema_to_python_type = _safe


_patch_gradio_client()
import gradio as gr
# ---------------------------------------------------------------------------
# Project paths
# ---------------------------------------------------------------------------
# Directory holding this file, and the project root one level above it.
APP_DIR = Path(__file__).parent.resolve()
PROJ_DIR = APP_DIR.parent

# Per-modality source directories located directly under the project root.
AUDIO_DIR, FUSION_DIR, VIDEO_DIR = (
    PROJ_DIR / subdir
    for subdir in ("audio_detection", "fusion", "video_detection")
)
GENCONVIT_WEIGHT_DIR = VIDEO_DIR.joinpath("GenConViT", "weight")

# Put the fusion and audio directories at the front of the import path
# (fusion first, audio second) so their modules can be imported by bare name.
sys.path[:0] = [str(FUSION_DIR), str(AUDIO_DIR)]
# ---------------------------------------------------------------------------
# GenConViT weight bootstrap — download at startup if missing
# ---------------------------------------------------------------------------
# Checkpoint filename -> download URL. Both files live under the same
# HuggingFace "resolve/main" prefix, so the URLs are derived from the names.
_GENCONVIT_REPO_URL = "https://huggingface.co/Deressa/GenConViT/resolve/main"
_GENCONVIT_WEIGHTS = {
    weight_file: f"{_GENCONVIT_REPO_URL}/{weight_file}"
    for weight_file in (
        "genconvit_ed_inference.pth",
        "genconvit_vae_inference.pth",
    )
}
def _ensure_genconvit_weights():
    """Download GenConViT pretrained weights if they are not already present.

    Every entry of ``_GENCONVIT_WEIGHTS`` is fetched into
    ``GENCONVIT_WEIGHT_DIR`` (created if missing). Each download is written to
    a sibling ``<name>.part`` file and renamed to its final name only after the
    transfer has finished, so an interrupted download is never mistaken for a
    complete checkpoint on the next start. Download failures are printed and
    do not abort startup.
    """
    import urllib.request

    GENCONVIT_WEIGHT_DIR.mkdir(parents=True, exist_ok=True)
    for fname, url in _GENCONVIT_WEIGHTS.items():
        dest = GENCONVIT_WEIGHT_DIR / fname
        if dest.exists():
            print(f"[Video] Weight already present: {fname}")
            continue

        print(f"[Video] Downloading {fname} from HuggingFace …")
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(url, str(partial))
            # Publish the file under its real name only once it is complete.
            partial.replace(dest)
            print(f"[Video] ✓ {fname} downloaded ({dest.stat().st_size // 1_048_576} MB)")
        except Exception as exc:
            print(f"[Video] ✗ Failed to download {fname}: {exc}")
            # Drop whatever was written so far; a missing .part file is fine.
            try:
                partial.unlink()
            except OSError:
                pass


_ensure_genconvit_weights()
# ---------------------------------------------------------------------------
# Lazy model loading
# ---------------------------------------------------------------------------
# Module-level caches for the lazily created model objects. Both start as
# None; get_audio_detector() and get_fusion() assign them on their first call
# and return the cached object afterwards.
_audio_detector = None
_fusion_module = None
def get_audio_detector():
    """Return the shared AudioDeepfakeDetector, constructing it on first use.

    The detector is built on "cuda" when _cuda_available() reports a GPU and
    on "cpu" otherwise, then cached in the module-level ``_audio_detector``.
    """
    global _audio_detector
    if _audio_detector is not None:
        return _audio_detector

    # Imported lazily, on the first call only.
    from inference import AudioDeepfakeDetector

    target_device = "cuda" if _cuda_available() else "cpu"
    _audio_detector = AudioDeepfakeDetector(device=target_device)
    return _audio_detector
def get_fusion():
    """Return the shared MultimodalFusion object, constructing it on first use.

    The instance is created with the "weighted_average" strategy and
    alpha=0.5, then cached in the module-level ``_fusion_module``.
    """
    global _fusion_module
    if _fusion_module is not None:
        return _fusion_module

    # Imported lazily, on the first call only.
    from fusion import MultimodalFusion

    _fusion_module = MultimodalFusion(strategy="weighted_average", alpha=0.5)
    return _fusion_module
def _cuda_available():
    """Return torch.cuda.is_available(), or False when torch cannot be imported."""
    has_cuda = False
    try:
        import torch
        has_cuda = torch.cuda.is_available()
    except ImportError:
        # No torch means no CUDA; keep the False default.
        pass
    return has_cuda
# ---------------------------------------------------------------------------
# Inference helpers
# ---------------------------------------------------------------------------
def _score_bar(score_fake: float) -> str:
    """
    Build a text-based confidence bar for display.

    score_fake: 0..1, 1=100% fake

    The bar is always exactly 20 cells wide: the number of filled cells is
    clamped to 0..20, so a score outside 0..1 can neither overflow the bar
    nor produce a negative repeat count. The percentage label still shows
    the value that was passed in.
    """
    total_cells = 20
    filled = int(round(score_fake * total_cells))
    filled = max(0, min(total_cells, filled))
    bar = "█" * filled + "░" * (total_cells - filled)
    return f"[{bar}] {score_fake*100:.1f}% Fake"
def run_audio_inference(audio_path: str):
    """Run Nes2Net on a local audio file. Returns result dict.

    Returns None without doing anything when ``audio_path`` is None.
    Otherwise the working directory is switched to AUDIO_DIR for the duration
    of the prediction and set to PROJ_DIR afterwards. The switch back happens
    in a ``finally`` clause, so a failing prediction no longer leaves the
    process stranded in AUDIO_DIR. Exceptions from the detector propagate.
    """
    if audio_path is None:
        return None
    # NOTE(review): presumably the detector resolves relative paths against
    # AUDIO_DIR — confirm against audio_detection/inference.py.
    os.chdir(str(AUDIO_DIR))
    try:
        det = get_audio_detector()
        return det.predict(audio_path)
    finally:
        os.chdir(str(PROJ_DIR))
def run_video_inference_via_subprocess(video_path: str):
    """
    Run GenConViT video inference on ``video_path``.

    video_detection/inference.py is loaded from its file path under the
    module name "video_inference", so it does not collide in sys.modules
    with the 'inference' module from audio_detection.

    Returns the value of ``VideoDeepfakeDetector.predict``. If anything
    raises along the way, a placeholder dict is returned instead whose
    "error" key carries the exception text and whose "label" is
    "Unavailable".
    """
    video_dir = PROJ_DIR / "video_detection"
    try:
        from importlib import util as importlib_util

        # Load by absolute path under a unique name so the cached 'inference'
        # entry (the audio_detection version) is bypassed.
        module_spec = importlib_util.spec_from_file_location(
            "video_inference",
            str(video_dir / "inference.py"),
        )
        video_module = importlib_util.module_from_spec(module_spec)
        module_spec.loader.exec_module(video_module)

        detector_options = {
            "ed_weight": "genconvit_ed_inference",
            "vae_weight": "genconvit_vae_inference",
            "num_frames": 15,
            "fp16": False,
        }
        detector = video_module.VideoDeepfakeDetector(**detector_options)
        return detector.predict(video_path)
    except Exception as exc:
        failure = {"error": str(exc), "label": "Unavailable"}
        # Score fields are present but empty so callers can still read them.
        for score_key in ("prob_fake", "prob_real", "normalized_score"):
            failure[score_key] = None
        failure["faces_detected"] = False
        return failure
def extract_audio_from_video(video_path: str) -> str | None:
"""Extract 16kHz mono WAV from a video file using ffmpeg."""
out_path = os.path.join(tempfile.gettempdir(), "deepfake_extracted.wav")
try:
subprocess.run(
[
"ffmpeg", "-y", "-i", video_path,
"-vn", "-acodec", "pcm_s16le",
"-ar", "16000", "-ac", "1",
out_path,
],
capture_output=True, check=True,
)
return out_path
except Exception:
return None
# ---------------------------------------------------------------------------
# Gradio callbacks
# ---------------------------------------------------------------------------
def predict_audio(audio_file):
    """Gradio callback for the audio tab.

    ``audio_file`` is the path handed over by the audio input, or None when
    nothing was supplied.

    Returns a 3-tuple matching the tab's outputs: the summary text for the
    HTML result component, the raw result as indented JSON, and a
    ``gr.update`` toggling the "Show raw output" button. Any exception is
    reported as an "Error: ..." message; its text is HTML-escaped because the
    first output is rendered as HTML.
    """
    import html

    if audio_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        result = run_audio_inference(audio_file)
        elapsed = time.time() - t0
        if result is None:
            return "Model not loaded.", "", gr.update(visible=False)

        label = result.get("label", "Unknown")
        real_score = result.get("real_score", 0.5)
        fake_score = 1 - real_score
        # Fall back to the distance from 0.5, rescaled to 0..1, when the
        # detector result carries no "confidence" entry.
        conf = result.get("confidence", abs(real_score - 0.5) * 2)
        verdict = "FAKE AUDIO DETECTED" if label.upper() == "FAKE" else "GENUINE AUDIO"
        summary_html = f"""
{verdict}
Confidence Metrics
{conf:.1%}
P(Fake) vs P(Real)
{fake_score:.4f} | {real_score:.4f}
Execution Time: {elapsed:.2f}s
"""
        details = json.dumps(result, indent=2)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)
def predict_video(video_file):
    """Gradio callback for the video tab.

    ``video_file`` is the path handed over by the video input, or None when
    nothing was supplied.

    Returns a 3-tuple matching the tab's outputs: the summary text for the
    HTML result component, the raw result as indented JSON, and a
    ``gr.update`` toggling the "Show raw output" button. When the video model
    reports an error, the first element is a three-line notice separated by
    ``<br>`` and the raw result is still made available. Exception and error
    texts are HTML-escaped because the first output is rendered as HTML.
    """
    import html

    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        result = run_video_inference_via_subprocess(video_file)
        elapsed = time.time() - t0

        if result.get("error"):
            error_text = html.escape(str(result["error"]))
            return (
                "Video model unavailable locally."
                f"<br>{error_text}"
                "<br>To evaluate videos, run the notebook on Colab/Kaggle.",
                json.dumps(result, indent=2),
                gr.update(visible=True),
            )

        label = result.get("label", "Unknown")
        prob_fake = result.get("prob_fake", 0.5)
        prob_real = result.get("prob_real", 0.5)
        faces = result.get("faces_detected", False)
        verdict = "DEEPFAKE VIDEO DETECTED" if label.upper() == "FAKE" else "GENUINE VIDEO"
        face_status = "Face Found" if faces else "No Face"
        summary_html = f"""
{verdict}
Facial Extraction Target
{face_status}
P(Fake) vs P(Real)
{prob_fake:.4f} | {prob_real:.4f}
Execution Time: {elapsed:.2f}s
"""
        return summary_html, json.dumps(result, indent=2), gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)
def predict_multimodal(video_file):
    """Fuse audio + video scores from a single video file.

    The audio track is extracted with extract_audio_from_video(); audio
    inference is skipped (None is passed to the fusion step) when extraction
    fails. Video inference always runs. Both results are combined by
    ``MultimodalFusion.from_detector_results`` using the "weighted_average"
    strategy with alpha=0.5.

    Returns a 3-tuple matching the tab's outputs: the summary text for the
    HTML result component, the fused result as indented JSON, and a
    ``gr.update`` toggling the "Show raw output" button. Any exception is
    reported as an "Error: ..." message; its text is HTML-escaped because the
    first output is rendered as HTML.
    """
    import html

    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        # Extract audio
        audio_path = extract_audio_from_video(video_file)
        # Run both modalities
        audio_result = run_audio_inference(audio_path) if audio_path else None
        video_result = run_video_inference_via_subprocess(video_file)
        # Fuse
        from fusion import MultimodalFusion
        _, fused = MultimodalFusion.from_detector_results(
            audio_result, video_result, strategy="weighted_average", alpha=0.5
        )
        elapsed = time.time() - t0

        label = fused["label"]
        mods = ", ".join(fused["modalities_used"]) or "none"
        # The summary reports 1 - fused_score as the "fake" percentage.
        fake_score = 1 - fused["fused_score"]
        verdict = "DEEPFAKE DETECTED" if label.upper() == "FAKE" else "GENUINE MEDIA"
        summary_html = f"""
{verdict}
Multimodal Fused Score (Fake)
{fake_score * 100:.2f}%
Audio P(Real)
{fused.get('audio_score', 'N/A')}
Video P(Real)
{fused.get('video_score', 'N/A')}
Active Streams: {mods} | Inference Time: {elapsed:.2f}s
"""
        return summary_html, json.dumps(fused, indent=2), gr.update(visible=True)
    except Exception as exc:
        return f"Error: {html.escape(str(exc))}", "", gr.update(visible=False)
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Dark violet/blue look: start from Gradio's Soft theme, then override
# individual style variables. Light and dark mode use the same values for the
# page gradient and the block fill.
_PAGE_GRADIENT = "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)"
_BLOCK_FILL = "rgba(30, 41, 59, 0.7)"
_soft_theme = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Outfit"), "sans-serif"],
)
THEME = _soft_theme.set(
    body_background_fill=_PAGE_GRADIENT,
    body_background_fill_dark=_PAGE_GRADIENT,
    block_background_fill=_BLOCK_FILL,
    block_background_fill_dark=_BLOCK_FILL,
    block_border_color="rgba(148, 163, 184, 0.2)",
    block_border_width="1px",
    block_label_text_color="#cbd5e1",
    block_shadow="0 8px 32px 0 rgba(0, 0, 0, 0.3)",
    button_primary_background_fill="linear-gradient(90deg, #8b5cf6 0%, #3b82f6 100%)",
    button_primary_background_fill_hover="linear-gradient(90deg, #7c3aed 0%, #2563eb 100%)",
    button_primary_text_color="#ffffff",
    input_background_fill="rgba(15, 23, 42, 0.6)",
    input_border_color="rgba(99, 102, 241, 0.3)",
    panel_background_fill="rgba(30, 41, 59, 0.4)",
)
# Markdown blurb: a heading, the three model components, and a usage hint.
# The value starts and ends with a newline.
DESCRIPTION = textwrap.dedent(
    "\n".join(
        (
            "",
            "## Multimodal Deepfake Detection System",
            "**Nes2Net (audio)** | **GenConViT (video)** | **Late fusion**",
            "> Upload an audio clip, a video, or a video with audio to detect deepfakes.",
            "",
        )
    )
)
# Stylesheet passed inline to gr.Blocks(css=...) in build_ui(). It restyles the
# Gradio container and standard wrappers, and defines the custom classes
# (.hero-*, .glass-panel, .status-*, .score-*, .progress-*, .fusion-btn) plus
# the "fillout" and "pulseGlow" keyframe animations.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&display=swap');
/* Force overriding Gradio's internal container instead of just 'body' */
.gradio-container {
background-color: #0b0f19 !important;
background-image: radial-gradient(at 0% 0%, rgba(17, 24, 39, 1) 0, transparent 50%), radial-gradient(at 100% 0%, rgba(30, 27, 75, 1) 0, transparent 50%), radial-gradient(at 50% 100%, rgba(15, 23, 42, 1) 0, transparent 50%) !important;
background-attachment: fixed !important;
color: #f8fafc !important;
font-family: 'Outfit', sans-serif !important;
}
/* Force standard Gradio wrappers to be slightly transparent to see background */
.wrap, .panel, .gap, .form {
background-color: rgba(15, 23, 42, 0.4) !important;
border-color: rgba(255, 255, 255, 0.05) !important;
}
.hero-header {
text-align: center;
padding: 30px 10px;
margin-bottom: 30px;
background: rgba(15, 23, 42, 0.4) !important;
border-radius: 16px;
border: 1px solid rgba(255, 255, 255, 0.05) !important;
backdrop-filter: blur(20px);
box-shadow: 0 10px 40px rgba(0,0,0,0.5);
}
.hero-title {
font-size: 3rem;
font-weight: 800;
text-transform: uppercase;
letter-spacing: 2px;
background: linear-gradient(to right, #8b5cf6, #3b82f6, #06b6d4) !important;
-webkit-background-clip: text !important;
background-clip: text !important;
-webkit-text-fill-color: transparent !important;
margin-bottom: 10px;
}
.hero-subtitle {
font-size: 1.1rem;
color: #94a3b8 !important;
font-weight: 300;
}
.glass-panel {
background: linear-gradient(145deg, rgba(30, 41, 59, 0.6) 0%, rgba(15, 23, 42, 0.8) 100%);
backdrop-filter: blur(24px);
border: 1px solid rgba(148, 163, 184, 0.1);
border-radius: 20px;
padding: 32px;
box-shadow: 0 15px 35px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.05);
margin-bottom: 24px;
transition: transform 0.3s ease, box-shadow 0.3s ease;
}
.glass-panel:hover {
transform: translateY(-5px);
box-shadow: 0 20px 40px rgba(0,0,0,0.6), inset 0 1px 0 rgba(255,255,255,0.1);
}
.status-fake { border-top: 4px solid #f43f5e; box-shadow: 0 10px 40px rgba(244, 63, 94, 0.15); }
.status-real { border-top: 4px solid #10b981; box-shadow: 0 10px 40px rgba(16, 185, 129, 0.15); }
.result-title { font-size: 2.2rem; font-weight: 800; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 24px; text-align: center; text-shadow: 0 4px 10px rgba(0,0,0,0.4); }
.score-container { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; background: rgba(0, 0, 0, 0.3); border-radius: 12px; margin-top: 16px; border: 1px solid rgba(255,255,255,0.03); }
.score-label { font-size: 1rem; color: #94a3b8; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; }
.score-value { font-size: 1.5rem; font-weight: 800; color: #f8fafc; }
@keyframes fillout { from { width: 0; opacity: 0; } to { opacity: 1; } }
.progress-track { width: 100%; height: 14px; background: rgba(0, 0, 0, 0.5); border-radius: 7px; overflow: hidden; margin-top: 12px; box-shadow: inset 0 2px 4px rgba(0,0,0,0.5); }
.progress-fill { height: 100%; border-radius: 7px; animation: fillout 1.2s cubic-bezier(0.16, 1, 0.3, 1) forwards; position: relative; }
.status-fake .progress-fill { background: linear-gradient(90deg, #be123c, #f43f5e); }
.status-real .progress-fill { background: linear-gradient(90deg, #047857, #10b981); }
.gradio-container .prose * { padding: 0 !important; }
.fusion-btn {
background: linear-gradient(90deg, #8b5cf6, #3b82f6) !important;
border: none !important;
box-shadow: 0 0 15px rgba(139, 92, 246, 0.5) !important;
animation: pulseGlow 2s infinite;
}
@keyframes pulseGlow {
0% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
50% { box-shadow: 0 0 30px rgba(139, 92, 246, 0.9); }
100% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
}
"""
def build_ui():
    """Assemble the Gradio Blocks app and return it without launching it.

    The layout is a hero HTML element, three tabs (audio, video, multimodal
    fusion) and a footer. Every tab pairs an input column with a result column
    and wires its "analyse" button to the matching predict_* callback.
    """

    def _result_column():
        """Create one tab's result column and return its three output widgets.

        The returned list is [summary HTML, raw-JSON textbox, reveal button],
        the order in which the predict_* callbacks return their values. The
        textbox and the button start hidden; clicking the button shows the
        textbox.
        """
        with gr.Column(scale=2):
            summary = gr.HTML(label="Result")
            raw_json = gr.Textbox(
                label="Raw JSON output",
                lines=10,
                visible=False,
                interactive=False,
            )
            reveal = gr.Button("Show raw output", size="sm", visible=False)
        reveal.click(lambda: gr.update(visible=True), outputs=raw_json)
        return [summary, raw_json, reveal]

    # Injecting CSS strictly inline to bypass Windows path resolution bugs in Gradio 4
    with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="Multimodal Deepfake Detection") as demo:
        # Premium Hero Banner (the markup body is a single newline here)
        gr.HTML("\n")

        with gr.Tabs():
            # ── Tab 1: Audio ──────────────────────────────────────────
            with gr.TabItem("Audio Detection"):
                gr.Markdown("### Upload a speech sample to detect AI-synthesised audio.")
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="Upload Audio",
                            type="filepath",
                            sources=["upload", "microphone"],
                        )
                        audio_btn = gr.Button("Analyse Audio", variant="primary")
                    audio_outputs = _result_column()
                audio_btn.click(predict_audio, inputs=audio_input, outputs=audio_outputs)

            # ── Tab 2: Video ──────────────────────────────────────────
            with gr.TabItem("Video Detection"):
                gr.Markdown(
                    "### Upload a video to detect face manipulation.\n"
                    "> **Note:** Full GPU inference requires Colab/Kaggle. "
                    "The local model may report 'Unavailable'."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.Video(label="Upload Video")
                        video_btn = gr.Button("Analyse Video", variant="primary")
                    video_outputs = _result_column()
                video_btn.click(predict_video, inputs=video_input, outputs=video_outputs)

            # ── Tab 3: Multimodal ────────────────────────────────────
            with gr.TabItem("Multimodal Fusion"):
                gr.Markdown(
                    "### Upload a video with audio to get a fused verdict.\n"
                    "Both the audio track and video frames will be analysed "
                    "and combined via weighted-average score fusion."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        mm_input = gr.Video(label="Upload Video (with audio)")
                        mm_btn = gr.Button(
                            "Launch Deep Multimodal Fusion 🚀",
                            variant="primary",
                            elem_classes=["fusion-btn"],
                        )
                    mm_outputs = _result_column()
                mm_btn.click(predict_multimodal, inputs=mm_input, outputs=mm_outputs)

        # Footer: model summary shown below the tabs.
        gr.Markdown("""
---
**Model Architecture:**
Audio — Wav2Vec 2.0 (XLSR-300M) + Nes2Net (ASVspoof 2021 DF checkpoint)
Video — GenConViT (ED + VAE ensemble, GenConViT weights)
Fusion — Weighted-average late fusion (α = 0.5)
**B.Tech Project** — Multimodal Deepfake Detection
""")
    return demo
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at port 7860, without a
    # public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        # disables /api endpoint — prevents gradio_client bool-schema crash
        "show_api": False,
    }
    demo = build_ui()
    demo.launch(**launch_options)