Spaces:

Aryanaideveloper
/

Multimodal_Deepfake_Detection

Sleeping

App Files Files Community

Multimodal_Deepfake_Detection / web_ui /app.py

Aryanaideveloper

fix: correct GenConViT weight download directory path to be inside GenConViT package

d95c9f9 about 2 months ago

raw

history blame contribute delete

25.7 kB

	"""
	app.py — Multimodal Deepfake Detection — Gradio Frontend
	=========================================================
	Supports:
	• Audio-only detection (upload WAV / FLAC / MP3)
	• Video-only detection (upload MP4 — runs GenConViT in-process, loaded via importlib)
	• Multimodal fusion (upload video with audio track)

	Hosting:
	HuggingFace Spaces (recommended) — set HF_SPACE=1 to auto-detect.
	Local: python app.py

	Model weights are downloaded at startup from HuggingFace Hub.
	"""

	import os
	import sys
	import json
	import time
	import tempfile
	import subprocess
	import textwrap
	from pathlib import Path

	# ---------------------------------------------------------------------------
	# CRITICAL: Monkey-patch gradio_client BEFORE importing gradio.
	# Root cause: gradio_client/utils.py:_json_schema_to_python_type() receives
	# a bool (False) as `schema` when processing Video/Audio component schemas
	# that contain `"additionalProperties": false`. The function then does
	# `if "const" in schema` which crashes because booleans are not iterable.
	# This bug lives in: gradio_client<=0.9.1 (fixed in 0.10.0 / gradio>=5.0).
	# Since we target gradio 4.44.x for Python 3.10, we patch it in-process.
	# ---------------------------------------------------------------------------
	def _patch_gradio_client():
	    """Make gradio_client's JSON-schema converter tolerate non-dict schemas.

	    Replaces ``gradio_client.utils._json_schema_to_python_type`` with a
	    wrapper that returns the string ``"any"`` whenever ``schema`` is not a
	    ``dict`` and otherwise delegates to the original function.

	    The patch is best-effort: any exception raised while importing or
	    patching is swallowed and the function simply returns ``None``.
	    """
	    try:
	        import gradio_client.utils as client_utils

	        original_converter = client_utils._json_schema_to_python_type

	        def _tolerant_converter(schema, defs=None):
	            # Only dict schemas are forwarded; anything else (for example a
	            # bare bool) maps to a generic fallback type string.
	            return original_converter(schema, defs) if isinstance(schema, dict) else "any"

	        client_utils._json_schema_to_python_type = _tolerant_converter
	    except Exception:
	        # gradio_client missing or not patchable: leave everything as it is.
	        return None

	_patch_gradio_client()

	import gradio as gr


	# ---------------------------------------------------------------------------
	# Project paths
	# ---------------------------------------------------------------------------
	# Directory layout, derived from this file's location: the project root is
	# the parent of the directory that contains this module.
	APP_DIR = Path(__file__).parent.resolve()
	PROJ_DIR = APP_DIR.parent
	AUDIO_DIR, FUSION_DIR, VIDEO_DIR = (
	    PROJ_DIR / sub_dir for sub_dir in ("audio_detection", "fusion", "video_detection")
	)
	GENCONVIT_WEIGHT_DIR = VIDEO_DIR.joinpath("GenConViT", "weight")

	# Put both package directories at the front of sys.path (fusion ahead of
	# audio) so the function-level ``from inference import ...`` and
	# ``from fusion import ...`` statements can resolve.
	sys.path[:0] = [str(FUSION_DIR), str(AUDIO_DIR)]


	# ---------------------------------------------------------------------------
	# GenConViT weight bootstrap — download at startup if missing
	# ---------------------------------------------------------------------------
	# File name -> download URL of each GenConViT checkpoint fetched at startup.
	_GENCONVIT_WEIGHTS = {
	    "genconvit_ed_inference.pth": "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_ed_inference.pth",
	    "genconvit_vae_inference.pth": "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_vae_inference.pth",
	}


	def _ensure_genconvit_weights(weight_dir=None, weights=None):
	    """Download GenConViT pretrained weights that are not already present.

	    Each file is first written to ``<name>.part`` and only renamed to its
	    final name once the transfer has completed.  An interrupted download
	    therefore never leaves a truncated file that a later start-up would
	    mistake for a valid checkpoint.

	    Download failures are reported on stdout and otherwise ignored, so a
	    missing network connection does not prevent the app from starting.

	    Args:
	        weight_dir: Directory to store the weights in.  Defaults to
	            ``GENCONVIT_WEIGHT_DIR``.
	        weights: Mapping of file name to download URL.  Defaults to
	            ``_GENCONVIT_WEIGHTS``.
	    """
	    import urllib.request

	    target_dir = Path(GENCONVIT_WEIGHT_DIR if weight_dir is None else weight_dir)
	    manifest = _GENCONVIT_WEIGHTS if weights is None else weights

	    target_dir.mkdir(parents=True, exist_ok=True)
	    for fname, url in manifest.items():
	        dest = target_dir / fname
	        if dest.exists():
	            print(f"[Video] Weight already present: {fname}")
	            continue

	        print(f"[Video] Downloading {fname} from HuggingFace …")
	        partial = dest.with_name(dest.name + ".part")
	        try:
	            urllib.request.urlretrieve(url, str(partial))
	            # Publish the file under its real name only after a full download.
	            os.replace(partial, dest)
	            print(f"[Video] ✓ {fname} downloaded ({dest.stat().st_size // 1_048_576} MB)")
	        except Exception as exc:
	            print(f"[Video] ✗ Failed to download {fname}: {exc}")
	            try:
	                partial.unlink(missing_ok=True)
	            except OSError:
	                # Cleanup is best-effort; a stale .part file is overwritten
	                # by the next attempt anyway.
	                pass

	_ensure_genconvit_weights()


	# ---------------------------------------------------------------------------
	# Lazy model loading
	# ---------------------------------------------------------------------------
	# Process-wide singletons, created on first use by the getters in this section.
	_audio_detector = None
	_fusion_module = None


	def get_audio_detector():
	    """Return the shared ``AudioDeepfakeDetector``, creating it on first call.

	    The detector is constructed with ``device="cuda"`` when
	    ``_cuda_available()`` is true and ``device="cpu"`` otherwise.
	    """
	    global _audio_detector
	    if _audio_detector is not None:
	        return _audio_detector

	    # Imported lazily so the module is only loaded when audio inference runs.
	    from inference import AudioDeepfakeDetector

	    device = "cuda" if _cuda_available() else "cpu"
	    _audio_detector = AudioDeepfakeDetector(device=device)
	    return _audio_detector


	def get_fusion():
	    """Return the shared ``MultimodalFusion`` instance, creating it on first call.

	    The instance is built with ``strategy="weighted_average"`` and
	    ``alpha=0.5`` and cached in the module-level ``_fusion_module``.
	    """
	    global _fusion_module
	    if _fusion_module is not None:
	        return _fusion_module

	    # Imported lazily so the module is only loaded when fusion is requested.
	    from fusion import MultimodalFusion

	    _fusion_module = MultimodalFusion(strategy="weighted_average", alpha=0.5)
	    return _fusion_module


	def _cuda_available():
	    """Return ``torch.cuda.is_available()``, or ``False`` if torch cannot be imported."""
	    try:
	        import torch
	    except ImportError:
	        return False
	    return torch.cuda.is_available()


	# ---------------------------------------------------------------------------
	# Inference helpers
	# ---------------------------------------------------------------------------

	def _score_bar(score_fake: float) -> str:
	    """Build a 20-cell text confidence bar for display.

	    Args:
	        score_fake: Fake probability, nominally in ``0..1`` (1 = 100% fake).

	    Returns:
	        A string such as ``"[██████████░░░░░░░░░░] 50.0% Fake"``.  The bar is
	        always exactly 20 cells wide: the fill is clamped for out-of-range
	        scores, while the printed percentage still shows the raw value.
	    """
	    width = 20
	    # Clamp so a score below 0 or above 1 cannot shrink or overflow the bar.
	    filled = max(0, min(width, int(round(score_fake * width))))
	    bar = "█" * filled + "░" * (width - filled)
	    return f"[{bar}] {score_fake*100:.1f}% Fake"


	def run_audio_inference(audio_path: str):
	    """Run the audio (Nes2Net) detector on a local audio file.

	    The detector is invoked with the working directory set to ``AUDIO_DIR``;
	    afterwards the working directory is set to ``PROJ_DIR``, even when the
	    detector raises.

	    Args:
	        audio_path: Path to the audio file, or ``None``.

	    Returns:
	        Whatever ``AudioDeepfakeDetector.predict`` returns, or ``None`` when
	        ``audio_path`` is ``None``.
	    """
	    if audio_path is None:
	        return None

	    # Resolve before chdir so a relative path still points at the same file.
	    audio_path = os.path.abspath(audio_path)

	    os.chdir(str(AUDIO_DIR))
	    try:
	        return get_audio_detector().predict(audio_path)
	    finally:
	        # Always leave the audio directory, including on failure.
	        os.chdir(str(PROJ_DIR))


	def run_video_inference_via_subprocess(video_path: str):
	    """Run GenConViT video inference on ``video_path``.

	    Despite the name, no subprocess is started: ``video_detection/inference.py``
	    is loaded from its file path under the module name ``"video_inference"``
	    so that it does not clash with the audio package's ``inference`` module.

	    Returns:
	        The result of ``VideoDeepfakeDetector.predict``.  If loading or
	        inference raises, a dict with the exception text under ``"error"``,
	        ``"label": "Unavailable"``, ``None`` scores and
	        ``"faces_detected": False``.
	    """
	    module_path = PROJ_DIR / "video_detection" / "inference.py"
	    try:
	        from importlib import util as import_util

	        # Loading by absolute path under a unique name keeps this module
	        # separate from the already-imported audio ``inference`` module.
	        module_spec = import_util.spec_from_file_location("video_inference", str(module_path))
	        video_module = import_util.module_from_spec(module_spec)
	        module_spec.loader.exec_module(video_module)

	        detector = video_module.VideoDeepfakeDetector(
	            ed_weight="genconvit_ed_inference",
	            vae_weight="genconvit_vae_inference",
	            num_frames=15,
	            fp16=False,
	        )
	        return detector.predict(video_path)
	    except Exception as exc:
	        failure = {"error": str(exc), "label": "Unavailable"}
	        for score_key in ("prob_fake", "prob_real", "normalized_score"):
	            failure[score_key] = None
	        failure["faces_detected"] = False
	        return failure



	def extract_audio_from_video(video_path: str) -> str \| None:
	"""Extract 16kHz mono WAV from a video file using ffmpeg."""
	out_path = os.path.join(tempfile.gettempdir(), "deepfake_extracted.wav")
	try:
	subprocess.run(
	[
	"ffmpeg", "-y", "-i", video_path,
	"-vn", "-acodec", "pcm_s16le",
	"-ar", "16000", "-ac", "1",
	out_path,
	],
	capture_output=True, check=True,
	)
	return out_path
	except Exception:
	return None


	# ---------------------------------------------------------------------------
	# Gradio callbacks
	# ---------------------------------------------------------------------------

	def predict_audio(audio_file):
	    """Gradio callback: classify an uploaded audio file and render the verdict.

	    Args:
	        audio_file: Path of the uploaded audio file, or ``None`` when nothing
	            was uploaded.

	    Returns:
	        A 3-tuple ``(summary_html, raw_json, expand_button_update)``: the HTML
	        verdict panel (or a short message), the pretty-printed result JSON
	        (empty on failure) and a ``gr.update`` that shows or hides the
	        "Show raw output" button.
	    """
	    import html

	    if audio_file is None:
	        return "No file uploaded.", "", gr.update(visible=False)
	    try:
	        t0 = time.time()
	        result = run_audio_inference(audio_file)
	        elapsed = time.time() - t0

	        if result is None:
	            return "Model not loaded.", "", gr.update(visible=False)

	        label = result.get("label", "Unknown")
	        real_score = result.get("real_score", 0.5)
	        fake_score = 1 - real_score
	        # Fall back to the distance from 0.5, rescaled to 0..1.
	        conf = result.get("confidence", abs(real_score - 0.5) * 2)

	        is_fake = label.upper() == "FAKE"
	        verdict = "FAKE AUDIO DETECTED" if is_fake else "GENUINE AUDIO"
	        colour = "#ff4b4b" if is_fake else "#21c354"
	        status = "fake" if is_fake else "real"

	        summary_html = f"""
	<div class="glass-panel status-{status}">
	<div class="result-title" style="color: {colour};">{verdict}</div>
	<div class="score-container">
	<span class="score-label">Confidence Metrics</span>
	<span class="score-value" style="color: {colour};">{conf:.1%}</span>
	</div>
	<div class="score-container">
	<span class="score-label">P(Fake) vs P(Real)</span>
	<span class="score-value">{fake_score:.4f}   |   {real_score:.4f}</span>
	</div>
	<div class="progress-track">
	<div class="progress-fill" style="width: {fake_score * 100}%;"></div>
	</div>
	<div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
	Execution Time: {elapsed:.2f}s
	</div>
	</div>
	"""
	        details = json.dumps(result, indent=2)
	        return summary_html, details, gr.update(visible=True)
	    except Exception as exc:
	        # The message is rendered as HTML, so escape the exception text.
	        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)


	def predict_video(video_file):
	    """Gradio callback: classify an uploaded video and render the verdict.

	    Args:
	        video_file: Path of the uploaded video, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A 3-tuple ``(summary_html, raw_json, expand_button_update)``.  When
	        the video detector reports an ``"error"``, the HTML explains that the
	        video model is unavailable and the raw JSON carries the error dict.
	    """
	    import html

	    if video_file is None:
	        return "No file uploaded.", "", gr.update(visible=False)
	    try:
	        t0 = time.time()
	        result = run_video_inference_via_subprocess(video_file)
	        elapsed = time.time() - t0

	        if result.get("error"):
	            return (
	                f"<b style='color:#ff4b4b'>Video model unavailable locally.</b>"
	                # The error text is rendered as HTML, so escape it.
	                f"<br><small>{html.escape(str(result['error']))}</small>"
	                f"<br><br>To evaluate videos, run the notebook on Colab/Kaggle.",
	                json.dumps(result, indent=2),
	                gr.update(visible=True),
	            )

	        label = result.get("label", "Unknown")
	        prob_fake = result.get("prob_fake", 0.5)
	        prob_real = result.get("prob_real", 0.5)
	        faces = result.get("faces_detected", False)

	        is_fake = label.upper() == "FAKE"
	        verdict = "DEEPFAKE VIDEO DETECTED" if is_fake else "GENUINE VIDEO"
	        colour = "#ff4b4b" if is_fake else "#21c354"
	        status = "fake" if is_fake else "real"
	        face_status = "Face Found" if faces else "No Face"

	        summary_html = f"""
	<div class="glass-panel status-{status}">
	<div class="result-title" style="color: {colour};">{verdict}</div>
	<div class="score-container">
	<span class="score-label">Facial Extraction Target</span>
	<span class="score-value">{face_status}</span>
	</div>
	<div class="score-container">
	<span class="score-label">P(Fake) vs P(Real)</span>
	<span class="score-value">{prob_fake:.4f}   |   {prob_real:.4f}</span>
	</div>
	<div class="progress-track">
	<div class="progress-fill" style="width: {prob_fake * 100}%;"></div>
	</div>
	<div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
	Execution Time: {elapsed:.2f}s
	</div>
	</div>
	"""
	        return summary_html, json.dumps(result, indent=2), gr.update(visible=True)
	    except Exception as exc:
	        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)


	def predict_multimodal(video_file):
	    """Gradio callback: fuse audio and video scores from a single video file.

	    The audio track is extracted with ``extract_audio_from_video``.  If that
	    fails, the audio result is ``None`` and only the video result is passed
	    on.  Both results go to ``MultimodalFusion.from_detector_results`` with
	    ``strategy="weighted_average"`` and ``alpha=0.5``.

	    Args:
	        video_file: Path of the uploaded video, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A 3-tuple ``(summary_html, raw_json, expand_button_update)``.
	    """
	    import html

	    def _fmt_score(value):
	        # A modality that produced no numeric score is shown as "N/A".
	        try:
	            return f"{float(value):.4f}"
	        except (TypeError, ValueError):
	            return "N/A"

	    if video_file is None:
	        return "No file uploaded.", "", gr.update(visible=False)
	    try:
	        t0 = time.time()

	        # Extract audio
	        audio_path = extract_audio_from_video(video_file)

	        # Run both modalities
	        audio_result = run_audio_inference(audio_path) if audio_path else None
	        video_result = run_video_inference_via_subprocess(video_file)

	        # Fuse
	        from fusion import MultimodalFusion
	        _, fused = MultimodalFusion.from_detector_results(
	            audio_result, video_result, strategy="weighted_average", alpha=0.5
	        )

	        elapsed = time.time() - t0
	        label = fused["label"]
	        score = fused["fused_score"]
	        mods = ", ".join(fused["modalities_used"]) or "none"

	        # The panel shows 1 - fused_score as the fake percentage.
	        fake_score = 1 - score
	        is_fake = label.upper() == "FAKE"
	        verdict = "DEEPFAKE DETECTED" if is_fake else "GENUINE MEDIA"
	        colour = "#ff4b4b" if is_fake else "#21c354"
	        status = "fake" if is_fake else "real"
	        audio_score = _fmt_score(fused.get("audio_score"))
	        video_score = _fmt_score(fused.get("video_score"))

	        summary_html = f"""
	<div class="glass-panel status-{status}">
	<div class="result-title" style="color: {colour};">{verdict}</div>
	<div class="score-container">
	<span class="score-label">Multimodal Fused Score (Fake)</span>
	<span class="score-value" style="color: {colour};">{fake_score * 100:.2f}%</span>
	</div>
	<div class="progress-track">
	<div class="progress-fill" style="width: {fake_score * 100}%;"></div>
	</div>

	<div style="display: flex; gap: 16px; margin-top: 16px;">
	<div class="score-container" style="flex: 1;">
	<span class="score-label">Audio P(Real)</span>
	<span class="score-value">{audio_score}</span>
	</div>
	<div class="score-container" style="flex: 1;">
	<span class="score-label">Video P(Real)</span>
	<span class="score-value">{video_score}</span>
	</div>
	</div>

	<div style="text-align: right; margin-top: 16px; font-size: 0.8rem; color: #64748b;">
	Active Streams: {mods} | Inference Time: {elapsed:.2f}s
	</div>
	</div>
	"""
	        return summary_html, json.dumps(fused, indent=2), gr.update(visible=True)
	    except Exception as exc:
	        # The message is rendered as HTML, so escape the exception text.
	        return f"<b>Error:</b> {html.escape(str(exc))}", "", gr.update(visible=False)


	# ---------------------------------------------------------------------------
	# UI
	# ---------------------------------------------------------------------------

	# Colour and shadow overrides applied on top of the base Soft theme.
	_THEME_OVERRIDES = {
	    "body_background_fill": "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
	    "body_background_fill_dark": "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)",
	    "block_background_fill": "rgba(30, 41, 59, 0.7)",
	    "block_background_fill_dark": "rgba(30, 41, 59, 0.7)",
	    "block_border_color": "rgba(148, 163, 184, 0.2)",
	    "block_border_width": "1px",
	    "block_label_text_color": "#cbd5e1",
	    "block_shadow": "0 8px 32px 0 rgba(0, 0, 0, 0.3)",
	    "button_primary_background_fill": "linear-gradient(90deg, #8b5cf6 0%, #3b82f6 100%)",
	    "button_primary_background_fill_hover": "linear-gradient(90deg, #7c3aed 0%, #2563eb 100%)",
	    "button_primary_text_color": "#ffffff",
	    "input_background_fill": "rgba(15, 23, 42, 0.6)",
	    "input_border_color": "rgba(99, 102, 241, 0.3)",
	    "panel_background_fill": "rgba(30, 41, 59, 0.4)",
	}

	# Gradio theme handed to gr.Blocks(theme=...) by build_ui().
	THEME = gr.themes.Soft(
	    primary_hue="violet",
	    secondary_hue="blue",
	    neutral_hue="slate",
	    font=[gr.themes.GoogleFont("Outfit"), "sans-serif"],
	).set(**_THEME_OVERRIDES)

	# Markdown blurb describing the app: a heading, the model line-up and a
	# short usage hint.
	DESCRIPTION = textwrap.dedent("""
	## Multimodal Deepfake Detection System
	Nes2Net (audio) | GenConViT (video) | Late fusion

	> Upload an audio clip, a video, or a video with audio to detect deepfakes.
	""")


	# Stylesheet handed to gr.Blocks(css=...) in build_ui().  It defines the
	# classes used by the HTML that the predict_* callbacks and the hero banner
	# emit (.glass-panel, .status-fake/.status-real, .result-title,
	# .score-container/.score-label/.score-value, .progress-track/.progress-fill,
	# .hero-header/.hero-title/.hero-subtitle) plus .fusion-btn, which is attached
	# to the multimodal launch button via elem_classes.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&display=swap');

	/* Force overriding Gradio's internal container instead of just 'body' */
	.gradio-container {
	background-color: #0b0f19 !important;
	background-image: radial-gradient(at 0% 0%, rgba(17, 24, 39, 1) 0, transparent 50%), radial-gradient(at 100% 0%, rgba(30, 27, 75, 1) 0, transparent 50%), radial-gradient(at 50% 100%, rgba(15, 23, 42, 1) 0, transparent 50%) !important;
	background-attachment: fixed !important;
	color: #f8fafc !important;
	font-family: 'Outfit', sans-serif !important;
	}

	/* Force standard Gradio wrappers to be slightly transparent to see background */
	.wrap, .panel, .gap, .form {
	background-color: rgba(15, 23, 42, 0.4) !important;
	border-color: rgba(255, 255, 255, 0.05) !important;
	}

	.hero-header {
	text-align: center;
	padding: 30px 10px;
	margin-bottom: 30px;
	background: rgba(15, 23, 42, 0.4) !important;
	border-radius: 16px;
	border: 1px solid rgba(255, 255, 255, 0.05) !important;
	backdrop-filter: blur(20px);
	box-shadow: 0 10px 40px rgba(0,0,0,0.5);
	}
	.hero-title {
	font-size: 3rem;
	font-weight: 800;
	text-transform: uppercase;
	letter-spacing: 2px;
	background: linear-gradient(to right, #8b5cf6, #3b82f6, #06b6d4) !important;
	-webkit-background-clip: text !important;
	background-clip: text !important;
	-webkit-text-fill-color: transparent !important;
	margin-bottom: 10px;
	}
	.hero-subtitle {
	font-size: 1.1rem;
	color: #94a3b8 !important;
	font-weight: 300;
	}

	.glass-panel {
	background: linear-gradient(145deg, rgba(30, 41, 59, 0.6) 0%, rgba(15, 23, 42, 0.8) 100%);
	backdrop-filter: blur(24px);
	border: 1px solid rgba(148, 163, 184, 0.1);
	border-radius: 20px;
	padding: 32px;
	box-shadow: 0 15px 35px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.05);
	margin-bottom: 24px;
	transition: transform 0.3s ease, box-shadow 0.3s ease;
	}
	.glass-panel:hover {
	transform: translateY(-5px);
	box-shadow: 0 20px 40px rgba(0,0,0,0.6), inset 0 1px 0 rgba(255,255,255,0.1);
	}

	.status-fake { border-top: 4px solid #f43f5e; box-shadow: 0 10px 40px rgba(244, 63, 94, 0.15); }
	.status-real { border-top: 4px solid #10b981; box-shadow: 0 10px 40px rgba(16, 185, 129, 0.15); }
	.result-title { font-size: 2.2rem; font-weight: 800; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 24px; text-align: center; text-shadow: 0 4px 10px rgba(0,0,0,0.4); }

	.score-container { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; background: rgba(0, 0, 0, 0.3); border-radius: 12px; margin-top: 16px; border: 1px solid rgba(255,255,255,0.03); }
	.score-label { font-size: 1rem; color: #94a3b8; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; }
	.score-value { font-size: 1.5rem; font-weight: 800; color: #f8fafc; }

	@keyframes fillout { from { width: 0; opacity: 0; } to { opacity: 1; } }
	.progress-track { width: 100%; height: 14px; background: rgba(0, 0, 0, 0.5); border-radius: 7px; overflow: hidden; margin-top: 12px; box-shadow: inset 0 2px 4px rgba(0,0,0,0.5); }
	.progress-fill { height: 100%; border-radius: 7px; animation: fillout 1.2s cubic-bezier(0.16, 1, 0.3, 1) forwards; position: relative; }
	.status-fake .progress-fill { background: linear-gradient(90deg, #be123c, #f43f5e); }
	.status-real .progress-fill { background: linear-gradient(90deg, #047857, #10b981); }

	.gradio-container .prose * { padding: 0 !important; }

	.fusion-btn {
	background: linear-gradient(90deg, #8b5cf6, #3b82f6) !important;
	border: none !important;
	box-shadow: 0 0 15px rgba(139, 92, 246, 0.5) !important;
	animation: pulseGlow 2s infinite;
	}
	@keyframes pulseGlow {
	0% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
	50% { box-shadow: 0 0 30px rgba(139, 92, 246, 0.9); }
	100% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
	}
	"""

	def build_ui():
	    """Assemble the Gradio Blocks app and return it.

	    The app has a hero banner, three tabs (audio, video, multimodal) and a
	    footer.  Every tab has the same layout: an input component and an
	    analyse button on the left; an HTML verdict, a hidden raw-JSON textbox
	    and a hidden "Show raw output" button on the right.
	    """

	    def _detection_panel(make_input, run_label, callback, run_button_kwargs=None):
	        # Build one input/result row, then wire the reveal button and the
	        # analyse button (in that order).
	        with gr.Row():
	            with gr.Column(scale=1):
	                media_input = make_input()
	                run_button = gr.Button(
	                    run_label, variant="primary", **(run_button_kwargs or {})
	                )
	            with gr.Column(scale=2):
	                verdict_html = gr.HTML(label="Result")
	                raw_json_box = gr.Textbox(
	                    label="Raw JSON output",
	                    lines=10,
	                    visible=False,
	                    interactive=False,
	                )
	                reveal_button = gr.Button("Show raw output", size="sm", visible=False)

	        reveal_button.click(lambda: gr.update(visible=True), outputs=raw_json_box)
	        run_button.click(
	            callback,
	            inputs=media_input,
	            outputs=[verdict_html, raw_json_box, reveal_button],
	        )

	    # The stylesheet is passed inline as a string (the original author notes
	    # this avoids Windows path resolution bugs in Gradio 4).
	    with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="Multimodal Deepfake Detection") as demo:

	        # Hero banner
	        gr.HTML("""
	<div class="hero-header">
	<div class="hero-title">Multimodal Deepfake Detection</div>
	<div class="hero-subtitle">Industry-grade neural security layer fusing Nes2Net acoustics and GenConViT optics.</div>
	</div>
	""")

	        with gr.Tabs():
	            # Tab 1: audio
	            with gr.TabItem("Audio Detection"):
	                gr.Markdown("### Upload a speech sample to detect AI-synthesised audio.")
	                _detection_panel(
	                    lambda: gr.Audio(
	                        label="Upload Audio",
	                        type="filepath",
	                        sources=["upload", "microphone"],
	                    ),
	                    "Analyse Audio",
	                    predict_audio,
	                )

	            # Tab 2: video
	            with gr.TabItem("Video Detection"):
	                gr.Markdown(
	                    "### Upload a video to detect face manipulation.\n"
	                    "> Note: Full GPU inference requires Colab/Kaggle. "
	                    "The local model may report 'Unavailable'."
	                )
	                _detection_panel(
	                    lambda: gr.Video(label="Upload Video"),
	                    "Analyse Video",
	                    predict_video,
	                )

	            # Tab 3: multimodal
	            with gr.TabItem("Multimodal Fusion"):
	                gr.Markdown(
	                    "### Upload a video with audio to get a fused verdict.\n"
	                    "Both the audio track and video frames will be analysed "
	                    "and combined via weighted-average score fusion."
	                )
	                _detection_panel(
	                    lambda: gr.Video(label="Upload Video (with audio)"),
	                    "Launch Deep Multimodal Fusion 🚀",
	                    predict_multimodal,
	                    run_button_kwargs={"elem_classes": ["fusion-btn"]},
	                )

	        gr.Markdown("""
	---
	Model Architecture:
	Audio — Wav2Vec 2.0 (XLSR-300M) + Nes2Net (ASVspoof 2021 DF checkpoint)
	Video — GenConViT (ED + VAE ensemble, GenConViT weights)
	Fusion — Weighted-average late fusion (α = 0.5)

	B.Tech Project — Multimodal Deepfake Detection
	""")

	    return demo


	# ---------------------------------------------------------------------------
	# Entry point
	# ---------------------------------------------------------------------------
	if __name__ == "__main__":
	    # Launch settings: listen on all interfaces on port 7860, without a
	    # public share link, and surface errors in the UI.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	        # API page disabled; per the original author this avoids the
	        # gradio_client bool-schema crash.
	        "show_api": False,
	    }

	    demo = build_ui()
	    demo.launch(**launch_options)