Spaces:
Paused
Paused
Prepare Hugging Face Space deployment
Browse files- hf_space/README.md +16 -0
- hf_space/app.py +150 -0
- hf_space/modules/__init__.py +0 -0
- hf_space/modules/m1_lipsync.py +201 -0
- hf_space/modules/m2_fingerprint.py +120 -0
- hf_space/modules/m3_fallback.py +70 -0
- hf_space/modules/m5_explain.py +87 -0
- hf_space/modules/m5_fusion.py +41 -0
- hf_space/packages.txt +2 -0
- hf_space/requirements.txt +12 -0
- hf_space/utils/__init__.py +0 -0
- lipfd/train.py +237 -0
- tests/test_zero_gpu_contract.py +2 -3
hf_space/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: GenAI-DeepDetect
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: '5.23.0'
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
hardware: zero-gpu
|
| 11 |
+
license: mit
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# GenAI-DeepDetect
|
| 15 |
+
|
| 16 |
+
Multimodal deepfake detection and attribution using SyncNet lip-sync, CLIP fingerprinting, and ViT temporal analysis.
|
hf_space/app.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GenAI-DeepDetect — Gradio Space entry point.
|
| 3 |
+
Hardware: ZeroGPU (A10G, 40GB VRAM)
|
| 4 |
+
M1: SyncNet lip-sync | M2: CLIP fingerprint | M3: ViT temporal | M5: Llama NIM
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import spaces # HuggingFace ZeroGPU
|
| 11 |
+
|
| 12 |
+
from modules.m1_lipsync import LipSyncModule
|
| 13 |
+
from modules.m2_fingerprint import FingerprintModule
|
| 14 |
+
from modules.m3_fallback import M3FallbackModule # swap β m3_sstgnn post L40S
|
| 15 |
+
from modules.m5_fusion import FusionModule
|
| 16 |
+
from modules.m5_explain import ExplainModule
|
| 17 |
+
|
| 18 |
+
# Use /data for the model cache when that directory exists (presumably the
# Space's persistent-storage mount — confirm); otherwise cache next to the app.
CACHE = "/data/model_cache" if os.path.exists("/data") else "./cache"
os.makedirs(CACHE, exist_ok=True)

# All models load on CPU at startup — GPU not allocated yet.
# Each module exposes to_gpu()/to_cpu(), which analyze() calls per request.
print("Loading M1 SyncNet…")
m1 = LipSyncModule(cache_dir=CACHE)
print("Loading M2 Fingerprint…")
m2 = FingerprintModule(cache_dir=CACHE)
print("Loading M3 ViT fallback…")
m3 = M3FallbackModule(cache_dir=CACHE)
# Fusion weights are read from a path relative to the working directory.
m5_fusion = FusionModule(weights_path="weights/fusion_mlp.pt")
m5_explain = ExplainModule()
print("All modules ready. GPU allocated per-request via ZeroGPU.")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@spaces.GPU(duration=120)
def analyze(video_file):
    """Score an uploaded video with M1-M3, fuse the scores, and render Markdown.

    Args:
        video_file: Path of the uploaded video (it is handed straight to the
            module ``score()`` methods), or ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple of Markdown strings: verdict, per-module score table,
        generator attribution, and the natural-language explanation.
    """
    if video_file is None:
        return "⚠️ Please upload a video.", "", "", ""

    start = time.time()

    try:
        # Move the models onto the GPU inside the try block so that a failure
        # part-way through (e.g. out of memory on the second model) still runs
        # the finally clause and returns every model to the CPU.
        m1.to_gpu()
        m2.to_gpu()
        m3.to_gpu()

        r1 = m1.score(video_file)
        r2 = m2.score(video_file)
        r3 = m3.score(video_file)
    finally:
        m1.to_cpu()
        m2.to_cpu()
        m3.to_cpu()

    fusion = m5_fusion.fuse(r1["s1"], r2["s2"], r3["s3"])
    explanation = m5_explain.explain(
        fakescore=fusion["FakeScore"],
        s1=r1["s1"],
        s2=r2["s2"],
        s3=r3["s3"],
        weights=fusion["weights"],
        attribution=r2["attribution"],
        segments=r1.get("segments", []),
        top_generator=r2["top_generator"],
    )

    elapsed = time.time() - start
    verdict = "FAKE" if fusion["FakeScore"] > 0.5 else "REAL"
    icon = "🔴" if verdict == "FAKE" else "🟢"

    verdict_md = f"## {icon} {verdict}\n**FakeScore: {fusion['FakeScore']:.3f}**"

    scores_md = f"""### Per-Module Scores
| Module | Score | Weight |
|--------|-------|--------|
| 🎤 Lip-Sync (SyncNet) | `{r1['s1']:.3f}` | {fusion['weights']['lip_sync']:.2f} |
| 🖼️ Fingerprint (CLIP) | `{r2['s2']:.3f}` | {fusion['weights']['fingerprint']:.2f} |
| 🕸️ Temporal (ViT) | `{r3['s3']:.3f}` | {fusion['weights']['graph_gnn']:.2f} |

**⏱️ Time:** {elapsed:.1f}s | **💻 Hardware:** A10G (ZeroGPU)"""

    attr_md = "### Generator Attribution\n"
    if r2["attribution"]:
        # Show the five most probable generators, each with a 25-cell bar.
        ranked = sorted(r2["attribution"].items(), key=lambda x: -x[1])[:5]
        for gen, prob in ranked:
            filled = int(prob * 25)
            # Distinct glyphs for filled and empty cells so the bar is readable.
            bar = "█" * filled + "░" * (25 - filled)
            attr_md += f"- **{gen}**: {prob * 100:.1f}% `{bar}`\n"
        attr_md += f"\n**Top match:** {r2['top_generator']}"
    else:
        attr_md += "_Classified as real — attribution skipped._"

    # Lip-sync anomaly timestamps
    if r1.get("segments"):
        scores_md += "\n\n**⚠️ Desync segments:**\n"
        for seg in r1["segments"][:5]:
            scores_md += f"- t={seg['time']}s (score={seg['score']:.2f})\n"

    return verdict_md, scores_md, attr_md, explanation
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ── UI ──────────────────────────────────────────────────────────────────────
|
| 100 |
+
|
| 101 |
+
# Gradio layout: upload column on the left, verdict/scores on the right,
# attribution and explanation in a second row underneath.
with gr.Blocks(
    title="GenAI-DeepDetect",
    theme=gr.themes.Base(
        primary_hue="red",
        font=["DM Sans", "ui-sans-serif", "sans-serif"],
    ),
    css="""
    .verdict-box { border-radius: 12px; padding: 16px; }
    footer { display: none !important; }
    """,
) as demo:
    gr.Markdown(
        """# 🔍 GenAI-DeepDetect
### Multimodal Deepfake Detection & Attribution
**Modules:** SyncNet (lip-sync) · CLIP (fingerprint) · ViT (temporal) · Llama-3.1-8B via NVIDIA NIM
**Hardware:** ZeroGPU A10G (40GB) · **Paper:** SRM IST 2026"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            vid = gr.Video(label="Upload Video", height=280)
            btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
            # Offer only the sample clips that are actually present; the
            # previous check looked at real_sample.mp4 alone while also
            # listing fake_sample.mp4, which may be absent.
            sample_paths = [
                path
                for path in (
                    "test_assets/real_sample.mp4",
                    "test_assets/fake_sample.mp4",
                )
                if os.path.exists(path)
            ]
            if sample_paths:
                gr.Examples(
                    examples=[[path] for path in sample_paths],
                    inputs=[vid],
                    label="Try sample videos",
                )

        with gr.Column(scale=2):
            verdict_out = gr.Markdown(label="Verdict", elem_classes=["verdict-box"])
            scores_out = gr.Markdown(label="Module Scores")

    with gr.Row():
        attr_out = gr.Markdown(label="Generator Attribution")
        expl_out = gr.Markdown(label="AI Forensic Explanation")

    # analyze() returns four Markdown strings, in the same order as outputs.
    btn.click(
        fn=analyze,
        inputs=[vid],
        outputs=[verdict_out, scores_out, attr_out, expl_out],
    )

    gr.Markdown(
        "---\n*GenAI-DeepDetect · Akshat Agarwal, Dev Chopda · SRM IST · "
        "[GitHub](https://github.com/akagtag/genai-deepdetect)*"
    )

if __name__ == "__main__":
    demo.launch()
|
hf_space/modules/__init__.py
ADDED
|
File without changes
|
hf_space/modules/m1_lipsync.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
M1 — Lip-Sync detection using Wav2Lip SyncNet discriminator.
|
| 3 |
+
Checkpoint: numz/wav2lip_studio / Wav2lip/lipsync_expert_.pth
|
| 4 |
+
Face input: (B, 15, 24, 48) — 5 frames × 3ch, bottom-quarter lip crop
|
| 5 |
+
Audio input: (B, 1, 80, 16) — mel spectrogram of matching window
|
| 6 |
+
Both embeddings flatten to 4608 dims before cosine similarity.
|
| 7 |
+
High similarity = in sync = REAL. Inverted to fake score.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import cv2
|
| 12 |
+
import librosa
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn as nn
|
| 16 |
+
import torch.nn.functional as F
|
| 17 |
+
from huggingface_hub import hf_hub_download
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ββ architecture βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
class _Conv2d(nn.Module):
|
| 23 |
+
"""Block matching the lipsync_expert_.pth state-dict key structure."""
|
| 24 |
+
def __init__(self, cin: int, cout: int, k: int, s=1, p: int = 0, residual: bool = False):
|
| 25 |
+
super().__init__()
|
| 26 |
+
self.conv_block = nn.Sequential(nn.Conv2d(cin, cout, k, s, p), nn.BatchNorm2d(cout))
|
| 27 |
+
self.act = nn.ReLU(inplace=True)
|
| 28 |
+
self.residual = residual
|
| 29 |
+
|
| 30 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 31 |
+
out = self.conv_block(x)
|
| 32 |
+
if self.residual:
|
| 33 |
+
out = out + x
|
| 34 |
+
return self.act(out)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class SyncNet(nn.Module):
    """
    Wav2Lip SyncNet — colour variant.
    face_encoder: (B,15,24,48) -> (B,4608)
    audio_encoder: (B,1,80,16) -> (B,4608)
    forward returns cosine similarity in [-1,1].

    Both encoders are stacks of ``_Conv2d`` blocks built from the layer
    tables below; each row is (cin, cout, kernel, stride, padding, residual).
    """

    _FACE_LAYERS = (
        (15, 32, 7, 1, 3, False),
        (32, 64, 5, (1, 2), 2, False),
        (64, 64, 3, 1, 1, True),
        (64, 64, 3, 1, 1, False),
        (64, 128, 3, 2, 1, False),
        (128, 128, 3, 1, 1, True),
        (128, 128, 3, 1, 1, False),
        (128, 128, 3, 1, 1, False),
        (128, 256, 3, 2, 1, False),
        (256, 256, 3, 1, 1, True),
        (256, 256, 3, 1, 1, False),
        (256, 512, 3, 2, 1, False),
        (512, 512, 3, 1, 1, True),
        (512, 512, 3, 1, 1, False),
        (512, 512, 3, 1, 1, False),
        (512, 512, 3, 1, 1, False),
        (512, 512, 1, 1, 0, False),
    )

    _AUDIO_LAYERS = (
        (1, 32, 3, 1, 1, False),
        (32, 32, 3, 1, 1, True),
        (32, 32, 3, 1, 1, False),
        (32, 64, 3, (3, 1), 1, False),
        (64, 64, 3, 1, 1, True),
        (64, 64, 3, 1, 1, False),
        (64, 128, 3, 3, 1, False),
        (128, 128, 3, 1, 1, True),
        (128, 128, 3, 1, 1, False),
        (128, 256, 3, (3, 2), 1, False),
        (256, 256, 3, 1, 1, True),
        (256, 256, 3, 1, 1, False),
        (256, 512, 3, 1, 1, False),
        (512, 512, 1, 1, 0, False),
    )

    def __init__(self):
        super().__init__()
        # The face encoder is built before the audio encoder, and the layers
        # stay in table order, so parameter initialisation order and the
        # state-dict indices are the same as a hand-written Sequential.
        self.face_encoder = nn.Sequential(
            *[
                _Conv2d(cin, cout, k, s, p, residual=res)
                for cin, cout, k, s, p, res in self._FACE_LAYERS
            ]
        )
        self.audio_encoder = nn.Sequential(
            *[
                _Conv2d(cin, cout, k, s, p, residual=res)
                for cin, cout, k, s, p, res in self._AUDIO_LAYERS
            ]
        )

    def forward(self, audio: torch.Tensor, face: torch.Tensor) -> torch.Tensor:
        """Return the per-sample cosine similarity of audio and face embeddings."""
        face_vec = self.face_encoder(face).view(face.size(0), -1)
        audio_vec = self.audio_encoder(audio).view(audio.size(0), -1)
        # Unit-normalise both vectors so the dot product is the cosine similarity.
        face_unit = F.normalize(face_vec, dim=-1)
        audio_unit = F.normalize(audio_vec, dim=-1)
        return (face_unit * audio_unit).sum(dim=-1)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ββ module ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
|
| 92 |
+
class LipSyncModule:
    """
    Wrap SyncNet for ZeroGPU inference.
    score() returns {"s1": float [0,1], "segments": list}.

    The model is built and loaded on the CPU; callers move it with
    to_gpu()/to_cpu() around each scoring request.
    """

    def __init__(self, cache_dir: str = "/data/model_cache"):
        """Download the SyncNet checkpoint into ``cache_dir`` and load it on CPU."""
        self.device = "cpu"
        ckpt_path = hf_hub_download(
            repo_id="numz/wav2lip_studio",
            filename="Wav2lip/lipsync_expert_.pth",
            cache_dir=cache_dir,
        )
        self.model = SyncNet()
        # SECURITY NOTE(review): weights_only=False unpickles arbitrary Python
        # objects from a third-party Hub repo. Prefer weights_only=True if the
        # checkpoint loads that way — not changed here because it may contain
        # non-tensor entries.
        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
        # Checkpoints may wrap the weights under "state_dict" or be the dict itself.
        state = ckpt.get("state_dict", ckpt)
        missing, unexpected = self.model.load_state_dict(state, strict=False)
        # strict=False tolerates key mismatches, so report them: a large count
        # means the network is (partly) running on random weights.
        if missing:
            print(f"[M1] SyncNet missing keys: {len(missing)}")
        if unexpected:
            print(f"[M1] SyncNet unexpected keys: {len(unexpected)}")
        self.model.eval()

    def to_gpu(self):
        """Move the model to CUDA and route subsequent input tensors there."""
        self.device = "cuda"
        self.model = self.model.to("cuda")

    def to_cpu(self):
        """Move the model back to the CPU."""
        self.device = "cpu"
        self.model = self.model.to("cpu")

    @torch.no_grad()
    def score(self, video_path: str) -> dict:
        """Return the mean lip-sync fake score and the windows that look desynced.

        Returns:
            ``{"s1": float, "segments": [{"time", "score"}, ...]}``. When no
            usable face/audio windows exist, ``s1`` is the neutral 0.5 and a
            ``"note"`` key is added.
        """
        faces, mels, fps = self._preprocess(video_path)
        if faces is None or len(faces) == 0:
            return {"s1": 0.5, "segments": [], "note": "no_face_or_audio"}

        scores: list[float] = []
        for face_np, mel_np in zip(faces, mels):
            face_t = torch.tensor(face_np, dtype=torch.float32).unsqueeze(0).to(self.device)
            # Mel needs both a batch and a channel axis: (80, 16) -> (1, 1, 80, 16).
            mel_t = torch.tensor(mel_np, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(self.device)
            cos_sim = self.model(mel_t, face_t).item()
            # cosine sim ∈ [-1,1]; high = in sync = real → invert to fake score
            scores.append(float(np.clip((1.0 - cos_sim) / 2.0, 0.0, 1.0)))

        s1 = float(np.mean(scores))
        # NOTE(review): i counts windows over frames where a face was detected,
        # so these timestamps drift from wall-clock time if detection skipped
        # frames. Confirm whether exact timing matters to callers.
        segments = [
            {"time": round(i / fps, 2), "score": round(s, 3)}
            for i, s in enumerate(scores)
            if s > 0.6
        ]
        return {"s1": s1, "segments": segments}

    @staticmethod
    def _make_windows(raw_frames, raw_mels, window: int = 5):
        """Build sliding ``window``-frame face stacks, each paired with a mel.

        Args:
            raw_frames: Lip crops, each shaped (H, W, C).
            raw_mels: One mel array per lip crop, aligned by index.
            window: Number of consecutive crops stacked into one sample.

        Returns:
            ``(faces, mels)`` lists of equal length. Each face entry has shape
            (window * C, H, W); each mel is the one aligned with the first
            crop of its window.
        """
        faces_out: list[np.ndarray] = []
        mels_out: list[np.ndarray] = []
        # "+ 1" keeps the last full window; without it a clip with exactly
        # `window` crops produced no samples at all.
        for i in range(len(raw_frames) - window + 1):
            clip = np.stack(raw_frames[i: i + window], axis=0)  # (T, H, W, C)
            clip = clip.transpose(0, 3, 1, 2)  # (T, C, H, W)
            faces_out.append(clip.reshape(-1, *clip.shape[2:]))  # (T*C, H, W)
            mels_out.append(raw_mels[i])
        return faces_out, mels_out

    def _preprocess(self, video_path: str):
        """Extract aligned lip-crop windows and mel spectrograms from a video.

        Returns:
            ``(faces, mels, fps)``; ``faces``/``mels`` are ``None`` when the
            audio cannot be decoded or fewer than five face frames were found.
        """
        try:
            audio, sr = librosa.load(video_path, sr=16000, mono=True)
        except Exception as exc:
            # Best effort: a clip without decodable audio gets a neutral score
            # upstream, but say why instead of failing silently.
            print(f"[M1] audio load failed ({type(exc).__name__}); skipping lip-sync")
            return None, None, 25.0

        cap = cv2.VideoCapture(video_path)
        try:
            # Fall back to 25 fps when the container reports 0.
            fps = float(cap.get(cv2.CAP_PROP_FPS) or 25.0)
            face_cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
            )

            # Audio samples per video frame.
            hop = max(1, int(sr / fps))
            raw_frames: list[np.ndarray] = []
            raw_mels: list[np.ndarray] = []
            frame_idx = 0

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                dets = face_cascade.detectMultiScale(gray, 1.3, 5, minSize=(30, 30))
                if len(dets) > 0:
                    x, y, w, h = dets[0]
                    # Bottom quarter of face = lip region
                    lip_y = y + int(h * 0.75)
                    lip = frame[lip_y: y + h, x: x + w]
                    if lip.size == 0:
                        lip = frame[y: y + h, x: x + w]
                    # Resize to (24, 48) matching face encoder input
                    lip = cv2.resize(lip, (48, 24)).astype(np.float32) / 255.0
                    raw_frames.append(lip)  # (24, 48, 3)

                    # Audio slice starting at this frame, four frames long,
                    # zero-padded at the end of the track.
                    start = frame_idx * hop
                    chunk = audio[start: start + hop * 4]
                    if len(chunk) < hop * 4:
                        chunk = np.pad(chunk, (0, max(0, hop * 4 - len(chunk))))
                    mel = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=80, hop_length=hop)
                    mel = librosa.power_to_db(mel, ref=np.max).astype(np.float32)
                    mel = cv2.resize(mel, (16, 80))  # (80, 16)
                    raw_mels.append(mel)
                # Advance for every decoded frame so audio offsets stay aligned
                # with the video even when no face was detected.
                frame_idx += 1
        finally:
            # Always free the decoder handle, including when a cv2/librosa call raises.
            cap.release()

        T = 5
        if len(raw_frames) < T:
            return None, None, fps

        faces_out, mels_out = self._make_windows(raw_frames, raw_mels, window=T)
        return faces_out, mels_out, fps
|
hf_space/modules/m2_fingerprint.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
M2 — Style Fingerprinting.
|
| 3 |
+
Binary deepfake detector: yermandy/deepfake-detection (image-classification).
|
| 4 |
+
Generator attribution: CLIP ViT-L/14 zero-shot over 8 generator prompts.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import cv2
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from transformers import (
|
| 13 |
+
AutoModelForImageClassification,
|
| 14 |
+
AutoProcessor,
|
| 15 |
+
CLIPModel,
|
| 16 |
+
CLIPProcessor,
|
| 17 |
+
CLIPTokenizer,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# Candidate generator names for attribution. Each entry is inserted into a
# text prompt and embedded with CLIP; the order defines the index-to-name
# mapping of the attribution probabilities.
GENERATORS = [
    "Sora",
    "Runway Gen-2",
    "Wav2Lip",
    "Stable Diffusion v1.5",
    "SDXL",
    "Midjourney v6",
    "DALL-E 3",
    "Unknown/OOD",
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class FingerprintModule:
    """Frame-level fake scoring plus zero-shot generator attribution.

    A binary image classifier scores sampled frames; when the mean score is
    high enough, CLIP image embeddings are compared with the pre-computed
    text embeddings of the ``GENERATORS`` prompts.
    """

    def __init__(self, cache_dir: str = "/data/model_cache"):
        """Load the classifier and CLIP on the CPU and embed the prompts."""
        self.device = "cpu"

        self.model = AutoModelForImageClassification.from_pretrained(
            "yermandy/deepfake-detection", cache_dir=cache_dir
        )
        self.processor = AutoProcessor.from_pretrained(
            "yermandy/deepfake-detection", cache_dir=cache_dir
        )
        self.model.eval()
        # Resolve the "fake" class from the model's own label map instead of
        # assuming it sits at index 1; fall back to 1 when no label says so.
        id2label = getattr(self.model.config, "id2label", None) or {}
        self._fake_idx = next(
            (int(i) for i, v in id2label.items() if "fake" in str(v).lower()),
            1,  # default: index 1 = fake
        )

        self.clip = CLIPModel.from_pretrained(
            "openai/clip-vit-large-patch14", cache_dir=cache_dir
        )
        self.clip_tok = CLIPTokenizer.from_pretrained(
            "openai/clip-vit-large-patch14", cache_dir=cache_dir
        )
        self.clip_proc = CLIPProcessor.from_pretrained(
            "openai/clip-vit-large-patch14", cache_dir=cache_dir
        )
        self.clip.eval()
        self._precompute_generator_embeddings()

    def _precompute_generator_embeddings(self):
        """Embed one text prompt per generator and L2-normalise the result."""
        prompts = [f"An image generated by {g} AI model" for g in GENERATORS]
        tokens = self.clip_tok(prompts, padding=True, return_tensors="pt")
        with torch.no_grad():
            self.gen_embeds = self.clip.get_text_features(**tokens)
        # The epsilon guards against division by zero for a degenerate embedding.
        self.gen_embeds = self.gen_embeds / (self.gen_embeds.norm(dim=-1, keepdim=True) + 1e-8)

    def to_gpu(self):
        """Move both models and the cached prompt embeddings to CUDA."""
        self.device = "cuda"
        self.model = self.model.to("cuda")
        self.clip = self.clip.to("cuda")
        self.gen_embeds = self.gen_embeds.to("cuda")

    def to_cpu(self):
        """Move both models and the cached prompt embeddings back to the CPU."""
        self.device = "cpu"
        self.model = self.model.to("cpu")
        self.clip = self.clip.to("cpu")
        self.gen_embeds = self.gen_embeds.to("cpu")

    @staticmethod
    def _fake_probability(logits: torch.Tensor, fake_idx: int = 1) -> float:
        """Convert classifier logits of shape (1, C) to a fake probability.

        A single-logit head is squashed with a sigmoid: softmax over one value
        is always 1.0 and would mark every frame as certainly fake.
        """
        if logits.shape[-1] == 1:
            return torch.sigmoid(logits[0, 0]).item()
        return torch.softmax(logits, dim=-1)[0, fake_idx].item()

    @torch.no_grad()
    def score(self, video_path: str) -> dict:
        """Return the mean fake probability and, if warranted, the attribution.

        Returns:
            ``{"s2": float, "attribution": dict, "top_generator": str}``.
            Attribution is only computed when ``s2 > 0.4``; otherwise it is
            empty and ``top_generator`` is ``"Unknown"``.
        """
        frames = self._extract_frames(video_path, n=16)
        if not frames:
            return {"s2": 0.5, "attribution": {}, "top_generator": "Unknown"}

        # Instances created without __init__ keep the historical index 1.
        fake_idx = getattr(self, "_fake_idx", 1)
        fake_scores: list[float] = []
        for frame in frames:
            inputs = self.processor(images=frame, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            logits = self.model(**inputs).logits
            fake_scores.append(self._fake_probability(logits, fake_idx))

        s2 = float(np.mean(fake_scores))
        attribution = self._attribute(frames) if s2 > 0.4 else {}
        top_gen = max(attribution, key=attribution.get) if attribution else "Unknown"
        return {"s2": s2, "attribution": attribution, "top_generator": top_gen}

    def _attribute(self, frames: list[Image.Image]) -> dict:
        """Return a probability per generator from CLIP image/text similarity."""
        img_embeds = []
        # Only the first eight sampled frames are embedded.
        for frame in frames[:8]:
            inputs = self.clip_proc(images=frame, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            emb = self.clip.get_image_features(**inputs)
            emb = emb / (emb.norm(dim=-1, keepdim=True) + 1e-8)
            img_embeds.append(emb)
        avg_emb = torch.cat(img_embeds).mean(dim=0, keepdim=True)
        sims = (avg_emb @ self.gen_embeds.T).squeeze()
        # Scaling by 10 sharpens the softmax over the similarity scores.
        probs = torch.softmax(sims * 10.0, dim=-1)
        return {name: round(probs[i].item(), 4) for i, name in enumerate(GENERATORS)}

    def _extract_frames(self, video_path: str, n: int = 16) -> list[Image.Image]:
        """Sample up to ``n`` evenly spaced frames as RGB PIL images."""
        cap = cv2.VideoCapture(video_path)
        frames: list[Image.Image] = []
        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            indices = np.linspace(0, max(total - 1, 0), n, dtype=int) if total > 0 else []
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if ret:
                    frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        finally:
            # Release the decoder even if seeking or colour conversion raises.
            cap.release()
        return frames
|
hf_space/modules/m3_fallback.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
M3 Fallback — ViT temporal deepfake detector (ACTIVE TONIGHT).
|
| 3 |
+
Model: prithivMLmods/Deep-Fake-Detector-v2-Model (image-classification).
|
| 4 |
+
Samples 32 frames, averages fake probability.
|
| 5 |
+
Swap for m3_sstgnn after L40S training.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import cv2
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from transformers import AutoModelForImageClassification, AutoProcessor
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class M3FallbackModule:
    """Per-frame image classifier used as the M3 score.

    Frames are sampled evenly across the video, and the probability of the
    model's "fake" class is averaged over them.
    """

    def __init__(self, cache_dir: str = "/data/model_cache"):
        """Load the classifier and its processor on the CPU."""
        repo = "prithivMLmods/Deep-Fake-Detector-v2-Model"
        self.device = "cpu"
        self.model = AutoModelForImageClassification.from_pretrained(
            repo, cache_dir=cache_dir
        )
        self.processor = AutoProcessor.from_pretrained(repo, cache_dir=cache_dir)
        self.model.eval()

        # Find the class whose label mentions "fake"; index 1 is the fallback.
        self._fake_idx = 1
        for label_id, label_name in self.model.config.id2label.items():
            if "fake" in str(label_name).lower():
                self._fake_idx = label_id
                break

    def to_gpu(self):
        """Move the classifier to CUDA."""
        self.model = self.model.to("cuda")
        self.device = "cuda"

    def to_cpu(self):
        """Move the classifier back to the CPU."""
        self.model = self.model.to("cpu")
        self.device = "cpu"

    @torch.no_grad()
    def score(self, video_path: str) -> dict:
        """Return ``{"s3": mean fake probability}`` over 32 sampled frames.

        When no frame can be read, the neutral score 0.5 is returned with a
        ``"note"`` entry.
        """
        sampled = self._extract_frames(video_path, n=32)
        if not sampled:
            return {"s3": 0.5, "note": "no_frames"}

        def fake_probability(image) -> float:
            # One forward pass per frame; tensors follow the current device.
            batch = self.processor(images=image, return_tensors="pt")
            batch = {name: tensor.to(self.device) for name, tensor in batch.items()}
            distribution = torch.softmax(self.model(**batch).logits, dim=-1)
            return distribution[0, self._fake_idx].item()

        per_frame = [fake_probability(image) for image in sampled]
        return {"s3": float(np.mean(per_frame))}

    def _extract_frames(self, video_path: str, n: int = 32) -> list[Image.Image]:
        """Read ``n`` evenly spaced frames and return them as RGB PIL images."""
        cap = cv2.VideoCapture(video_path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count > 0:
            positions = np.linspace(0, max(frame_count - 1, 0), n, dtype=int)
        else:
            positions = []

        collected: list[Image.Image] = []
        for position in positions:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(position))
            ok, bgr = cap.read()
            if not ok:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            collected.append(Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)))
        cap.release()
        return collected
|
hf_space/modules/m5_explain.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""M5 Explain β NVIDIA NIM Llama-3.1-8B-Instruct."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ExplainModule:
    """NVIDIA NIM free tier: ~40 req/min.

    Turns the fused detection results into a short natural-language
    explanation via a chat-completion call, with a template fallback.
    """

    def __init__(self):
        """Create the API client from the ``NVIDIA_API_KEY`` environment variable."""
        api_key = os.environ.get("NVIDIA_API_KEY", "")
        # Remember whether a key was configured so explain() can skip a remote
        # call that cannot succeed.
        self._has_api_key = bool(api_key)
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://integrate.api.nvidia.com/v1",
        )
        self.model = "meta/llama-3.1-8b-instruct"

    def explain(
        self,
        fakescore: float,
        s1: float,
        s2: float,
        s3: float,
        weights: dict,
        attribution: dict,
        segments: list,
        top_generator: str,
    ) -> str:
        """Return a short explanation of the detection results.

        Args:
            fakescore: Fused score; above 0.5 is reported as FAKE.
            s1: Lip-sync module score.
            s2: Fingerprint module score.
            s3: Temporal module score.
            weights: Fusion weights keyed ``lip_sync``/``fingerprint``/``graph_gnn``.
            attribution: Generator name to probability mapping (may be empty).
            segments: Flagged windows as dicts with ``time`` and ``score``.
            top_generator: Name of the most probable generator.

        Returns:
            The model's answer, or the template text from ``_fallback`` when
            no API key is configured, the request fails, or the answer is empty.
        """
        verdict = "FAKE" if fakescore > 0.5 else "REAL"
        # Confidence is the distance of the fused score from the 0.5 boundary.
        margin = abs(fakescore - 0.5)
        conf = "high" if margin > 0.3 else "moderate" if margin > 0.15 else "low"

        # Instances created without __init__ keep the old "always try" behaviour.
        if not getattr(self, "_has_api_key", True):
            return self._fallback(verdict, fakescore, s1, s2, s3, top_generator, conf)

        seg_text = ""
        if segments:
            seg_text = "Flagged timestamps: " + ", ".join(
                f"{s['time']}s (score={s['score']})" for s in segments[:5]
            )
        attr_text = ""
        if attribution:
            top3 = sorted(attribution.items(), key=lambda x: -x[1])[:3]
            attr_text = "Top generators: " + ", ".join(
                f"{n}: {p * 100:.1f}%" for n, p in top3
            )

        prompt = f"""You are a forensic AI analyst. Analyze these deepfake detection results. Be specific.

Results:
- Verdict: {verdict} (FakeScore: {fakescore:.3f}, confidence: {conf})
- Lip-Sync (M1): {s1:.3f} (weight: {weights.get('lip_sync', 'N/A')})
- Fingerprint (M2): {s2:.3f} (weight: {weights.get('fingerprint', 'N/A')})
- Temporal-GNN (M3): {s3:.3f} (weight: {weights.get('graph_gnn', 'N/A')})
{seg_text}
{attr_text}
- Most likely generator: {top_generator}

Write 3-5 sentences referencing specific scores and timestamps."""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a forensic deepfake analyst. Be precise and concise."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=300,
                temperature=0.3,
            )
            content = response.choices[0].message.content
            if content and content.strip():
                return content.strip()
            print("[M5] NIM returned an empty explanation; using template fallback")
        except Exception as exc:
            # Best effort: the UI must still get an explanation, but record
            # why the remote call failed instead of hiding it.
            print(f"[M5] NIM explanation failed ({type(exc).__name__}); using template fallback")
        return self._fallback(verdict, fakescore, s1, s2, s3, top_generator, conf)

    def _fallback(self, verdict, fakescore, s1, s2, s3, top_gen, conf) -> str:
        """Build a template explanation that needs no remote call."""
        if verdict == "FAKE":
            return (
                f"Video classified as {verdict} with {conf} confidence (FakeScore: {fakescore:.3f}). "
                f"Lip-sync scored {s1:.2f} indicating "
                f"{'significant' if s1 > 0.7 else 'moderate' if s1 > 0.5 else 'minimal'} audio-visual inconsistency. "
                f"Style fingerprinting scored {s2:.2f}, likely generated by {top_gen}. "
                f"Temporal graph analysis scored {s3:.2f}."
            )
        # NOTE(review): this text asserts every module scored low, but only the
        # fused score is checked here — confirm the wording is acceptable.
        return (
            f"Video classified as {verdict} with {conf} confidence (FakeScore: {fakescore:.3f}). "
            "All detection modules returned scores below detection threshold, "
            "suggesting authentic audio-visual correspondence."
        )
|
hf_space/modules/m5_fusion.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""M5 Fusion β 3-input attention MLP."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FusionMLP(nn.Module):
    """Attention-style fusion of three detector scores.

    A two-layer MLP maps the three input scores to three softmax weights;
    the fused score is the weighted sum of the inputs under those weights.
    Layer names (`fc1`, `fc2`) are part of the saved state-dict layout.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3, 16)
        self.fc2 = nn.Linear(16, 3)

    def forward(self, s: torch.Tensor):
        """Fuse module scores.

        Args:
            s: Scores with the three modules on the last axis, i.e. shape
                ``(3,)`` or ``(..., 3)``.

        Returns:
            ``(fused, alpha)`` where ``alpha`` has the same shape as ``s``
            and sums to 1 over the last axis, and ``fused`` is the weighted
            sum over that axis (a 0-d tensor for a 1-D input).
        """
        h = torch.relu(self.fc1(s))
        alpha = torch.softmax(self.fc2(h), dim=-1)
        # Reduce over the score axis only, so a batch yields one fused score
        # per sample instead of a single sum across the whole batch.
        return (alpha * s).sum(dim=-1), alpha
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FusionModule:
    """Inference wrapper that fuses three module scores with FusionMLP."""

    def __init__(self, weights_path: str = "weights/fusion_mlp.pt"):
        """Build the MLP and load weights from *weights_path* if it exists.

        When the file is absent the network keeps its freshly initialised
        parameters.
        """
        net = FusionMLP()
        if os.path.exists(weights_path):
            state = torch.load(weights_path, map_location="cpu", weights_only=True)
            net.load_state_dict(state)
        net.eval()
        self.model = net

    def fuse(self, s1: float, s2: float, s3: float) -> dict:
        """Return the fused score and the per-module attention weights.

        Args:
            s1: Lip-sync score.
            s2: Fingerprint score.
            s3: Graph score.

        Returns:
            ``{"FakeScore": float, "weights": {"lip_sync": ..., "fingerprint":
            ..., "graph_gnn": ...}}`` with the score rounded to 4 decimals and
            each weight rounded to 3.
        """
        scores = torch.tensor([s1, s2, s3], dtype=torch.float32)
        with torch.no_grad():
            fused, attention = self.model(scores)

        weight_names = ("lip_sync", "fingerprint", "graph_gnn")
        per_module = {
            label: round(attention[pos].item(), 3)
            for pos, label in enumerate(weight_names)
        }
        return {
            "FakeScore": round(float(fused.item()), 4),
            "weights": per_module,
        }
|
hf_space/packages.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
| 2 |
+
libsndfile1-dev
|
hf_space/requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spaces>=0.30.0
|
| 2 |
+
torch>=2.1.0
|
| 3 |
+
torchvision>=0.16.0
|
| 4 |
+
torchaudio>=2.1.0
|
| 5 |
+
transformers>=4.40.0
|
| 6 |
+
opencv-python-headless>=4.8.0
|
| 7 |
+
librosa>=0.10.0
|
| 8 |
+
numpy>=1.24.0
|
| 9 |
+
Pillow>=10.0.0
|
| 10 |
+
openai>=1.0.0
|
| 11 |
+
huggingface-hub>=0.23.0
|
| 12 |
+
soundfile>=0.12.0
|
hf_space/utils/__init__.py
ADDED
|
File without changes
|
lipfd/train.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train.py — Train LipFDNet on the AVLips v1.0 dataset.
|
| 3 |
+
|
| 4 |
+
Extracts a frame + audio sample from each video on-the-fly, trains the tiny
|
| 5 |
+
LipFDNet, saves ckpt.pth, then uploads to akagtag/LipFD-checkpoint.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python lipfd/train.py # full dataset
|
| 9 |
+
python lipfd/train.py --max-per-class 200 # quick smoke-test (CPU ~10 min)
|
| 10 |
+
python lipfd/train.py --epochs 5 # default 5 epochs
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import os
|
| 16 |
+
import random
|
| 17 |
+
import subprocess
|
| 18 |
+
import sys
|
| 19 |
+
import tarfile
|
| 20 |
+
import tempfile
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
import torch
|
| 25 |
+
import torch.nn as nn
|
| 26 |
+
from torch.utils.data import DataLoader, Dataset
|
| 27 |
+
|
| 28 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
| 29 |
+
from lipfd.model import LipFDNet # noqa: E402
|
| 30 |
+
|
| 31 |
+
ARCHIVE = Path(__file__).with_name("AVLips v1.0.tar.xz")
|
| 32 |
+
CKPT_OUT = Path(__file__).with_name("ckpt.pth")
|
| 33 |
+
HF_REPO = "akagtag/LipFD-checkpoint"
|
| 34 |
+
|
| 35 |
+
# ── helpers ──────────────────────────────────────────────────────────────────
|
| 36 |
+
|
| 37 |
+
def _decode_middle_frame(video_path: str) -> np.ndarray:
    """Return frame 15 of *video_path* as (H, W, 3) uint8, or a black 64x64 frame."""
    import math

    blank = np.zeros((64, 64, 3), dtype=np.uint8)
    cmd = [
        "ffmpeg", "-i", video_path,
        "-vf", "select=eq(n\\,15)",  # frame 15 (roughly the middle of short clips)
        "-frames:v", "1",
        "-f", "rawvideo", "-pix_fmt", "rgb24",
        "-loglevel", "error",
        "pipe:1",
    ]
    try:
        raw = subprocess.run(cmd, capture_output=True, timeout=10).stdout
    except (OSError, subprocess.SubprocessError, ValueError):
        # ffmpeg missing, timed out, or could not be launched.
        return blank

    # rawvideo output carries no dimensions, so the frame is assumed square.
    # NOTE(review): non-square clips therefore fall back to the black frame.
    side = math.isqrt(len(raw) // 3)
    if side == 0 or side * side * 3 != len(raw):
        return blank
    return np.frombuffer(raw, dtype=np.uint8).reshape(side, side, 3)


def _decode_audio_rms(video_path: str) -> float:
    """Return the RMS level of the mono 16 kHz audio track, or 0.0 if unavailable."""
    cmd = [
        "ffmpeg", "-i", video_path, "-ac", "1", "-ar", "16000",
        "-f", "f32le", "-loglevel", "error", "pipe:1",
    ]
    try:
        pcm = subprocess.run(cmd, capture_output=True, timeout=10).stdout
    except (OSError, subprocess.SubprocessError, ValueError):
        return 0.0

    # Ignore a trailing partial sample instead of failing the whole decode.
    n_samples = len(pcm) // 4
    if n_samples == 0:
        return 0.0
    samples = np.frombuffer(pcm, dtype=np.float32, count=n_samples)
    return float(np.sqrt(np.mean(samples ** 2) + 1e-9))


def _extract_frame_and_audio(video_bytes: bytes) -> tuple[np.ndarray, float]:
    """Extract middle frame (H,W,3 uint8) + RMS audio level from raw video bytes.

    The bytes are written to a temporary ``.mp4`` file and decoded with two
    ffmpeg invocations. Frame and audio are decoded independently, so a
    failure in one no longer discards the result of the other.

    Args:
        video_bytes: Encoded video container contents.

    Returns:
        ``(frame, rms)``. ``frame`` is a black 64x64x3 array when no usable
        frame could be decoded; ``rms`` is 0.0 when no audio could be decoded.
    """
    if not video_bytes:
        # Nothing to decode; skip the temp file and both ffmpeg calls.
        return np.zeros((64, 64, 3), dtype=np.uint8), 0.0

    tmp = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            tmp = f.name  # record the path first so a failed write is still cleaned up
            f.write(video_bytes)
        frame = _decode_middle_frame(tmp)
        rms = _decode_audio_rms(tmp)
    finally:
        if tmp is not None:
            Path(tmp).unlink(missing_ok=True)
    return frame, rms
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class AVLipsDataset(Dataset):
    """AVLips clips read straight from the ``.tar.xz`` archive.

    Each item is ``(vis, audio, label)``: a 3x32x32 float image tensor of one
    frame, a 1-element tensor holding an RMS audio level, and a float label
    (0 = real, 1 = fake).
    """

    def __init__(self, archive: Path, max_per_class: int | None = None):
        """Index the archive.

        Args:
            archive: Path to the xz-compressed tar archive.
            max_per_class: Optional cap on the number of videos kept per
                class (a falsy value keeps everything).
        """
        self.archive = archive
        self.samples: list[tuple[str, str | None, int]] = []  # (video, wav, label)

        with tarfile.open(archive, "r:xz") as tf:
            names = tf.getnames()

        real_v = [n for n in names if "/0_real/" in n and n.endswith(".mp4")]
        fake_v = [n for n in names if "/1_fake/" in n and n.endswith(".mp4")]

        # Build wav lookup: AVLips/wav/0_real/578.wav
        # Keyed by (class dir, stem): real and fake clips can share a stem,
        # and a stem-only key would pair a clip with the other class's audio.
        wav_by_class: dict[tuple[str, str], str] = {}
        wav_by_stem: dict[str, list[str]] = {}
        for n in names:
            if n.endswith(".wav"):
                p = Path(n)
                wav_by_class[(p.parent.name, p.stem)] = n
                wav_by_stem.setdefault(p.stem, []).append(n)

        def find_wav(video_name: str) -> str | None:
            p = Path(video_name)
            hit = wav_by_class.get((p.parent.name, p.stem))
            if hit is not None:
                return hit
            # Different directory layout: accept a stem match only if unambiguous.
            candidates = wav_by_stem.get(p.stem, [])
            return candidates[0] if len(candidates) == 1 else None

        random.shuffle(real_v)
        random.shuffle(fake_v)
        if max_per_class:
            real_v = real_v[:max_per_class]
            fake_v = fake_v[:max_per_class]

        self.samples.extend((v, find_wav(v), 0) for v in real_v)
        self.samples.extend((v, find_wav(v), 1) for v in fake_v)

        random.shuffle(self.samples)
        print(f"Dataset: {len(real_v)} real, {len(fake_v)} fake")

    def __len__(self) -> int:
        return len(self.samples)

    @staticmethod
    def _wav_rms(wav_data: bytes) -> float:
        """RMS of 16-bit PCM following a 44-byte header; 0.0 if there is no payload.

        The header length and sample width are assumed, not parsed.
        """
        payload = wav_data[44:]
        n_samples = len(payload) // 2  # drop a trailing odd byte
        if n_samples == 0:
            # Avoids a NaN from the mean of an empty array, which would
            # otherwise bypass the ffmpeg fallback below.
            return 0.0
        pcm = np.frombuffer(payload, dtype="<i2", count=n_samples).astype(np.float32) / 32768.0
        return float(np.sqrt(np.mean(pcm ** 2) + 1e-9))

    def __getitem__(self, idx: int):
        name, wav_name, label = self.samples[idx]
        rms = 0.0
        # NOTE: the archive is reopened for every item, as in the original design.
        with tarfile.open(self.archive, "r:xz") as tf:
            fobj = tf.extractfile(name)
            data = fobj.read() if fobj else b""
            # Use bundled WAV if available (better audio than ffmpeg extraction)
            if wav_name:
                try:
                    wobj = tf.extractfile(wav_name)
                    if wobj:
                        rms = self._wav_rms(wobj.read())
                except Exception:
                    # Deliberate best-effort: any WAV problem falls back to
                    # the ffmpeg-derived level below.
                    rms = 0.0

        frame, rms_fallback = _extract_frame_and_audio(data)
        if rms == 0.0:
            rms = rms_fallback

        # Visual: resize to 32x32, normalise
        from PIL import Image  # type: ignore
        import torchvision.transforms as T  # type: ignore

        pil = Image.fromarray(frame).resize((32, 32))
        vis = T.ToTensor()(pil)  # (3, 32, 32)

        audio = torch.tensor([rms], dtype=torch.float32)
        return vis, audio, torch.tensor(label, dtype=torch.float32)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ── training ─────────────────────────────────────────────────────────────────
|
| 153 |
+
|
| 154 |
+
def train(epochs: int = 5, max_per_class: int | None = None, lr: float = 1e-3):
    """Train LipFDNet on the AVLips archive and keep the best checkpoint.

    Holds out 10% of the samples (at least one) for validation, trains with
    Adam + cosine-annealed learning rate and BCE-with-logits loss, and writes
    the state dict to ``CKPT_OUT`` whenever validation accuracy improves.
    The first completed epoch always writes a checkpoint, so one exists even
    if accuracy never rises above zero.

    Args:
        epochs: Number of passes over the training split.
        max_per_class: Optional cap on videos per class, forwarded to the dataset.
        lr: Adam learning rate.

    Returns:
        The best validation accuracy seen (0.0 if no epoch ran).

    Raises:
        ValueError: If the dataset has fewer than two samples, so it cannot
            be split into non-empty train and validation sets.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")

    dataset = AVLipsDataset(ARCHIVE, max_per_class=max_per_class)
    if len(dataset) < 2:
        raise ValueError(
            f"Need at least 2 samples to build train/val splits, got {len(dataset)}"
        )
    n_val = max(1, int(len(dataset) * 0.1))
    train_ds, val_ds = torch.utils.data.random_split(
        dataset, [len(dataset) - n_val, n_val]
    )
    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, num_workers=0)

    model = LipFDNet().to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)

    best_val_acc = 0.0
    saved = False
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        for vis, audio, labels in train_loader:
            vis, audio, labels = vis.to(device), audio.to(device), labels.to(device)
            opt.zero_grad()
            # NOTE(review): assumes the model returns one logit per sample with
            # the same shape as `labels` — confirm against LipFDNet.
            logits = model(vis, audio)
            loss = criterion(logits, labels)
            loss.backward()
            opt.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for vis, audio, labels in val_loader:
                vis, audio, labels = vis.to(device), audio.to(device), labels.to(device)
                preds = (model(vis, audio) > 0).float()  # logit > 0 <=> p > 0.5
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        val_acc = correct / max(total, 1)
        scheduler.step()

        print(f"Epoch {epoch}/{epochs} loss={total_loss/len(train_loader):.4f} val_acc={val_acc:.3f}")

        # `not saved` guarantees a checkpoint from the first epoch even when
        # val_acc is 0.0 and would never beat the initial best.
        if not saved or val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), CKPT_OUT)
            saved = True
            print(f" ✓ Saved checkpoint (val_acc={val_acc:.3f})")

    print(f"\nTraining complete. Best val_acc={best_val_acc:.3f}")
    if saved:
        print(f"Checkpoint saved to: {CKPT_OUT}")
    else:
        print(f"No checkpoint written (epochs={epochs})")
    return best_val_acc
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def upload():
    """Push the local checkpoint file to the Hugging Face model repository."""
    from huggingface_hub import HfApi  # type: ignore

    request = {
        "path_or_fileobj": str(CKPT_OUT),
        "path_in_repo": "ckpt.pth",
        "repo_id": HF_REPO,
        "repo_type": "model",
    }
    HfApi().upload_file(**request)
    print(f"Uploaded ckpt.pth to {HF_REPO}")
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: train, then upload unless ``--no-upload`` is given.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--max-per-class", type=int, default=None,
                        help="Limit videos per class (e.g. 200 for quick test)")
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--no-upload", action="store_true",
                        help="Skip HF upload after training")
    args = parser.parse_args(argv)

    train(epochs=args.epochs, max_per_class=args.max_per_class, lr=args.lr)

    if args.no_upload:
        return
    if CKPT_OUT.exists():
        upload()
    else:
        print("No checkpoint found — skipping upload")


if __name__ == "__main__":
    main()
|
tests/test_zero_gpu_contract.py
CHANGED
|
@@ -31,12 +31,11 @@ def test_readme_declares_zero_gpu_space_metadata():
|
|
| 31 |
assert "app_file: app.py" in readme
|
| 32 |
|
| 33 |
|
| 34 |
-
def
|
| 35 |
source = (ROOT / "app.py").read_text(encoding="utf-8")
|
| 36 |
tree = ast.parse(source)
|
| 37 |
|
| 38 |
-
assert "modules.m3_fallback
|
| 39 |
-
assert "from modules.m3_sstgnn import SSTGNNModule" in source
|
| 40 |
assert "import spaces" in source
|
| 41 |
|
| 42 |
analyze = next(
|
|
|
|
| 31 |
assert "app_file: app.py" in readme
|
| 32 |
|
| 33 |
|
| 34 |
+
def test_app_uses_fallback_sstgnn_and_spaces_gpu_decorator():
|
| 35 |
source = (ROOT / "app.py").read_text(encoding="utf-8")
|
| 36 |
tree = ast.parse(source)
|
| 37 |
|
| 38 |
+
assert "from modules.m3_fallback import SSTGNNModule" in source
|
|
|
|
| 39 |
assert "import spaces" in source
|
| 40 |
|
| 41 |
analyze = next(
|