|
|
import os
import subprocess
import sys

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
from decord import VideoReader, cpu
from torchvision import transforms as T
from torchvision.transforms.v2 import ToDtype
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fine-tuned ViFi-CLIP checkpoint (30 epochs on Kinetics-400); fetched lazily on first run.
model_path = 'vifi_clip_30_epochs_k400_full_finetuned.pth'

if not os.path.exists(model_path):
    print(f"📥 Downloading model to {model_path}...")
    # NOTE(review): installing a package at runtime is a Spaces-style workaround;
    # prefer declaring gdown in requirements.txt.
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gdown"], check=True)
    # Reuse model_path instead of repeating the filename; check=True surfaces
    # a failed download instead of silently continuing with a missing file.
    subprocess.run(["gdown", "--id", "1Nx30Kbu5xnv6dPwz4I3Ivy380LCdp1Md", "-O", model_path], check=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _transform(n_px=224):
    """Build the CLIP-style preprocessing pipeline for uint8 video frames.

    Converts to float32 in [0, 1], resizes the short side to ``n_px``,
    center-crops to ``n_px`` x ``n_px``, and normalizes with the CLIP
    RGB mean/std.
    """
    clip_mean = (0.48145466, 0.4578275, 0.40821073)
    clip_std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        ToDtype(torch.float32, scale=True),
        T.Resize(n_px, antialias=True),
        T.CenterCrop(n_px),
        T.Normalize(clip_mean, clip_std),
    ]
    return T.Compose(steps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ClassificationHead(nn.Module):
    """Single linear projection from pooled video features to logits.

    Defaults to one output unit for binary real/fake detection; the
    caller applies the sigmoid.
    """

    def __init__(self, input_dim=512, num_classes=1):
        super().__init__()
        # One fully-connected layer, no activation here.
        self.dense = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        logits = self.dense(x)
        return logits
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Project-local imports: ViFi-CLIP trainer plus config/logging utilities.
# Imported here (mid-file) so they run only after the checkpoint download above.
from trainers import vificlip

from utils.config import get_config

from utils.logger import create_logger


# ViFi-CLIP zero-shot config — presumably 16 frames / 16x16 patches for K400; see YAML.
cfgpth = 'configs/zero_shot/train/k400/16_16_vifi_clip.yaml'

# Weights for the binary real/fake ClassificationHead defined above.
classifier_path = 'best_detector_model.pt'


# Run on GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
class parse_option:
    """Argparse-namespace stand-in carrying the fields get_config() expects."""

    def __init__(self):
        # Mirror the CLI defaults of the ViFi-CLIP training script;
        # resume from the downloaded checkpoint in test-only mode.
        defaults = {
            "config": cfgpth,
            "output": "exp",
            "resume": model_path,
            "only_test": True,
            "opts": None,
            "batch_size": None,
            "pretrained": None,
            "accumulation_steps": None,
            "local_rank": 0,
        }
        for field, value in defaults.items():
            setattr(self, field, value)
|
|
|
|
|
# Build the ViFi-CLIP backbone from the YAML config and load the fine-tuned weights.
args = parse_option()
config = get_config(args)
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
# class_names feed the text branch; inference below only uses image_encoder,
# so these labels are effectively placeholders.
model = vificlip.returnCLIP(config, logger, class_names=["true", "false"])
model = model.float().to(device)
feature_extractor = model

# Binary real/fake head applied on top of temporally pooled CLIP features.
classifier = ClassificationHead()
# NOTE(review): torch.load without weights_only=True unpickles arbitrary objects —
# acceptable for a trusted local checkpoint, but confirm the file's provenance.
classifier.load_state_dict(torch.load(classifier_path, map_location=device))
classifier.to(device)
classifier.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def predict_video(video_path, threshold=0.5):
    """Classify a video as Real or Fake with ViFi-CLIP features + linear head.

    Samples a 16-frame clip (random contiguous window when the video is long
    enough, otherwise all frames padded by repeating the last one), extracts
    per-frame CLIP image features, mean-pools over time, and applies the
    sigmoid classifier.

    Args:
        video_path: Path to a video file readable by decord.
        threshold: Probability cutoff; prob >= threshold is labeled "Real".

    Returns:
        A human-readable result string, or an error message on failure.
    """
    preprocess = _transform(224)
    try:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(vr)
        num_frames = 16

        # Guard: an empty video would otherwise index frame -1 below.
        if total_frames == 0:
            return "❌ Error: video contains no readable frames"

        if total_frames > num_frames:
            # Random contiguous window of num_frames frames.
            start = np.random.randint(0, total_frames - num_frames)
            indices = list(range(start, start + num_frames))
        else:
            # Too short: take everything and pad by repeating the last frame.
            indices = list(range(total_frames))
            indices += [total_frames - 1] * (num_frames - len(indices))

        frames = vr.get_batch(indices).asnumpy()  # (T, H, W, C) uint8
        video_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2)  # (T, C, H, W)
        video_tensor = preprocess(video_tensor).unsqueeze(0).to(device)  # (1, T, C, H, W)

        # Lowercase names avoid shadowing the `transforms as T` module alias.
        b, t, c, h, w = video_tensor.shape
        input_clip = video_tensor.view(b * t, c, h, w)

        with torch.no_grad():
            # Per-frame CLIP image features, then mean-pool over time.
            features = feature_extractor.image_encoder(input_clip)
            features = features.view(b, t, -1).mean(dim=1)
            logits = classifier(features)
            prob = torch.sigmoid(logits).item()

        label = "Real" if prob >= threshold else "Fake"

        return f"{label} (prob: {prob:.4f}, threshold: {threshold})"
    except Exception as e:
        # Surface the failure in the Gradio UI instead of crashing the worker.
        return f"❌ Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wire the predictor into a simple Gradio UI and serve it.
# (Mojibake in the title and slider label repaired to the intended characters.)
gr.Interface(
    fn=predict_video,
    inputs=[
        gr.Video(type="filepath", label="Upload Video (.mp4)"),
        gr.Slider(0.0, 1.0, value=0.5, step=0.01,
                  label="Threshold (Real ≥ Threshold)"),
    ],
    outputs="text",
    title="🧠 Deepfake Detection with ViFi-CLIP",
    description="Upload a video to classify it as Real or Fake. Threshold slider lets you adjust sensitivity.",
).launch()
|
|
|