"""Deepfake-detection demo: setup half.

Downloads the fine-tuned ViFi-CLIP checkpoint (if missing), defines the CLIP
preprocessing pipeline and the binary classifier head, and instantiates the
ViFi-CLIP backbone on the available device.
"""

import os
import subprocess
import sys

import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms as T
from torchvision.transforms.v2 import ToDtype
from decord import VideoReader, cpu
import gradio as gr

# -------------------------
# Step 0: Download model from Google Drive if not exists
# -------------------------
model_path = 'vifi_clip_30_epochs_k400_full_finetuned.pth'
# Single source of truth for the Drive file id (was previously duplicated
# inside a shell string together with a second copy of the filename).
_GDRIVE_FILE_ID = '1Nx30Kbu5xnv6dPwz4I3Ivy380LCdp1Md'

if not os.path.exists(model_path):
    print(f"🔽 Downloading model to {model_path}...")
    # Arg-list subprocess calls (shell=False) instead of os.system shell
    # strings; check=True fails fast here instead of surfacing later as a
    # confusing checkpoint-load error when the download silently failed.
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gdown"],
                   check=True)
    # NOTE(review): `gdown --id ID` is deprecated in recent gdown releases;
    # the positional-ID form below works on both old and new versions.
    subprocess.run(["gdown", _GDRIVE_FILE_ID, "-O", model_path], check=True)

# -------------------------
# Transform
# -------------------------
def _transform(n_px=224):
    """Build the CLIP-style frame preprocessing pipeline.

    Args:
        n_px: Target spatial size; frames are resized then center-cropped
            to ``n_px`` x ``n_px``.

    Returns:
        A torchvision ``Compose`` that converts uint8 frames to float32 in
        [0, 1], resizes/crops, and normalizes with the CLIP mean/std.
    """
    return T.Compose([
        ToDtype(torch.float32, scale=True),
        T.Resize(n_px, antialias=True),
        T.CenterCrop(n_px),
        # CLIP's published normalization constants.
        T.Normalize((0.48145466, 0.4578275, 0.40821073),
                    (0.26862954, 0.26130258, 0.27577711)),
    ])

# -------------------------
# Classifier Head
# -------------------------
class ClassificationHead(nn.Module):
    """Single linear layer mapping pooled video features to logits.

    With the default ``num_classes=1`` the output is a single raw logit
    intended for a sigmoid (binary real/fake decision).
    """

    def __init__(self, input_dim=512, num_classes=1):
        super().__init__()
        self.dense = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return raw (unnormalized) logits for ``x`` of shape (..., input_dim)."""
        return self.dense(x)

# -------------------------
# Load ViFi-CLIP Model
# -------------------------
# Project-local imports kept here, after the download step, to preserve the
# original import-time ordering of this script.
from trainers import vificlip
from utils.config import get_config
from utils.logger import create_logger

cfgpth = 'configs/zero_shot/train/k400/16_16_vifi_clip.yaml'
classifier_path = 'best_detector_model.pt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class parse_option:
    """Minimal stand-in for the argparse namespace expected by ``get_config``.

    Lowercase class name kept for backward compatibility with existing code.
    """

    def __init__(self):
        self.config = cfgpth
        self.output = "exp"
        self.resume = model_path           # resume from the downloaded checkpoint
        self.only_test = True              # inference-only; no training loop
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0


args = parse_option()
config = get_config(args)
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
# class_names are placeholders; classification is done by the separate
# ClassificationHead, not by CLIP's text branch.
model = vificlip.returnCLIP(config, logger, class_names=["true", "false"])
model = model.float().to(device)
feature_extractor = model

classifier = ClassificationHead()
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# from trusted sources (consider weights_only=True on torch >= 2.0).
classifier.load_state_dict(torch.load(classifier_path, map_location=device))
classifier.to(device)
classifier.eval()

# -------------------------
# Inference Function
# -------------------------
def predict_video(video_path, threshold=0.5):
    """Classify a video file as Real or Fake.

    Samples a random contiguous 16-frame clip (padding short videos by
    repeating the last frame), encodes each frame with the ViFi-CLIP image
    encoder, mean-pools over time, and applies the linear detector head.

    Args:
        video_path: Path to a video file readable by decord.
        threshold: Sigmoid-probability cutoff; ``prob >= threshold`` -> Real.

    Returns:
        A human-readable result string, or an error string on failure
        (never raises — errors are surfaced in the Gradio UI instead).
    """
    preprocess = _transform(224)
    try:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(vr)
        if total_frames == 0:
            # Guard: padding below would otherwise index frame -1.
            return "❌ Error: video contains no frames"

        num_frames = 16
        if total_frames > num_frames:
            # +1 because np.random.randint's upper bound is exclusive; without
            # it the window ending on the final frame could never be sampled.
            start = np.random.randint(0, total_frames - num_frames + 1)
            indices = list(range(start, start + num_frames))
        else:
            # Short video: take every frame, pad by repeating the last one.
            indices = list(range(total_frames))
            indices += [total_frames - 1] * (num_frames - len(indices))

        frames = vr.get_batch(indices).asnumpy()                      # (T, H, W, C) uint8
        video_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2)   # (T, C, H, W)
        video_tensor = preprocess(video_tensor).unsqueeze(0).to(device)

        # Lowercase names: the original `B, T, C, H, W` shadowed the
        # torchvision `transforms as T` alias.
        b, t, c, h, w = video_tensor.shape
        input_clip = video_tensor.view(b * t, c, h, w)

        with torch.no_grad():
            features = feature_extractor.image_encoder(input_clip)
            features = features.view(b, t, -1).mean(dim=1)  # temporal mean-pool
            logits = classifier(features)
            prob = torch.sigmoid(logits).item()

        label = "Real" if prob >= threshold else "Fake"
        return f"{label} (prob: {prob:.4f}, threshold: {threshold})"
    except Exception as e:
        # Broad catch is deliberate: report any failure in the UI rather
        # than crashing the Gradio worker.
        return f"❌ Error: {str(e)}"

# -------------------------
# Gradio UI
# -------------------------
gr.Interface(
    fn=predict_video,
    inputs=[
        # gr.Video always passes a filepath to the handler; the former
        # `type="filepath"` kwarg is not a valid gr.Video parameter.
        gr.Video(label="Upload Video (.mp4)"),
        gr.Slider(0.0, 1.0, value=0.5, step=0.01,
                  label="Threshold (Real ≥ Threshold)")
    ],
    outputs="text",
    title="🧠 Deepfake Detection with ViFi-CLIP",
    description="Upload a video to classify it as Real or Fake. Threshold slider lets you adjust sensitivity."
).launch()