import gradio as gr
import torch
from torch import nn
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import cv2
import numpy as np

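# --- Model definition ---
# A binary classifier on top of CLIP's vision tower: the pooled image
# embedding feeds a small MLP head that outputs the probability of "fake".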
class CLIPImageClassifier(nn.Module):
    def __init__(self, clip_model_name="openai/clip-vit-base-patch32"):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(clip_model_name)
        self.classifier = nn.Sequential(
            nn.Linear(self.clip.config.vision_config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, pixel_values):
        vision_outputs = self.clip.vision_model(pixel_values=pixel_values)
        image_features = vision_outputs.pooler_output  # (batch, hidden_size)
        return self.classifier(image_features)  # (batch, 1) fake probability

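# --- Configuration ---
# MODEL_PATH is assumed to be the checkpoint saved during fine-tuning
# (e.g. via torch.save(model.state_dict(), MODEL_PATH)); adjust it if
# your weights live elsewhere.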
DEVICE = "cpu"  # switch to "cuda" if a GPU is available
MODEL_PATH = "best_clip_finetuned_classifier.pth"
CLIP_NAME = "openai/clip-vit-base-patch32"

print("Loading model...")
model = CLIPImageClassifier(CLIP_NAME)
# strict=False tolerates key mismatches in the checkpoint (e.g. an unused
# text tower); it will also silently skip genuinely missing weights, so keep
# this architecture in sync with the one used for training.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE), strict=False)
model.to(DEVICE)
model.eval()

print("Loading processor...")
processor = CLIPProcessor.from_pretrained(CLIP_NAME)

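# --- Inference ---
# Sample roughly one frame per second, score each sampled frame, and
# average the per-frame fake probabilities into a clip-level verdict.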
def predict_video(video_path):
    """
    Gradio passes `video_path` as a string path to a temporary file.
    Returns a verdict string and the average fake probability.
    """
    if video_path is None:
        return "Please upload a video.", 0.0

    print(f"Processing video: {video_path}")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return "Could not open video file.", 0.0

    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0 or np.isnan(fps):
        fps = 30  # fall back to a common default when metadata is missing

    # Sample roughly this many frames per second of video.
    frames_per_second_to_sample = 1
    frame_skip = max(1, int(fps / frames_per_second_to_sample))
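    # Example: a 30 fps clip sampled at one frame per second gives
    # frame_skip == 30, so every 30th decoded frame is scored.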

    predictions = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_skip == 0:
            # OpenCV decodes frames as BGR; CLIP expects RGB input.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)

            # Preprocess (resize, crop, normalize) to CLIP's expected tensor.
            inputs = processor(images=pil_image, return_tensors="pt")["pixel_values"].to(DEVICE)

            with torch.no_grad():
                output = model(inputs)
                prob = output.item()  # scalar fake probability for this frame
            predictions.append(prob)

        frame_count += 1

    cap.release()

    if not predictions:
        return "Could not analyze video frames.", 0.0

    # Average per-frame probabilities into a single clip-level score.
    avg_fake_prob = sum(predictions) / len(predictions)

    label = "FAKE" if avg_fake_prob > 0.5 else "REAL"
    # Report confidence in the chosen label rather than the raw fake score.
    confidence = avg_fake_prob if label == "FAKE" else (1 - avg_fake_prob)

    return f"{label} (Confidence: {confidence:.2%})", avg_fake_prob

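# --- Gradio UI ---
# Video in; verdict text and the raw average fake probability out.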
interface = gr.Interface(
    fn=predict_video,
    inputs=gr.Video(label="Upload Video"),
    outputs=[
        gr.Textbox(label="Verdict"),
        gr.Number(label="Fake Probability Score (0=Real, 1=Fake)")
    ],
    title="DeepFake Video Detector",
    description="Upload a video to check whether it is real or AI-generated. Frames are sampled and scored with a fine-tuned CLIP classifier."
)

if __name__ == "__main__":
    interface.launch()
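    # Tip: launch(share=True) would also expose a temporary public URL,
    # useful for sharing the demo beyond the local machine.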