Commit 31ad805 · Parent(s): 8fbfa4a

uploading data on huggingface

Files changed:
- README.md (+115 −1)
- config.json (+14 −0)
- inference.py (+91 −0)
- model.pth (+3 −0)
- model.py (+0 −0)
- requirements.txt (+0 −0)
README.md
CHANGED
@@ -1,3 +1,117 @@
# Deepfake Detection with Improved EfficientViT

## Model Architecture

![Model Architecture](model_architecture.png)

## Inference Pipeline

![Model Architecture](model_architecture.png)

This repository contains a **PyTorch model for deepfake detection** based on an improved **EfficientViT** architecture, trained on video data.

The model predicts whether a video is **real (0)** or **fake (1)** using both visual information and temporal cues.

---

## 🧩 Model Description

**Architecture:** Improved EfficientViT
**Backbone:** EfficientNet-B0 for feature extraction
**Head:** Transformer-based temporal modeling with a classification head
**Input:** Video frames (224×224 RGB images)
**Output:** Binary label (0 = Real, 1 = Fake) and frame-level probabilities

**Key Features:**

- Extracts faces from frames using MTCNN
- Supports inference on raw video files
- Provides frame-level probabilities for fine-grained analysis
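
Since model.py is added empty in this commit (+0 −0), here is a minimal, hypothetical sketch of how an EfficientNet-B0 backbone can feed ViT-style attention blocks and a single-logit head. The class name `SketchEfficientViT`, the embedding size, and the token layout are illustrative assumptions, not the repository's actual `ImprovedEfficientViT`:

```python
# Illustrative sketch only -- NOT the real ImprovedEfficientViT from model.py.
import torch
import torch.nn as nn
from torchvision.models import efficientnet_b0

class SketchEfficientViT(nn.Module):
    """Hypothetical stand-in: CNN features -> transformer blocks -> one logit."""
    def __init__(self, embed_dim=256, num_heads=4, num_layers=2):
        super().__init__()
        backbone = efficientnet_b0(weights=None)
        self.features = backbone.features       # EfficientNet-B0 feature extractor
        self.proj = nn.Linear(1280, embed_dim)  # B0's final feature map has 1280 channels
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.head = nn.Linear(embed_dim, 1)     # single logit: sigmoid > 0.5 = fake

    def forward(self, x):                         # x: (B, 3, 224, 224)
        f = self.features(x)                      # (B, 1280, 7, 7)
        tokens = f.flatten(2).transpose(1, 2)     # (B, 49, 1280) spatial tokens
        tokens = self.encoder(self.proj(tokens))  # multi-head self-attention
        return self.head(tokens.mean(dim=1))      # (B, 1) logit
```

A single-logit output is what inference.py expects, since it applies `torch.sigmoid` and thresholds at 0.5.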

---

## 📁 Repository Structure

```
deepfake-efficientvit/
│
├── model.py            # ImprovedEfficientViT class
├── inference.py        # Functions to run inference on videos
├── model.pth           # Trained weights
├── config.json         # Optional model metadata
├── requirements.txt    # Required packages
└── README.md
```

## ⚡ Installation

git clone https://huggingface.co/faisalishfaq2005/deepfake-detection-efficientnet-vit

cd deepfake-detection-efficientnet-vit

pip install -r requirements.txt

## 🚀 Usage

### 1. Programmatic Inference

```python
from huggingface_hub import hf_hub_download
import torch
from model import ImprovedEfficientViT
from inference import predict_video  # inference helper defined in inference.py

# 1️⃣ Download the checkpoint from Hugging Face
checkpoint_path = hf_hub_download(
    repo_id="faisalishfaq2005/deepfake-detection-efficientnet-vit",
    filename="model.pth"
)

# 2️⃣ Load the model
model = ImprovedEfficientViT()
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
model.eval()

# 3️⃣ Run inference on a video
video_path = "sample_video.mp4"
result = predict_video(video_path, model)
print(result)
# Example output: {'class': 1}
```

### 2. Manual Download

Go to the Hugging Face model page and download:

- model.pth
- model.py
- inference.py

Place them in the same folder locally, install the requirements, and run `predict_video()`.

## 📄 License

This model is released under the MIT License. You are free to use, modify, and distribute it, with attribution.

## 📚 Citation

If you use this model in your research, please cite:

```bibtex
@inproceedings{faisalishfaq2025efficientvit,
  title={Deepfake Detection with EfficientNet and ViT},
  author={Faisal Ishfaq},
  year={2025}
}
```
config.json
ADDED
@@ -0,0 +1,14 @@
{
  "model_type": "efficientnetb0_Vit_blocks_multi_head_attention",
  "framework": "pytorch",
  "architecture": {
    "input": {
      "video_frames": "20 frames per video",
      "image_size": [224, 224]
    },
    "output_classes": ["real", "fake"]
  },
  "pretrained": true,
  "model_file": "model.pth"
}
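
This metadata can drive preprocessing instead of hard-coding sizes. A minimal sketch, assuming config.json sits in the working directory (keys exactly as in the file above):

```python
import json

# Read the model metadata shipped with this commit.
with open("config.json") as f:
    cfg = json.load(f)

height, width = cfg["architecture"]["input"]["image_size"]  # [224, 224]
classes = cfg["architecture"]["output_classes"]             # ["real", "fake"]
print(f"resize to {height}x{width}, classes: {classes}")
```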
inference.py
ADDED
@@ -0,0 +1,91 @@
import cv2
import torch
from mtcnn import MTCNN
from torchvision import transforms


def extract_faces(video_path, target_frames=20):
    """Sample ~target_frames frames from the video and return cropped faces."""
    detector = MTCNN()

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = max(total_frames // target_frames, 1)

    face_images = []

    for i in range(0, total_frames, frame_interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            continue

        # MTCNN expects RGB; OpenCV decodes to BGR
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        faces = detector.detect_faces(rgb_frame)

        for face in faces:
            if face['confidence'] < 0.9:  # skip low-confidence detections
                continue
            x, y, w, h = face['box']
            x, y = max(x, 0), max(y, 0)   # clamp boxes that start off-frame
            face_img = rgb_frame[y:y+h, x:x+w]

            if face_img.size == 0:
                continue

            face_img = cv2.resize(face_img, (224, 224))
            face_images.append(face_img)

    cap.release()
    return face_images


transform_video = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])


def predict_video(video_path, model_video):
    """Classify a video as real (0) or fake (1) by voting over per-face predictions."""
    pred_list = []
    prob_list = []

    faces = extract_faces(video_path, target_frames=20)

    transformed_faces = [transform_video(face) for face in faces]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_video.to(device)

    for face in transformed_faces:
        face = face.to(device).unsqueeze(0)  # add batch dimension

        with torch.no_grad():
            logit = model_video(face)
            prob = torch.sigmoid(logit)
            pred = int(prob.item() > 0.5)
            pred_list.append(pred)
            prob_list.append(prob.item())  # frame-level probabilities (not returned)

    # Voting rule: the video counts as real only if more than 3 sampled faces
    # were predicted real; otherwise it is fake. Note that a video with no
    # detected faces therefore defaults to fake.
    count = sum(1 for ele in pred_list if ele == 0)

    predicted_class = 0 if count > 3 else 1
    return {
        "class": predicted_class
    }
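
For completeness, a small hypothetical driver built only on the function above; the folder name `videos/` and the local `model.pth` path are assumptions for illustration:

```python
# Hypothetical sketch: score every .mp4 in a folder with predict_video.
import os

import torch

from inference import predict_video
from model import ImprovedEfficientViT

model = ImprovedEfficientViT()
model.load_state_dict(torch.load("model.pth", map_location="cpu"))
model.eval()

video_dir = "videos"  # assumed folder of .mp4 clips
for name in sorted(os.listdir(video_dir)):
    if name.endswith(".mp4"):
        result = predict_video(os.path.join(video_dir, name), model)
        label = "fake" if result["class"] == 1 else "real"
        print(f"{name}: {label}")
```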
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:734e07e648d846edff79edc9a25fb35ae3d885b732a12032698ef70948a47904
size 66414828
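
(The three lines above are a Git LFS pointer: the ~66 MB checkpoint itself lives in LFS storage, so a clone without git-lfs installed fetches only this stub. The `hf_hub_download` call in the README retrieves the real weights.)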
model.py
ADDED
File without changes (added as an empty file)
requirements.txt
ADDED
File without changes (added as an empty file)
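
Since requirements.txt ships empty here, a plausible minimal set, inferred from the imports in inference.py and the README example (versions unpinned; depending on its version, the mtcnn package may additionally pull in TensorFlow):

```
torch
torchvision
opencv-python
mtcnn
huggingface_hub
```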