# Provenance: uploaded to Hugging Face by faisalishfaq2005 (commit 31ad805).
from torchvision import transforms
import torch
from PIL import Image
from model import ImprovedEfficientViT
import os
import cv2
from mtcnn import MTCNN
def extract_faces(video_path, target_frames=20):
    """Sample frames evenly from a video and return cropped face images.

    Opens the video at *video_path*, samples roughly *target_frames* frames
    at a uniform interval, runs MTCNN face detection on each sampled frame,
    and collects every detection with confidence >= 0.9 resized to 224x224.

    Args:
        video_path: Path to a video file readable by OpenCV.
        target_frames: Approximate number of frames to sample.

    Returns:
        List of RGB face crops as 224x224x3 numpy arrays; empty list when
        the video cannot be opened, reports no frames, or no faces pass
        the confidence filter.
    """
    detector = MTCNN()
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        cap.release()  # release the handle on the failure path too
        return []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames <= 0:
        # Some containers/codecs report no frame count; nothing to sample.
        cap.release()
        return []
    frame_interval = max(total_frames // target_frames, 1)
    face_images = []
    try:
        for i in range(0, total_frames, frame_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                continue  # seek/decode failed for this frame; try the next
            # MTCNN expects RGB input; OpenCV decodes frames as BGR.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            for face in detector.detect_faces(rgb_frame):
                if face['confidence'] < 0.9:
                    continue  # skip low-confidence detections
                x, y, w, h = face['box']
                # MTCNN can return slightly negative box origins; clamp to 0.
                x, y = max(x, 0), max(y, 0)
                face_img = rgb_frame[y:y + h, x:x + w]
                if face_img.size == 0:
                    continue  # degenerate crop (zero width or height)
                face_images.append(cv2.resize(face_img, (224, 224)))
    finally:
        # Guarantee the capture is released even if detection raises.
        cap.release()
    return face_images
from torchvision import transforms

# Preprocessing applied to every extracted face before inference:
# numpy array -> PIL image -> 224x224 -> tensor in [0, 1] -> normalized
# to [-1, 1] (mean 0.5 / std 0.5 broadcast across channels).
_face_preprocess_steps = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
transform_vedio = transforms.Compose(_face_preprocess_steps)
def predict_vedio(video_path, model_vedio):
    """Classify a video from per-face binary model predictions.

    Extracts up to ~20 face crops from the video, runs each through
    *model_vedio* (assumed to produce a single logit per input — TODO
    confirm against the model definition), and aggregates the per-face
    decisions: the video is labelled class 0 when more than 3 faces are
    predicted as class 0, otherwise class 1.

    Args:
        video_path: Path to the input video file.
        model_vedio: Torch module returning one logit per face tensor.

    Returns:
        Dict with key "class" mapping to the predicted class (0 or 1).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_vedio.to(device)
    model_vedio.eval()  # disable dropout / batch-norm updates for inference

    faces = extract_faces(video_path, target_frames=20)
    pred_list = []
    prob_list = []
    with torch.no_grad():  # hoisted out of the loop: no gradients needed
        for face in faces:
            tensor = transform_vedio(face).unsqueeze(0).to(device)
            prob = torch.sigmoid(model_vedio(tensor)).item()
            pred_list.append(int(prob > 0.5))
            prob_list.append(prob)  # plain floats, not live GPU tensors

    # NOTE(review): the >3 threshold is hard-coded regardless of how many
    # faces were found; with zero detected faces the result is always
    # class 1. Confirm this is the intended decision rule.
    count = pred_list.count(0)
    predicted_class = 0 if count > 3 else 1
    return {
        "class": predicted_class
    }