# utils.py: video upload and preprocessing helpers

import cv2
import numpy as np
import torch
from PIL import Image
import os
import logging

logger = logging.getLogger(__name__)

# ImageNet normalization constants
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Load OpenCV's face detector (ships with opencv-python, no extra install)
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)


def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video to a temporary file and return its path."""
    file_path = os.path.join(temp_dir, upload_file.filename)
    with open(file_path, "wb") as buffer:
        buffer.write(upload_file.file.read())
    return file_path


def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """Detect and crop the largest face in a BGR frame.

    Returns the face crop, or a centre crop of the frame if no face is found.
""" gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) faces = _face_cascade.detectMultiScale( gray, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60) ) if len(faces) == 0: # Fall back to centre crop (better than full frame) h, w = frame_bgr.shape[:2] size = min(h, w) y0 = (h - size) // 2 x0 = (w - size) // 2 return frame_bgr[y0:y0+size, x0:x0+size] # Pick the largest detected face x, y, fw, fh = max(faces, key=lambda f: f[2] * f[3]) # Add margin mx = int(fw * margin) my = int(fh * margin) H, W = frame_bgr.shape[:2] x1 = max(0, x - mx) y1 = max(0, y - my) x2 = min(W, x + fw + mx) y2 = min(H, y + fh + my) return frame_bgr[y1:y2, x1:x2] def extract_frames(video_path: str, num_frames: int = 16) -> list: """Extract evenly spaced frames from video, with face crop.""" cap = cv2.VideoCapture(video_path) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if total_frames <= 0: cap.release() return [] indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int) frames = [] for idx in indices: cap.set(cv2.CAP_PROP_POS_FRAMES, idx) ret, frame = cap.read() if ret: face = _crop_face(frame) # <-- crop face frame_rgb = cv2.cvtColor(face, cv2.COLOR_BGR2RGB) frames.append(frame_rgb) cap.release() return frames def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor: """Preprocess a single frame for model input.""" # Convert to PIL and resize pil_img = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR) # Convert to tensor and normalize to [0, 1] tensor = torch.from_numpy(np.array(pil_img)).float().permute(2, 0, 1) / 255.0 # Normalize with ImageNet stats tensor = (tensor - MEAN) / STD tensor = torch.nan_to_num(tensor, nan=0.0, posinf=5.0, neginf=-5.0) return tensor def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor: """Convert video to tensor of shape (num_frames, 3, img_size, img_size).""" frames = extract_frames(video_path, num_frames) if not frames: raise ValueError("Could not extract frames from video") tensors = [] for frame in frames: tensor = preprocess_frame(frame, img_size) tensors.append(tensor) # Pad if needed if len(tensors) < num_frames: last_tensor = tensors[-1] while len(tensors) < num_frames: tensors.append(last_tensor.clone()) return torch.stack(tensors)