| import cv2 |
| import numpy as np |
| import torch |
| from PIL import Image |
| import tempfile |
| import os |
| from pathlib import Path |
| import logging |
|
|
| |
|
|
| |
| |
| |
|
|
|
|
def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video into *temp_dir* and return the saved path.

    Args:
        upload_file: File-upload object exposing ``.filename`` and a
            file-like ``.file`` (e.g. FastAPI's ``UploadFile``).
        temp_dir: Existing directory to write the file into.

    Returns:
        Path of the written file, always located inside *temp_dir*.
    """
    # basename() strips any directory components from the client-supplied
    # filename, preventing path traversal out of temp_dir (e.g. "../../x").
    safe_name = os.path.basename(upload_file.filename)
    file_path = os.path.join(temp_dir, safe_name)
    with open(file_path, "wb") as buffer:
        buffer.write(upload_file.file.read())
    return file_path
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import cv2 |
| import numpy as np |
| import torch |
| from PIL import Image |
| import os |
| import logging |
|
|
# Module-level logger following the standard per-module logging convention.
logger = logging.getLogger(__name__)


# ImageNet channel mean/std, reshaped to (3, 1, 1) so they broadcast over
# CHW image tensors during normalization.
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)


# OpenCV's bundled frontal-face Haar cascade; loaded once at import time and
# shared by all calls to _crop_face.
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
|
|
def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """
    Locate the largest face in a BGR frame and return it cropped.

    Falls back to a centered square crop of the frame when the cascade
    finds no face.
    """
    grayscale = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    detections = _face_cascade.detectMultiScale(
        grayscale, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
    )

    height, width = frame_bgr.shape[:2]

    if len(detections) == 0:
        # No face detected: take the largest square centered in the frame.
        side = min(height, width)
        top = (height - side) // 2
        left = (width - side) // 2
        return frame_bgr[top:top + side, left:left + side]

    # Keep only the detection with the biggest bounding-box area.
    x, y, fw, fh = max(detections, key=lambda box: box[2] * box[3])

    # Grow the box by `margin` of its size on every side, clamped to frame.
    pad_x = int(fw * margin)
    pad_y = int(fh * margin)
    x1 = max(0, x - pad_x)
    y1 = max(0, y - pad_y)
    x2 = min(width, x + fw + pad_x)
    y2 = min(height, y + fh + pad_y)

    return frame_bgr[y1:y2, x1:x2]
|
|
|
|
def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Grab evenly spaced, face-cropped RGB frames from a video file."""
    capture = cv2.VideoCapture(video_path)
    frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

    # Unreadable or empty video: nothing to sample.
    if frame_count <= 0:
        capture.release()
        return []

    # Evenly spaced sample positions; never request more than exist.
    sample_points = np.linspace(
        0, frame_count - 1, num=min(num_frames, frame_count), dtype=int
    )

    collected = []
    for position in sample_points:
        capture.set(cv2.CAP_PROP_POS_FRAMES, position)
        ok, bgr = capture.read()
        if not ok:
            continue
        cropped = _crop_face(bgr)
        collected.append(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))

    capture.release()
    return collected
|
|
def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Resize an RGB frame and convert it to a normalized CHW float tensor."""
    resized = Image.fromarray(frame).resize(
        (target_size, target_size), Image.BILINEAR
    )

    # HWC uint8 -> CHW float scaled to [0, 1].
    chw = torch.from_numpy(np.array(resized)).float().permute(2, 0, 1) / 255.0

    # ImageNet normalization, then replace any non-finite values so a bad
    # frame cannot poison the whole batch.
    normalized = (chw - MEAN) / STD
    return torch.nan_to_num(normalized, nan=0.0, posinf=5.0, neginf=-5.0)
|
|
|
|
def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Convert video to tensor of shape (num_frames, 3, img_size, img_size)."""
    frames = extract_frames(video_path, num_frames)
    if not frames:
        raise ValueError("Could not extract frames from video")

    tensors = [preprocess_frame(frame, img_size) for frame in frames]

    # Short video: pad to the requested length by repeating the last frame.
    shortfall = num_frames - len(tensors)
    if shortfall > 0:
        filler = tensors[-1]
        tensors.extend(filler.clone() for _ in range(shortfall))

    return torch.stack(tensors)