Spaces:
Running
Running
| import cv2 | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
| import tempfile | |
| import os | |
| from pathlib import Path | |
| import logging | |
| # logger = logging.getLogger(__name__) | |
| # # ImageNet normalization constants | |
| # MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1) | |
| # STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1) | |
def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video inside ``temp_dir`` and return the saved path.

    The client-supplied file name is untrusted: it may contain directory
    components (``../``) or be an absolute path, either of which would make
    ``os.path.join`` resolve outside ``temp_dir``. Only the final path
    component of the name is therefore used.

    Args:
        upload_file: Upload object exposing ``filename`` (the client-supplied
            name) and ``file`` (a readable binary file-like object).
        temp_dir: Directory the video is written into.

    Returns:
        Path of the written file, always directly inside ``temp_dir``.

    Raises:
        ValueError: If the upload has no usable file name.
    """
    import shutil  # local import: only needed by this function

    raw_name = upload_file.filename or ""
    # Normalise Windows separators so basename() also strips them on POSIX.
    safe_name = os.path.basename(raw_name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload filename: {raw_name!r}")

    file_path = os.path.join(temp_dir, safe_name)
    with open(file_path, "wb") as buffer:
        # Stream in chunks instead of loading the whole video into memory.
        shutil.copyfileobj(upload_file.file, buffer)
    return file_path
| # def extract_frames(video_path: str, num_frames: int = 16) -> list: | |
| # """Extract evenly spaced frames from video.""" | |
| # cap = cv2.VideoCapture(video_path) | |
| # total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) | |
| # if total_frames <= 0: | |
| # cap.release() | |
| # return [] | |
| # indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int) | |
| # frames = [] | |
| # for idx in indices: | |
| # cap.set(cv2.CAP_PROP_POS_FRAMES, idx) | |
| # ret, frame = cap.read() | |
| # if ret: | |
| # frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) | |
| # frames.append(frame_rgb) | |
| # cap.release() | |
| # return frames | |
| # utils.py — replace extract_frames + preprocess_frame with these | |
| import cv2 | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
| import os | |
| import logging | |
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# ImageNet per-channel normalization statistics, shaped (3, 1, 1) so they
# broadcast over channel-first image tensors in preprocess_frame.
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Load OpenCV's frontal-face Haar cascade once at import time; the XML file
# ships with opencv-python, so no extra install is needed.
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """Crop a BGR frame to its largest detected face.

    Faces are located with the module-level Haar cascade on a grayscale copy
    of the frame. The largest detection (by box area) is expanded on every
    side by ``margin`` times its width/height and clipped to the frame.

    Args:
        frame_bgr: Frame in BGR channel order.
        margin: Fraction of the face box size added as padding on each side.

    Returns:
        A slice of ``frame_bgr`` around the padded face box, or a centred
        square crop of the frame when no face is detected.
    """
    grayscale = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    detections = _face_cascade.detectMultiScale(
        grayscale, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
    )
    frame_h, frame_w = frame_bgr.shape[:2]

    if len(detections) > 0:
        # Keep only the detection with the largest bounding-box area.
        left, top, box_w, box_h = max(detections, key=lambda box: box[2] * box[3])

        # Padding is proportional to the face size, then clipped to the frame.
        pad_x = int(box_w * margin)
        pad_y = int(box_h * margin)
        col_start = max(0, left - pad_x)
        row_start = max(0, top - pad_y)
        col_stop = min(frame_w, left + box_w + pad_x)
        row_stop = min(frame_h, top + box_h + pad_y)
        return frame_bgr[row_start:row_stop, col_start:col_stop]

    # No face found: fall back to a centred square crop of the frame.
    side = min(frame_h, frame_w)
    row_off = (frame_h - side) // 2
    col_off = (frame_w - side) // 2
    return frame_bgr[row_off:row_off + side, col_off:col_off + side]
def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Extract evenly spaced, face-cropped RGB frames from a video.

    Up to ``num_frames`` frame indices are spread evenly over the frame count
    the container reports. Each frame that can be read is cropped with
    ``_crop_face`` and converted from BGR to RGB.

    Args:
        video_path: Path of the video file to read.
        num_frames: Maximum number of frames to sample.

    Returns:
        List of RGB ``numpy`` arrays. Empty when the video reports no frames;
        shorter than ``num_frames`` when the video is short or some frames
        could not be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return []

        indices = np.linspace(
            0, total_frames - 1, num=min(num_frames, total_frames), dtype=int
        )
        frames = []
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable; skip
                # such frames but leave a trace instead of dropping silently.
                logger.warning(
                    "Could not read frame %d from %s", int(idx), video_path
                )
                continue
            face = _crop_face(frame)
            frames.append(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the capture handle even if cropping/conversion raises.
        cap.release()
def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Turn one image array into a normalized, channel-first float tensor.

    Args:
        frame: Image array in height x width x channel layout, as accepted by
            ``PIL.Image.fromarray``.
        target_size: Side length of the square the frame is resized to.

    Returns:
        Float tensor in channel-first layout, ``(3, target_size,
        target_size)`` for an RGB frame, standardized with the module-level
        ``MEAN`` and ``STD``.
    """
    # Bilinear resize to a square via PIL.
    resized = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR)

    # HWC uint8 pixels -> CHW float values in [0, 1].
    pixels = np.array(resized)
    channel_first = torch.from_numpy(pixels).float().permute(2, 0, 1)
    scaled = channel_first / 255.0

    # Standardize with the ImageNet statistics, then replace any non-finite
    # values with fixed finite ones.
    standardized = (scaled - MEAN) / STD
    return torch.nan_to_num(standardized, nan=0.0, posinf=5.0, neginf=-5.0)
def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Load a video as a stacked tensor of preprocessed frames.

    Frames come from ``extract_frames`` and are each passed through
    ``preprocess_frame``. When fewer than ``num_frames`` frames are
    available, copies of the last frame fill the remainder.

    Args:
        video_path: Path of the video file to read.
        num_frames: Number of frames the result should contain.
        img_size: Side length each frame is resized to.

    Returns:
        Tensor of shape ``(num_frames, 3, img_size, img_size)``.

    Raises:
        ValueError: If no frames could be extracted from the video.
    """
    frames = extract_frames(video_path, num_frames)
    if not frames:
        raise ValueError("Could not extract frames from video")

    batch = [preprocess_frame(frame, img_size) for frame in frames]

    # Short videos: repeat the final frame until the batch is full.
    shortfall = num_frames - len(batch)
    if shortfall > 0:
        filler = batch[-1]
        batch.extend(filler.clone() for _ in range(shortfall))

    return torch.stack(batch)