import cv2
import numpy as np
import torch
from PIL import Image
import tempfile
import os
from pathlib import Path
import logging
# logger = logging.getLogger(__name__)
# # ImageNet normalization constants
# MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
# STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video into *temp_dir* and return the saved path.

    Args:
        upload_file: Upload object exposing ``.filename`` and a file-like
            ``.file`` (e.g. FastAPI's ``UploadFile`` — TODO confirm caller type).
        temp_dir: Existing directory to write the file into.

    Returns:
        Path of the written file inside *temp_dir*.
    """
    # basename() strips any client-supplied directory components so a
    # malicious filename like "../../etc/passwd" cannot escape temp_dir.
    safe_name = os.path.basename(upload_file.filename)
    file_path = os.path.join(temp_dir, safe_name)
    with open(file_path, "wb") as buffer:
        # NOTE: reads the whole upload into memory; fine for short clips.
        buffer.write(upload_file.file.read())
    return file_path
# def extract_frames(video_path: str, num_frames: int = 16) -> list:
# """Extract evenly spaced frames from video."""
# cap = cv2.VideoCapture(video_path)
# total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# if total_frames <= 0:
# cap.release()
# return []
# indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
# frames = []
# for idx in indices:
# cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
# ret, frame = cap.read()
# if ret:
# frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# frames.append(frame_rgb)
# cap.release()
# return frames
# --- Face-cropping versions of extract_frames / preprocess_frame ---
import cv2
import numpy as np
import torch
from PIL import Image
import os
import logging
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# ImageNet channel statistics, shaped (3, 1, 1) so they broadcast over
# (3, H, W) image tensors in preprocess_frame.
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
# Load OpenCV's face detector (ships with opencv-python, no extra install)
# Loaded once at import time and shared by all _crop_face calls.
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """Return a crop of the largest detected face in a BGR frame.

    Runs the module-level Haar cascade on a grayscale copy, picks the
    detection with the largest bounding-box area, and pads it by *margin*
    (fraction of box size) on each side, clamped to the frame bounds.
    When nothing is detected, falls back to the largest centred square
    crop rather than the full frame.
    """
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    detections = _face_cascade.detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
    )
    height, width = frame_bgr.shape[:2]
    if len(detections) == 0:
        # No face found: centred square beats returning the whole frame.
        side = min(height, width)
        top = (height - side) // 2
        left = (width - side) // 2
        return frame_bgr[top:top + side, left:left + side]
    # Keep only the detection with the biggest bounding-box area.
    x, y, box_w, box_h = max(detections, key=lambda box: box[2] * box[3])
    pad_x = int(box_w * margin)
    pad_y = int(box_h * margin)
    x_lo = max(0, x - pad_x)
    y_lo = max(0, y - pad_y)
    x_hi = min(width, x + box_w + pad_x)
    y_hi = min(height, y + box_h + pad_y)
    return frame_bgr[y_lo:y_hi, x_lo:x_hi]
def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Extract up to *num_frames* evenly spaced, face-cropped RGB frames.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Maximum number of frames to sample.

    Returns:
        List of HxWx3 uint8 RGB numpy arrays (face crops); empty list when
        the video cannot be opened or reports no frames. May return fewer
        than *num_frames* entries if some seeks/decodes fail.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return []
        indices = np.linspace(
            0, total_frames - 1, num=min(num_frames, total_frames), dtype=int
        )
        frames = []
        for idx in indices:
            # int() guards against numpy-int incompatibilities in cv2 bindings.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # Skip unreadable frames; video_to_tensor pads short results.
                continue
            face = _crop_face(frame)  # crop to the face region
            frames.append(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the capture even if cropping/conversion raises mid-loop —
        # the original leaked the file handle on any exception.
        cap.release()
def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Resize one RGB frame and normalize it for model input.

    Args:
        frame: HxWx3 uint8 RGB image (as produced by extract_frames).
        target_size: Side length of the square output.

    Returns:
        Float tensor of shape (3, target_size, target_size), scaled to [0, 1]
        and normalized with the module-level ImageNet MEAN/STD.
    """
    # PIL handles the bilinear square resize.
    resized = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR)
    # HWC uint8 -> CHW float in [0, 1].
    chw = torch.from_numpy(np.array(resized)).float().permute(2, 0, 1) / 255.0
    normalized = (chw - MEAN) / STD
    # Guard against NaN/Inf leaking into the model from degenerate inputs.
    return torch.nan_to_num(normalized, nan=0.0, posinf=5.0, neginf=-5.0)
def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Convert a video into a (num_frames, 3, img_size, img_size) tensor.

    Samples evenly spaced face-cropped frames, preprocesses each one, and
    pads short videos by repeating the final frame.

    Raises:
        ValueError: when no frames could be extracted from the video.
    """
    frames = extract_frames(video_path, num_frames)
    if not frames:
        raise ValueError("Could not extract frames from video")
    tensors = [preprocess_frame(frame, img_size) for frame in frames]
    # Short video: repeat (a clone of) the last frame until we have enough.
    while len(tensors) < num_frames:
        tensors.append(tensors[-1].clone())
    return torch.stack(tensors)