import os
import logging

import cv2
import numpy as np
import torch
from PIL import Image

logger = logging.getLogger(__name__)

# ImageNet normalization constants (per-channel mean/std, shaped for broadcasting)
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video to a temporary file and return its path."""
    # basename() guards against path traversal via a crafted filename
    filename = os.path.basename(upload_file.filename)
    file_path = os.path.join(temp_dir, filename)
    with open(file_path, "wb") as buffer:
        buffer.write(upload_file.file.read())
    return file_path
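# Typical usage (a sketch, assuming a FastAPI-style UploadFile exposing
# .filename and .file -- the endpoint wiring itself is not part of this module):
#
#     import tempfile
#
#     with tempfile.TemporaryDirectory() as tmp:
#         path = save_uploaded_video(upload_file, tmp)
#         clip = video_to_tensor(path)  # (num_frames, 3, 224, 224)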
# Load OpenCV's Haar face detector (ships with opencv-python, no extra install)
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
if _face_cascade.empty():
    logger.warning("Failed to load Haar cascade; face cropping will fall back to centre crop")
def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """
    Detect and crop the largest face in a BGR frame.

    Returns the face crop (with a margin around the detection), or a
    centre square crop if no face is found.
    """
    if _face_cascade.empty():
        faces = ()  # detector unavailable; use the centre-crop fallback below
    else:
        gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
        faces = _face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
        )
    if len(faces) == 0:
        # Fall back to a centre square crop (better than the full frame)
        h, w = frame_bgr.shape[:2]
        size = min(h, w)
        y0 = (h - size) // 2
        x0 = (w - size) // 2
        return frame_bgr[y0:y0 + size, x0:x0 + size]
    # Pick the largest detected face
    x, y, fw, fh = max(faces, key=lambda f: f[2] * f[3])
    # Expand the box by `margin` on each side, clamped to the frame bounds
    mx = int(fw * margin)
    my = int(fh * margin)
    H, W = frame_bgr.shape[:2]
    x1 = max(0, x - mx)
    y1 = max(0, y - my)
    x2 = min(W, x + fw + mx)
    y2 = min(H, y + fh + my)
    return frame_bgr[y1:y2, x1:x2]
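# Quick visual sanity check for _crop_face (illustrative only; "sample.jpg"
# is a hypothetical path, not shipped with this repo):
#
#     img = cv2.imread("sample.jpg")
#     cv2.imwrite("sample_face.jpg", _crop_face(img))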
def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Extract evenly spaced frames from a video, face-cropped and in RGB."""
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if not cap.isOpened() or total_frames <= 0:
        cap.release()
        return []
    indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if ret:
            face = _crop_face(frame)  # crop to the face region
            frames.append(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames
def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Preprocess a single RGB frame for model input."""
    # Resize with PIL (bilinear), then convert to a CHW float tensor in [0, 1]
    pil_img = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR)
    tensor = torch.from_numpy(np.array(pil_img)).float().permute(2, 0, 1) / 255.0
    # Normalize with ImageNet statistics
    tensor = (tensor - MEAN) / STD
    # Guard against NaN/Inf from degenerate inputs
    tensor = torch.nan_to_num(tensor, nan=0.0, posinf=5.0, neginf=-5.0)
    return tensor
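def denormalize_frame(tensor: torch.Tensor) -> np.ndarray:
    """Invert preprocess_frame for visual inspection.

    Optional debugging helper (an addition, not part of the original
    pipeline): maps a normalized (3, H, W) tensor back to a uint8 RGB
    array by undoing the ImageNet normalization.
    """
    img = tensor * STD + MEAN                  # undo (x - MEAN) / STD
    img = img.clamp(0.0, 1.0).mul(255).byte()  # back to [0, 255]
    return img.permute(1, 2, 0).cpu().numpy()  # (H, W, 3) uint8 RGB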
def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Convert a video to a tensor of shape (num_frames, 3, img_size, img_size)."""
    frames = extract_frames(video_path, num_frames)
    if not frames:
        raise ValueError(f"Could not extract frames from video: {video_path}")
    tensors = [preprocess_frame(frame, img_size) for frame in frames]
    # Pad by repeating the last frame if the video yielded too few frames
    if len(tensors) < num_frames:
        logger.warning("Padding %d -> %d frames for %s", len(tensors), num_frames, video_path)
        last_tensor = tensors[-1]
        while len(tensors) < num_frames:
            tensors.append(last_tensor.clone())
    return torch.stack(tensors)
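

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch; "sample.mp4" is a
    # placeholder path, not shipped with this repo).
    import sys

    logging.basicConfig(level=logging.INFO)
    video = sys.argv[1] if len(sys.argv) > 1 else "sample.mp4"
    try:
        clip = video_to_tensor(video)
        print(f"clip tensor: {tuple(clip.shape)}, dtype={clip.dtype}")
    except ValueError as exc:
        print(f"failed to process {video!r}: {exc}")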