import cv2
import numpy as np
import torch
from PIL import Image
import tempfile
import os
from pathlib import Path
import logging
# logger = logging.getLogger(__name__)
# # ImageNet normalization constants
# MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
# STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video into *temp_dir* and return the saved path.

    Args:
        upload_file: Upload object exposing ``.filename`` and a file-like
            ``.file`` (e.g. FastAPI's ``UploadFile`` — TODO confirm caller type).
        temp_dir: Existing directory to write the file into.

    Returns:
        Path of the written file inside *temp_dir*.
    """
    # basename() strips any client-supplied directory components so a
    # malicious filename like "../../etc/passwd" cannot escape temp_dir.
    safe_name = os.path.basename(upload_file.filename)
    file_path = os.path.join(temp_dir, safe_name)
    with open(file_path, "wb") as buffer:
        # NOTE: reads the whole upload into memory; fine for short clips.
        buffer.write(upload_file.file.read())
    return file_path
# def extract_frames(video_path: str, num_frames: int = 16) -> list:
# """Extract evenly spaced frames from video."""
# cap = cv2.VideoCapture(video_path)
# total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# if total_frames <= 0:
# cap.release()
# return []
# indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
# frames = []
# for idx in indices:
# cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
# ret, frame = cap.read()
# if ret:
# frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# frames.append(frame_rgb)
# cap.release()
# return frames
# --- Face-cropping versions of extract_frames / preprocess_frame ---
import cv2
import numpy as np
import torch
from PIL import Image
import os
import logging
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# ImageNet channel statistics, shaped (3, 1, 1) so they broadcast over
# (3, H, W) image tensors in preprocess_frame.
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
# Load OpenCV's face detector (ships with opencv-python, no extra install)
# Loaded once at import time and shared by all _crop_face calls.
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """Return a crop of the largest detected face in a BGR frame.

    Runs the module-level Haar cascade on a grayscale copy, picks the
    detection with the largest bounding-box area, and pads it by *margin*
    (fraction of box size) on each side, clamped to the frame bounds.
    When nothing is detected, falls back to the largest centred square
    crop rather than the full frame.
    """
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    detections = _face_cascade.detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
    )
    height, width = frame_bgr.shape[:2]
    if len(detections) == 0:
        # No face found: centred square beats returning the whole frame.
        side = min(height, width)
        top = (height - side) // 2
        left = (width - side) // 2
        return frame_bgr[top:top + side, left:left + side]
    # Keep only the detection with the biggest bounding-box area.
    x, y, box_w, box_h = max(detections, key=lambda box: box[2] * box[3])
    pad_x = int(box_w * margin)
    pad_y = int(box_h * margin)
    x_lo = max(0, x - pad_x)
    y_lo = max(0, y - pad_y)
    x_hi = min(width, x + box_w + pad_x)
    y_hi = min(height, y + box_h + pad_y)
    return frame_bgr[y_lo:y_hi, x_lo:x_hi]
def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Extract up to *num_frames* evenly spaced, face-cropped RGB frames.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Maximum number of frames to sample.

    Returns:
        List of HxWx3 uint8 RGB numpy arrays (face crops); empty list when
        the video cannot be opened or reports no frames. May return fewer
        than *num_frames* entries if some seeks/decodes fail.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return []
        indices = np.linspace(
            0, total_frames - 1, num=min(num_frames, total_frames), dtype=int
        )
        frames = []
        for idx in indices:
            # int() guards against numpy-int incompatibilities in cv2 bindings.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # Skip unreadable frames; video_to_tensor pads short results.
                continue
            face = _crop_face(frame)  # crop to the face region
            frames.append(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the capture even if cropping/conversion raises mid-loop —
        # the original leaked the file handle on any exception.
        cap.release()
def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Resize one RGB frame and normalize it for model input.

    Args:
        frame: HxWx3 uint8 RGB image (as produced by extract_frames).
        target_size: Side length of the square output.

    Returns:
        Float tensor of shape (3, target_size, target_size), scaled to [0, 1]
        and normalized with the module-level ImageNet MEAN/STD.
    """
    # PIL handles the bilinear square resize.
    resized = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR)
    # HWC uint8 -> CHW float in [0, 1].
    chw = torch.from_numpy(np.array(resized)).float().permute(2, 0, 1) / 255.0
    normalized = (chw - MEAN) / STD
    # Guard against NaN/Inf leaking into the model from degenerate inputs.
    return torch.nan_to_num(normalized, nan=0.0, posinf=5.0, neginf=-5.0)
def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Convert a video into a (num_frames, 3, img_size, img_size) tensor.

    Samples evenly spaced face-cropped frames, preprocesses each one, and
    pads short videos by repeating the final frame.

    Raises:
        ValueError: when no frames could be extracted from the video.
    """
    frames = extract_frames(video_path, num_frames)
    if not frames:
        raise ValueError("Could not extract frames from video")
    tensors = [preprocess_frame(frame, img_size) for frame in frames]
    # Short video: repeat (a clone of) the last frame until we have enough.
    while len(tensors) < num_frames:
        tensors.append(tensors[-1].clone())
    return torch.stack(tensors)