import logging
import os

import cv2
import numpy as np
import torch
from PIL import Image

logger = logging.getLogger(__name__)

# ImageNet normalization constants (standard torchvision values)
MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)


def save_uploaded_video(upload_file, temp_dir: str) -> str:
    """Save an uploaded video to a temporary file and return its path."""
    # Use only the basename so a crafted filename cannot escape temp_dir.
    file_path = os.path.join(temp_dir, os.path.basename(upload_file.filename))
    with open(file_path, "wb") as buffer:
        buffer.write(upload_file.file.read())
    return file_path
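
# Example (illustrative): `upload_file` is assumed to be a FastAPI-style
# UploadFile, i.e. any object with a `.filename` string and a file-like
# `.file` attribute.
#
#     import tempfile
#
#     with tempfile.TemporaryDirectory() as tmp:
#         path = save_uploaded_video(upload_file, tmp)
#         clip = video_to_tensor(path)  # -> (16, 3, 224, 224)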


# Load OpenCV's bundled Haar-cascade face detector (ships with opencv-python,
# no extra install). CascadeClassifier does not raise on a bad path, so check
# explicitly that the model actually loaded.
_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
if _face_cascade.empty():
    raise RuntimeError("Failed to load Haar cascade for face detection")

def _crop_face(frame_bgr: np.ndarray, margin: float = 0.3) -> np.ndarray:
    """
    Detect and crop the largest face in a BGR frame.
    Returns the face crop, or the full frame if no face found.
    """
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    faces = _face_cascade.detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(60, 60)
    )

    if len(faces) == 0:
        # Fall back to centre crop (better than full frame)
        h, w = frame_bgr.shape[:2]
        size = min(h, w)
        y0 = (h - size) // 2
        x0 = (w - size) // 2
        return frame_bgr[y0:y0+size, x0:x0+size]

    # Pick the largest detected face
    x, y, fw, fh = max(faces, key=lambda f: f[2] * f[3])

    # Add margin
    mx = int(fw * margin)
    my = int(fh * margin)
    H, W = frame_bgr.shape[:2]
    x1 = max(0, x - mx)
    y1 = max(0, y - my)
    x2 = min(W, x + fw + mx)
    y2 = min(H, y + fh + my)

    return frame_bgr[y1:y2, x1:x2]
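
# Quick check (illustrative; the image paths are hypothetical, not part of
# this module):
#
#     img = cv2.imread("frame.jpg")
#     cv2.imwrite("face.jpg", _crop_face(img, margin=0.3))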


def extract_frames(video_path: str, num_frames: int = 16) -> list:
    """Extract evenly spaced frames from video, with face crop."""
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total_frames <= 0:
        cap.release()
        return []

    indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
    frames = []

    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            face = _crop_face(frame)  # crop to the detected face (or centre fallback)
            frame_rgb = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
            frames.append(frame_rgb)

    cap.release()
    return frames
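
# Note: seeking with CAP_PROP_POS_FRAMES is codec-dependent and can be
# inaccurate for some containers; for long videos, reading sequentially and
# discarding unwanted frames is a slower but more robust alternative.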

def preprocess_frame(frame: np.ndarray, target_size: int = 224) -> torch.Tensor:
    """Preprocess a single frame for model input."""
    # Convert to PIL and resize
    pil_img = Image.fromarray(frame).resize((target_size, target_size), Image.BILINEAR)
    
    # Convert to tensor and normalize to [0, 1]
    tensor = torch.from_numpy(np.array(pil_img)).float().permute(2, 0, 1) / 255.0
    
    # Normalize with ImageNet stats
    tensor = (tensor - MEAN) / STD
    tensor = torch.nan_to_num(tensor, nan=0.0, posinf=5.0, neginf=-5.0)
    
    return tensor
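
# Sanity check (illustrative): an all-black input maps each channel to
# (0 - MEAN) / STD, i.e. roughly (-2.12, -2.04, -1.80).
#
#     t = preprocess_frame(np.zeros((480, 640, 3), dtype=np.uint8))
#     t.shape  # torch.Size([3, 224, 224])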


def video_to_tensor(video_path: str, num_frames: int = 16, img_size: int = 224) -> torch.Tensor:
    """Convert video to tensor of shape (num_frames, 3, img_size, img_size)."""
    frames = extract_frames(video_path, num_frames)
    
    if not frames:
        raise ValueError("Could not extract frames from video")
    
    tensors = []
    for frame in frames:
        tensor = preprocess_frame(frame, img_size)
        tensors.append(tensor)
    
    # Pad by repeating the last frame if the video yielded fewer than num_frames
    if len(tensors) < num_frames:
        last_tensor = tensors[-1]
        while len(tensors) < num_frames:
            tensors.append(last_tensor.clone())
    
    return torch.stack(tensors)
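

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the module's API): write a
    # short synthetic clip of random-noise frames (no face, so the centre-crop
    # fallback is exercised) and check the output tensor shape.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        demo_path = os.path.join(tmp, "demo.avi")
        writer = cv2.VideoWriter(
            demo_path, cv2.VideoWriter_fourcc(*"MJPG"), 10.0, (320, 240)
        )
        for _ in range(30):
            writer.write(np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8))
        writer.release()

        clip = video_to_tensor(demo_path, num_frames=16, img_size=224)
        print(clip.shape)  # expected: torch.Size([16, 3, 224, 224])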