File size: 5,534 Bytes
3cf4fff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Copyright (c) Meta Platforms, Inc. and affiliates.

import os
from logging import getLogger
from typing import Callable, List, Tuple

import torch
from PIL import Image
from torchcodec.decoders import VideoDecoder
from torchvision.utils import draw_bounding_boxes

from core.transforms.image_transform import ImageTransform

logger = getLogger()


def get_video_transform(
    image_res: int = 224,
    normalize_img: bool = True,
) -> "VideoTransform":
    """Build the transform used to decode and preprocess video clips.

    Args:
        image_res: Forwarded to ``VideoTransform`` as its ``size`` argument.
        normalize_img: Forwarded unchanged to ``VideoTransform``.

    Returns:
        A single ``VideoTransform`` instance. Note that this is one callable
        object, not a ``(callable, int)`` tuple.
    """
    return VideoTransform(
        size=image_res,
        normalize_img=normalize_img,
    )


class VideoTransform(ImageTransform):
    """``ImageTransform`` extended with video decoding and frame sampling.

    Calling an instance decodes a clip with torchcodec, optionally draws
    bounding boxes on the sampled frames, and passes the frame tensor through
    the parent's ``_transform_torch_tensor``.
    """

    def __init__(
        self,
        size: int = 224,
        normalize_img: bool = True,
    ) -> None:
        """Forward ``size`` and ``normalize_img`` to ``ImageTransform``."""
        super().__init__(
            size=size,
            normalize_img=normalize_img,
        )

    def __call__(self, video_info: tuple, sampling_fps: int = 1):
        """Decode, annotate and transform one video clip.

        Args:
            video_info: 5-tuple ``(video_path, max_frames, s, e, bbox_map)``.
                ``s`` / ``e`` are the clip start / end in seconds (or None).
                ``bbox_map`` is falsy, or a mapping from ``str(frame_index)``
                to a bounding box (or None) for that frame.
            sampling_fps: Target sampling rate passed to ``load_video``.

        Returns:
            Whatever the parent's ``_transform_torch_tensor`` returns for the
            sampled (and possibly annotated) frames.

        Raises:
            FileNotFoundError: If ``video_path`` does not exist. (Previously
                this surfaced as an opaque unpacking ``TypeError``.)
        """
        video_path, max_frames, s, e, bbox_map = video_info

        loaded = self.load_video(
            video_path,
            max_frames=max_frames,
            sampling_fps=sampling_fps,
            s=s,
            e=e,
        )
        if loaded is None:
            # load_video signals a missing file with None; fail loudly here
            # instead of crashing on tuple unpacking.
            raise FileNotFoundError(f"Video file not found: {video_path}")
        frames, sample_pos = loaded

        if bbox_map:
            # Re-key boxes from absolute frame index (as a string) to the
            # position of that frame inside the sampled batch.
            bbox_dict_map = {}
            for idx_pos, pos in enumerate(sample_pos):
                key = str(pos)
                if key in bbox_map and bbox_map[key] is not None:
                    bbox_dict_map[idx_pos] = bbox_map[key]
            if bbox_dict_map:
                frames = self.draw_bounding_boxes(frames, bbox_dict_map)

        return super()._transform_torch_tensor(frames)

    def _process_multiple_images(self, image_paths: List[str]):
        """Open each path as an RGB image and transform them as one batch.

        Args:
            image_paths: Paths of the images to load.

        Returns:
            ``(tensor, (w, h))`` as produced by
            ``_process_multiple_images_pil``.
        """
        images = []
        for path in image_paths:
            # convert() returns an independent image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(path) as img:
                images.append(img.convert("RGB"))
        return self._process_multiple_images_pil(images)

    def _process_multiple_images_pil(self, images: List[Image.Image]):
        """Apply the parent image transform to each image and concatenate.

        Args:
            images: Already-opened PIL images.

        Returns:
            ``(tensor, (w, h))`` where ``tensor`` is the per-image outputs
            concatenated along dim 0 and ``(w, h)`` is the size pair returned
            by the parent transform for the last image.
        """
        processed_images = []
        for image in images:
            image, (w, h) = super().__call__(image)
            processed_images.append(image)
        processed_images = torch.cat(processed_images, dim=0)
        return processed_images, (w, h)

    def load_video(self, video_path, max_frames=16, sampling_fps=1, s=None, e=None):
        """
        Decode up to ``max_frames`` frames from a video using torchcodec.

        Args:
            video_path (str): The path to the video file.
            max_frames (int, optional): The maximum number of frames to extract. Defaults to 16.
            sampling_fps (int, optional): The sampling frame rate. Defaults to 1.
            s (float, optional): The start time of the clip in seconds. Defaults to None (first frame).
            e (float, optional): The end time of the clip in seconds. Defaults to None (last frame).

        Returns:
            tuple | None: ``(frames, sample_pos)`` where ``frames`` is the
            ``.data`` of the batch returned by ``decoder.get_frames_at`` and
            ``sample_pos`` is the list of absolute frame indices that were
            decoded. Returns ``None`` when ``video_path`` does not exist.
        """

        if not os.path.exists(video_path):
            return None

        decoder = VideoDecoder(video_path, device="cpu")
        decoder_metadata = decoder.metadata
        fps = decoder_metadata.average_fps
        total_frames = decoder_metadata.num_frames
        last_frame = total_frames - 1

        start_frame = 0 if s is None else int(s * fps)
        end_frame = last_frame if e is None else int(e * fps)

        # Tolerate reversed (start, end) pairs.
        if start_frame > end_frame:
            start_frame, end_frame = end_frame, start_frame

        # Keep both bounds inside the video so the decoder is never asked for
        # an index past the last frame (or a negative one).
        start_frame = min(max(start_frame, 0), last_frame)
        end_frame = min(max(end_frame, 0), last_frame)

        if start_frame == end_frame:
            # Widen a zero-length clip by one frame when the video allows it.
            end_frame = min(start_frame + 1, last_frame)

        sample_fps = int(sampling_fps)
        # A sampling rate well above the native fps would round the stride to
        # zero, which range() rejects; fall back to every frame in that case.
        t_stride = max(1, int(round(float(fps) / sample_fps)))

        all_pos = list(range(start_frame, end_frame + 1, t_stride))
        if len(all_pos) > max_frames:
            # Too many strided candidates: thin them out uniformly.
            sample_idxs = self.uniform_sample(len(all_pos), max_frames)
            sample_pos = [all_pos[i] for i in sample_idxs]
        elif len(all_pos) < max_frames:
            # Too few strided candidates: sample uniformly over every frame of
            # the clip instead, capped at the clip length.
            total_clip_frames = end_frame - start_frame + 1
            if total_clip_frames < max_frames:
                max_frames = total_clip_frames
            sample_idxs = self.uniform_sample(total_clip_frames, max_frames)
            sample_pos = [start_frame + idx for idx in sample_idxs]
        else:
            sample_pos = all_pos

        all_frames = decoder.get_frames_at(indices=sample_pos)
        all_frames = all_frames.data

        return all_frames, sample_pos

    def uniform_sample(self, m, n):
        """Pick ``n`` evenly spaced indices from ``range(m)``.

        The first index is always 0 and, for ``n > 1``, the last is ``m - 1``.
        For ``n <= 1`` the stride is 0, so the result is ``[0]`` or ``[]``.

        Args:
            m: Size of the index range to sample from.
            n: Number of indices to return; must satisfy ``n <= m``.

        Returns:
            list[int]: The ``n`` sampled indices.
        """
        assert n <= m, f"cannot sample {n} indices from a range of {m}"
        stride = (m - 1) / (n - 1) if n > 1 else 0  # Calculate the stride
        return [int(round(i * stride)) for i in range(n)]

    def draw_bounding_boxes(self, frames, all_bboxes):
        """Return a copy of ``frames`` with one red box drawn per listed frame.

        Args:
            frames: 4-D ``torch.Tensor``; the first dimension indexes frames.
                Presumably (N, C, H, W), as the original comment states.
            all_bboxes: Mapping from index along the first dimension to a
                bounding box. Each box is wrapped in a list before being
                turned into a tensor, so it is presumably a flat sequence of
                four coordinates; verify against the caller.

        Returns:
            A cloned tensor; the input ``frames`` is left unmodified.
        """
        # Assuming frames is a torch.Tensor with shape (N, C, H, W)
        N, _, _, _ = frames.shape
        # Clone the tensor to avoid modifying the original
        frames_with_bbox = frames.clone()
        for i in range(N):
            if i not in all_bboxes:
                continue
            # Convert bbox to a tensor and add a batch dimension
            bbox_tensor = torch.tensor([all_bboxes[i]], dtype=torch.float32)
            # The bare name resolves to the module-level torchvision function,
            # not to this method.
            frames_with_bbox[i] = draw_bounding_boxes(
                frames_with_bbox[i],
                boxes=bbox_tensor,
                colors=(255, 0, 0),  # Red color for the bounding box
                width=4,
            )
        return frames_with_bbox