|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod |
|
|
import io |
|
|
import math |
|
|
import numpy as np |
|
|
from typing import Optional, TypeVar, Union |
|
|
import collections |
|
|
|
|
|
# decord is an optional dependency: remember whether it could be imported
# instead of failing here, so the rest of the module stays importable.
try:
    import decord
except ImportError:
    _HAS_DECORD = False
else:
    _HAS_DECORD = True

if _HAS_DECORD:
    # Use decord's 'native' bridge; EncodedVideoDecord.get_clip converts the
    # decoded batch with `.asnumpy()`.
    decord.bridge.set_bridge('native')

DecordDevice = TypeVar("DecordDevice")


if _HAS_DECORD:

    class VideoReaderWrapper(decord.VideoReader):
        """
        ``decord.VideoReader`` that seeks back to frame 0 after it is
        constructed and after every indexed access.

        NOTE(review): the rewind is presumably a workaround for decord keeping
        decoder state alive between reads -- confirm the original intent
        before removing it.
        """

        def __init__(self, *args, **kwargs):
            """Open the reader (arguments are forwarded unchanged) and rewind."""
            super().__init__(*args, **kwargs)
            self.seek(0)

        def __getitem__(self, key):
            """Return ``super().__getitem__(key)``, then rewind to frame 0."""
            frames = super().__getitem__(key)
            self.seek(0)
            return frames

else:

    class VideoReaderWrapper:
        """
        Placeholder defined when decord is not installed.

        Subclassing ``decord.VideoReader`` unconditionally would raise a
        NameError while this module is being imported; this stand-in keeps
        the name available and reports the missing dependency on use.
        """

        def __init__(self, *args, **kwargs):
            """
            Raises:
                ImportError: always, because decord is not installed.
            """
            raise ImportError(
                "decord is required to use VideoReaderWrapper. Please "
                "install with 'pip install decord'."
            )
|
|
|
|
|
|
|
|
class Video(ABC):
    """
    Video provides an interface to access clips from a video container.

    Instances can be used as context managers; ``close`` is called when the
    ``with`` block is left, whether or not an exception was raised.
    """

    @abstractmethod
    def __init__(
        self,
        file: Union[str, io.IOBase],
        video_name: Optional[str] = None,
        decode_audio: bool = True,
    ) -> None:
        """
        Args:
            file (str or io.IOBase): a path, or a file-like object, for the
                encoded video.
            video_name (str): an optional name assigned to the video.
            decode_audio (bool): audio decoding flag; how (and whether) it is
                honoured is up to the implementation.
        """
        pass

    @property
    @abstractmethod
    def duration(self) -> float:
        """
        Returns:
            duration of the video in seconds
        """
        pass

    @abstractmethod
    def get_clip(
        self, start_sec: float, end_sec: float, num_samples: int
    ):
        """
        Retrieves frames from the internal video at the specified start and end times
        in seconds (the video always starts at 0 seconds).

        Args:
            start_sec (float): the clip start time in seconds
            end_sec (float): the clip end time in seconds
            num_samples (int): the number of frames to sample from the clip
        Returns:
            The clip's underlying data, in the form chosen by the
            implementation. The decoders in this module return a
            ``(frames, frame_offsets)`` tuple.
        """
        pass

    def close(self):
        """Release resources held by the video; the default does nothing."""
        pass

    def __enter__(self):
        """Return the video itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the video; returns None so exceptions are not suppressed."""
        self.close()
|
|
|
|
|
|
|
|
class EncodedVideoDecord(Video):
    """
    Accessing clips from an encoded video using Decord video reading API
    as the decoding backend. For more details, please refer to -
    `Decord <https://github.com/dmlc/decord>`
    """

    def __init__(
        self,
        file: Union[str, io.IOBase],
        video_name: Optional[str] = None,
        width: int = -1,
        height: int = -1,
        num_threads: int = 0,
        fault_tol: int = -1,
    ) -> None:
        """
        Args:
            file (str or io.IOBase): the video to open; passed to the reader
                as ``uri``.
            video_name (str): An optional name assigned to the video.
            width : int, default is -1
                Desired output width of the video, unchanged if `-1` is specified.
            height : int, default is -1
                Desired output height of the video, unchanged if `-1` is specified.
            num_threads : int, default is 0
                Number of decoding thread, auto if `0` is specified.
            fault_tol : int, default is -1
                The threshold of corrupted and recovered frames. This is to prevent silent fault
                tolerance when for example 50% frames of a video cannot be decoded and duplicate
                frames are returned. You may find the fault tolerant feature sweet in many
                cases, but not for training models. Say `N = # recovered frames`
                If `fault_tol` < 0, nothing will happen.
                If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`,
                raise `DECORDLimitReachedError`.
                If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`.

        Raises:
            ImportError: if decord is not installed.
            RuntimeError: if the video cannot be opened or reports a
                non-positive frame rate.
        """
        self._video_name = video_name
        # Bound before anything can fail so close() is always safe to call.
        self._av_reader = None
        if not _HAS_DECORD:
            raise ImportError(
                "decord is required to use EncodedVideoDecord decoder. Please "
                "install with 'pip install decord' for CPU-only version and refer to "
                "'https://github.com/dmlc/decord' for GPU-supported version"
            )
        try:
            self._av_reader = VideoReaderWrapper(
                uri=file,
                ctx=decord.cpu(0),
                width=width,
                height=height,
                num_threads=num_threads,
                fault_tol=fault_tol,
            )
        except Exception as e:
            raise RuntimeError(
                f"Failed to open video {video_name} with Decord. {e}"
            ) from e

        self._fps = self._av_reader.get_avg_fps()
        # `not > 0` also rejects NaN, which would otherwise poison the duration.
        if not self._fps > 0:
            raise RuntimeError(
                f"Video {video_name} reports an invalid frame rate: {self._fps}."
            )
        self._duration = float(len(self._av_reader)) / float(self._fps)

    @property
    def name(self) -> Optional[str]:
        """
        Returns:
            name: the name of the stored video if set.
        """
        return self._video_name

    @property
    def duration(self) -> float:
        """
        Returns:
            duration: the video's duration/end-time in seconds.
        """
        return self._duration

    def close(self):
        """Drop the reference to the decord reader; safe to call repeatedly."""
        self._av_reader = None

    def get_clip(
        self, start_sec: float, end_sec: float, num_samples: int
    ):
        """
        Retrieves frames from the encoded video at the specified start and end times
        in seconds (the video always starts at 0 seconds).

        ``num_samples`` frame indices are spread evenly (``np.linspace``) over
        the frames inside the window; indices repeat when the window holds
        fewer frames than requested.

        Args:
            start_sec (float): the clip start time in seconds
            end_sec (float): the clip end time in seconds
            num_samples (int): the number of frames to return (at least 1)
        Returns:
            A ``(frames, frame_offsets)`` tuple. ``frames`` is the decoded
            batch converted with ``.asnumpy()`` (presumably laid out as
            (time, height, width, channel) -- confirm against decord).
            ``frame_offsets`` holds the sampled frame indices relative to the
            first sampled index.
        Raises:
            RuntimeError: if the video was closed, has no frames, or the time
                window is invalid.
            ValueError: if ``num_samples`` is smaller than 1.
        """
        if self._av_reader is None:
            raise RuntimeError(f"Video {self._video_name} has been closed.")
        if start_sec > end_sec or start_sec > self._duration:
            raise RuntimeError(
                f"Incorrect time window for Decord decoding for video: {self._video_name}."
            )
        if num_samples < 1:
            raise ValueError(
                f"num_samples must be a positive integer, got {num_samples}."
            )

        total_frames = len(self._av_reader)
        if total_frames < 1:
            raise RuntimeError(f"Video {self._video_name} contains no frames.")

        # Keep the window inside the video and at least one frame long.
        # Without the clamps a zero-length window produced descending (even
        # negative) indices, and start_sec == duration produced an index one
        # past the last frame.
        start_idx = min(max(math.ceil(self._fps * start_sec), 0), total_frames - 1)
        end_idx = min(math.ceil(self._fps * end_sec), total_frames)
        end_idx = max(end_idx, start_idx + 1)

        frame_idxs = np.linspace(start_idx, end_idx - 1, num_samples, dtype=int)

        try:
            outputs = self._av_reader.get_batch(frame_idxs)
            return outputs.asnumpy(), frame_idxs - frame_idxs[0]
        except Exception as e:
            print(f"Failed to decode video with Decord: {self._video_name}. {e}")
            raise
|
|
|
|
|
# OpenCV is an optional dependency. The name stays bound to None when the
# import fails so EncodedVideoOpencv can report the problem itself instead of
# hitting a NameError.
try:
    import cv2
except ImportError:
    cv2 = None
    print("ERR: import cv2 failed, install cv2 by 'pip install opencv-python'")


class EncodedVideoOpencv():
    """
    Accessing clips from an encoded video using OpenCV's ``cv2.VideoCapture``
    as the decoding backend.

    Frames are obtained by reading the capture sequentially, so the class
    tracks which frame the next ``read()`` will return and only rewinds to
    the start of the video when an earlier frame is requested.

    NOTE(review): frames are returned exactly as ``cap.read()`` delivers
    them; OpenCV's default channel order is BGR -- verify this matches what
    callers of the other decoder expect.
    """

    def __init__(
        self,
        file: Union[str, io.IOBase],
        video_name: Optional[str] = None,
        width: int = -1,
        height: int = -1,
        num_threads: int = 0,
        fault_tol: int = -1,
    ) -> None:
        """
        Args:
            file str: file path.
            video_name (str): An optional name assigned to the video.
            width : Not support yet.
            height : Not support yet.
            num_threads : Not support yet.
            fault_tol : Not support yet.

        Raises:
            ImportError: if OpenCV is not installed.
            RuntimeError: if the video cannot be opened or reports a
                non-positive frame rate.
        """
        self._video_name = video_name
        # Index of the frame the next `cap.read()` returns; None means the
        # position is unknown and the capture must be rewound before reading.
        self._next_frame = 0
        if cv2 is None:
            raise ImportError(
                "opencv is required to use EncodedVideoOpencv decoder. Please "
                "install with 'pip install opencv-python'"
            )
        self.cap = cv2.VideoCapture(file)
        if not self.cap.isOpened():
            raise RuntimeError(f"Failed to open video {video_name} with opencv.")
        self._fps = self.cap.get(cv2.CAP_PROP_FPS)
        self._vlen = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # `not > 0` also rejects NaN, which would otherwise poison the duration.
        if not self._fps > 0:
            raise RuntimeError(
                f"Video {video_name} reports an invalid frame rate: {self._fps}."
            )
        self._duration = float(self._vlen) / float(self._fps)

    @property
    def name(self) -> Optional[str]:
        """
        Returns:
            name: the name of the stored video if set.
        """
        return self._video_name

    @property
    def duration(self) -> float:
        """
        Returns:
            duration: the video's duration/end-time in seconds.
        """
        return self._duration

    def __del__(self):
        """Release the capture when the object is garbage collected."""
        self.close()

    def close(self):
        """Release the capture if one was created; safe to call repeatedly."""
        # `cap` is missing when __init__ failed before creating the capture.
        cap = getattr(self, "cap", None)
        if cap is not None:
            cap.release()

    def get_clip(
        self, start_sec: float, end_sec: float, num_samples: int
    ):
        """
        Retrieves frames from the encoded video at the specified start and end times
        in seconds (the video always starts at 0 seconds).

        ``num_samples`` frame indices are spread evenly (``np.linspace``) over
        the frames inside the window; a frame is repeated when its index is
        sampled more than once.

        Args:
            start_sec (float): the clip start time in seconds
            end_sec (float): the clip end time in seconds
            num_samples (int): the number of frames to return (at least 1)
        Returns:
            A ``(frames, frame_offsets)`` tuple. ``frames`` is a uint8 numpy
            array stacking the sampled frames along the first axis.
            ``frame_offsets`` holds the sampled frame indices relative to the
            first sampled index.
        Raises:
            RuntimeError: if the time window is invalid, the video reports no
                frames, or a frame cannot be read.
            ValueError: if ``num_samples`` is smaller than 1.
        """
        if start_sec > end_sec or start_sec > self._duration:
            raise RuntimeError(
                f"Incorrect time window for opencv decoding for video: {self._video_name}."
            )
        if num_samples < 1:
            raise ValueError(
                f"num_samples must be a positive integer, got {num_samples}."
            )
        if self._vlen < 1:
            raise RuntimeError(f"Video {self._video_name} reports no frames.")

        # Keep the window inside the video and at least one frame long.
        # Without the clamps a zero-length window produced descending (even
        # negative) indices, and start_sec == duration produced an index one
        # past the last frame.
        start_idx = min(max(math.ceil(self._fps * start_sec), 0), self._vlen - 1)
        end_idx = min(math.ceil(self._fps * end_sec), self._vlen)
        end_idx = max(end_idx, start_idx + 1)

        frame_idxs = np.linspace(start_idx, end_idx - 1, num_samples, dtype=int)
        # How many copies of each frame index the caller asked for.
        copies = collections.Counter(frame_idxs.tolist())

        try:
            frames = self._read_frames(
                copies, int(frame_idxs[0]), int(frame_idxs[-1])
            )
            frames = np.array(frames).astype(np.uint8)
        except Exception as e:
            # The capture position is unknown after a failure; force a rewind
            # before the next read.
            self._next_frame = None
            print(f"Failed to decode video with opencv: {self._video_name}. {e}")
            raise
        return frames, frame_idxs - frame_idxs[0]

    def _read_frames(self, copies, first_idx, last_idx):
        """Read sequentially up to last_idx and collect the frames listed in copies."""
        if self._next_frame is None or first_idx < self._next_frame:
            # The capture is already past the first wanted frame (or its
            # position is unknown): start again from the beginning.
            self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            self._next_frame = 0

        frames = []
        while self._next_frame <= last_idx:
            ok, frame = self.cap.read()
            if not ok:
                raise RuntimeError(
                    f"could not read frame {self._next_frame} of {self._vlen}"
                )
            # Counter yields 0 for frames that are only being skipped over.
            frames.extend([frame] * copies[self._next_frame])
            self._next_frame += 1
        return frames
|
|
|