import json
import random
from typing import Optional

import numpy as np
import torch
import torchvision.transforms as TT
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms.functional import resize
try:
import decord
except ImportError:
raise ImportError(
"The `decord` package is required for loading the video dataset. Install with `pip install decord`"
)
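# Have decord return torch tensors (rather than numpy arrays) from VideoReader.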
decord.bridge.set_bridge("torch")
class ImageVideoDataset(Dataset):
def __init__(
self,
data_root,
tokenizer,
max_sequence_length: int = 226,
height: int = 480,
width: int = 720,
video_reshape_mode: str = "center",
fps: int = 8,
        stripe: int = 2,  # temporal stride between sampled frames
max_num_frames: int = 49,
skip_frames_start: int = 0,
skip_frames_end: int = 0,
random_flip: Optional[float] = None,
) -> None:
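        """Dataset of (video frames, tokenized caption) pairs for CogVideoX-style training.

        `data_root` points to a JSON manifest; each entry must provide a
        `file_path` to a video and a `text` caption.
        """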
super().__init__()
with open(data_root, 'r') as f:
self.data_list = json.load(f)
self.tokenizer = tokenizer
self.max_sequence_length = max_sequence_length
self.height = height
self.width = width
self.video_reshape_mode = video_reshape_mode
self.fps = fps
self.max_num_frames = max_num_frames
self.skip_frames_start = skip_frames_start
self.skip_frames_end = skip_frames_end
self.stripe = stripe
        self.video_transforms = transforms.Compose(
            [
                transforms.RandomHorizontalFlip(random_flip) if random_flip else transforms.Lambda(lambda x: x),
                transforms.Lambda(lambda x: x / 255.0),  # uint8 [0, 255] -> float [0, 1]
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),  # -> [-1, 1]
            ]
        )
def __len__(self):
return len(self.data_list)
    def _resize_for_rectangle_crop(self, arr):
        """Resize frames preserving aspect ratio, then crop to (height, width)."""
        image_size = self.height, self.width
        reshape_mode = self.video_reshape_mode
        # If the frame is wider than the target aspect ratio, match heights and crop
        # the excess width; otherwise match widths and crop the excess height.
        if arr.shape[3] / arr.shape[2] > image_size[1] / image_size[0]:
arr = resize(
arr,
size=[image_size[0], int(arr.shape[3] * image_size[0] / arr.shape[2])],
interpolation=InterpolationMode.BICUBIC,
)
else:
arr = resize(
arr,
size=[int(arr.shape[2] * image_size[1] / arr.shape[3]), image_size[1]],
interpolation=InterpolationMode.BICUBIC,
)
h, w = arr.shape[2], arr.shape[3]
        arr = arr.squeeze(0)  # no-op for [F, C, H, W] inputs with F > 1
delta_h = h - image_size[0]
delta_w = w - image_size[1]
        if reshape_mode in ("random", "none"):
            top = np.random.randint(0, delta_h + 1)
            left = np.random.randint(0, delta_w + 1)
        elif reshape_mode == "center":
            top, left = delta_h // 2, delta_w // 2
        else:
            raise NotImplementedError(f"Unsupported video_reshape_mode: {reshape_mode}")
arr = TT.functional.crop(arr, top=top, left=left, height=image_size[0], width=image_size[1])
return arr
    def __getitem__(self, index):
        # Retry with subsequent samples until a clip yields the full frame window.
        while True:
            try:
                video_reader = decord.VideoReader(
                    self.data_list[index]['file_path'], width=self.width, height=self.height
                )
                video_num_frames = len(video_reader)
                # Fall back to a stride of 1 when the clip is too short for the requested stride.
                if self.stripe * self.max_num_frames > video_num_frames:
                    stripe = 1
                else:
                    stripe = self.stripe
                # Choose a random start frame so the sampled window fits inside the clip.
                random_range = max(1, video_num_frames - stripe * self.max_num_frames - 1)
                start_frame = random.randint(1, random_range)
                indices = list(range(start_frame, start_frame + stripe * self.max_num_frames, stripe))
frames = video_reader.get_batch(indices)
# Ensure that we don't go over the limit
frames = frames[: self.max_num_frames]
selected_num_frames = frames.shape[0]
                # Trim to 4k + 1 frames, the temporal length the CogVideoX VAE expects
remainder = (3 + (selected_num_frames % 4)) % 4
if remainder != 0:
frames = frames[:-remainder]
selected_num_frames = frames.shape[0]
assert (selected_num_frames - 1) % 4 == 0
                if selected_num_frames == self.max_num_frames:
                    break
                else:
                    # Not enough frames after trimming; try the next sample.
                    index = (index + 1) % len(self.data_list)
                    continue
            except Exception as e:
                # `video_num_frames`/`indices` may be unset if VideoReader itself failed,
                # so only report the file path and the exception.
                print(f"Error encountered while loading video {self.data_list[index]['file_path']}: {e}")
                index = (index + 1) % len(self.data_list)
                continue
        # Training transforms: [F, H, W, C] uint8 -> [F, C, H, W] normalized to [-1, 1]
        frames = frames.permute(0, 3, 1, 2).contiguous()  # [F, C, H, W]
frames = self._resize_for_rectangle_crop(frames)
frames = torch.stack([self.video_transforms(frame) for frame in frames], dim=0)
text_inputs = self.tokenizer(
[self.data_list[index]['text']],
padding="max_length",
max_length=self.max_sequence_length,
truncation=True,
add_special_tokens=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids[0]
return frames.contiguous(), text_input_ids
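

# A minimal usage sketch, not part of the training pipeline. The manifest path
# and the "t5-base" checkpoint below are placeholders; CogVideoX pipelines
# typically pair this dataset with a T5 text encoder, but any tokenizer with a
# compatible `__call__` signature works.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-base")  # placeholder checkpoint
    dataset = ImageVideoDataset(data_root="data/train.json", tokenizer=tokenizer)  # placeholder manifest
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    frames, text_input_ids = next(iter(loader))
    print(frames.shape, text_input_ids.shape)  # e.g. [1, 49, 3, 480, 720] and [1, 226]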