"""Video datasets for feature extraction.

``VideoDataset_mp4`` expands every decoded frame of an .mp4 into the
levels of a Laplacian pyramid; ``VideoDataset_mp42`` returns the plain
normalised RGB frames.  Both emit ImageNet-normalised float tensors.
"""

import glob
import os

import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

# ImageNet normalisation constants shared by every transform below.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def pyramidsGL(image, num_levels, dim=224):
    """Create Gaussian (G) and Laplacian (L) pyramids from ``image``.

    Unlike a classic octave pyramid, resolution is reduced in *equal
    steps* from the original size down to roughly ``dim`` pixels on the
    shorter side at the coarsest level.

    Args:
        image: H x W x C uint8 array (OpenCV BGR frame).
        num_levels: number of pyramid levels; must be >= 2.
        dim: target length of the shorter side at the coarsest level.

    Returns:
        ``(gaussian_pyramid, laplacian_pyramid)``, each a list of
        ``num_levels - 1`` arrays; ``laplacian_pyramid[i]`` holds the
        detail removed between level ``i`` and level ``i + 1``.

    Raises:
        ValueError: if ``num_levels`` < 2 (the step size divides by
            ``num_levels - 1``, which previously crashed with a bare
            ZeroDivisionError).
    """
    if num_levels < 2:
        raise ValueError('num_levels must be >= 2')

    o_height, o_width = image.shape[0], image.shape[1]

    if o_width > (dim + num_levels) and o_height > (dim + num_levels):
        # Large frame: keep the original size at level 0 and shrink in
        # equal steps until the shorter side reaches ``dim``.
        f_width, f_height = _final_size(o_width, o_height, dim)
        height_step = -int((o_height - f_height) / (num_levels - 1))
        width_step = -int((o_width - f_width) / (num_levels - 1))
        height_list = list(range(o_height, f_height - 1, height_step))
        width_list = list(range(o_width, f_width - 1, width_step))
    elif o_width == dim or o_height == dim:
        # One side already equals the target: every level keeps the
        # original size.
        height_list = [o_height] * num_levels
        width_list = [o_width] * num_levels
    else:
        # Frame smaller than (or barely above) the target: rescale once
        # so the shorter side is ``dim`` and keep that size for all
        # levels.
        f_width, f_height = _final_size(o_width, o_height, dim)
        image = cv2.resize(image, (f_width, f_height),
                           interpolation=cv2.INTER_CUBIC)
        height_list = [f_height] * num_levels
        width_list = [f_width] * num_levels

    gaussian_pyramid = [image.copy()]
    laplacian_pyramid = []
    for i in range(num_levels - 1):
        blur = cv2.GaussianBlur(gaussian_pyramid[i], (5, 5), 5)
        layer = cv2.resize(blur, (width_list[i + 1], height_list[i + 1]),
                           interpolation=cv2.INTER_CUBIC)
        gaussian_pyramid.append(layer)
        # Bring the blurred image back to the current level's size so the
        # Laplacian captures exactly what blur + downsample discarded.
        uplayer = cv2.resize(blur, (width_list[i], height_list[i]),
                             interpolation=cv2.INTER_CUBIC)
        laplacian_pyramid.append(cv2.subtract(gaussian_pyramid[i], uplayer))

    # Drop the coarsest Gaussian level so both lists have
    # ``num_levels - 1`` entries.
    gaussian_pyramid.pop(-1)
    return gaussian_pyramid, laplacian_pyramid


def _final_size(o_width, o_height, dim):
    """Return (width, height) that scales the shorter side to ``dim``
    while preserving aspect ratio."""
    if o_width > o_height:
        f_height = dim
        f_width = int((o_width * f_height) / o_height)
    elif o_height > o_width:
        f_width = dim
        f_height = int((o_height * f_width) / o_width)
    else:
        f_width = f_height = dim
    return f_width, f_height


def resizedpyramids(gaussian_pyramid, laplacian_pyramid, num_levels, width, height):
    """Resize the first ``num_levels - 1`` Laplacian levels to
    ``(width, height)``.

    NOTE: the Gaussian pyramid is accepted for interface compatibility
    but is not resized — the first returned list is always empty, as in
    the original implementation.

    Returns:
        ``([], laplacian_pyramid_resized)``.
    """
    laplacian_pyramid_resized = [
        cv2.resize(level, (width, height), interpolation=cv2.INTER_CUBIC)
        for level in laplacian_pyramid[:num_levels - 1]
    ]
    return [], laplacian_pyramid_resized


def _read_all_frames(vid_path):
    """Decode every frame of ``vid_path``.

    Fixes two defects of the inline original: the ``VideoCapture``
    handle is now always released (it previously leaked), and the frame
    count is taken from the frames actually decoded instead of
    ``CAP_PROP_FRAME_COUNT``, which is only a container-header estimate
    for many codecs and can disagree with the decodable frame count.

    Returns:
        ``(frames, width, height)`` where ``frames`` is a list of BGR
        uint8 arrays.

    Raises:
        Exception: 'no frame in this vid' if nothing could be decoded
            (same message/type as the original check).
    """
    cap = cv2.VideoCapture(vid_path, cv2.CAP_FFMPEG)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        cap.release()
    if not frames:
        raise Exception('no frame in this vid')
    return frames, frames[0].shape[1], frames[0].shape[0]


def _build_transform(video_width, video_height):
    """Return the per-frame transform.

    Frames are downscaled so the shorter side is 768 only when *both*
    dimensions are already >= 768; otherwise they are kept at native
    resolution.  Normalisation always uses the ImageNet statistics.
    """
    ops = [transforms.ToTensor()]
    if video_width >= 768 and video_height >= 768:
        ops.append(transforms.Resize(768))
    ops.append(transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD))
    return transforms.Compose(ops)


def _video_name(vid_path):
    """Basename of ``vid_path`` without extension (portable across OS
    path separators, unlike the original ``split('/')``)."""
    return os.path.splitext(os.path.basename(vid_path))[0]


class VideoDataset_mp4(Dataset):
    """Read .mp4 videos and expand each frame into its Laplacian-pyramid
    levels (``num_levels - 1`` tensors per frame) for feature extraction."""

    def __init__(self, database_name, vids_dir, num_levels=6):
        super(VideoDataset_mp4, self).__init__()
        self.database_name = database_name
        self.vids_dir = glob.glob(f'{vids_dir}/*.mp4')
        self.num_levels = num_levels

    def __len__(self):
        return len(self.vids_dir)

    def __getitem__(self, idx):
        """Return ``(transformed_video, video_length, vid_name)`` where
        ``transformed_video`` has shape
        ``[video_length * (num_levels - 1), 3, H, W]``."""
        vid_path = self.vids_dir[idx]
        vid_name = _video_name(vid_path)

        video_chunk, video_width, video_height = _read_all_frames(vid_path)
        video_length = len(video_chunk)

        transform = _build_transform(video_width, video_height)
        # Transform one frame to learn the output spatial size (it only
        # differs from the input size when Resize(768) is applied).
        example_frame = transform(video_chunk[0])
        transformed_video = torch.zeros(
            [video_length * (self.num_levels - 1), 3,
             example_frame.size(1), example_frame.size(2)])

        for i, extract_frame in enumerate(video_chunk):
            gaussian_pyramid, laplacian_pyramid = pyramidsGL(
                extract_frame, self.num_levels)
            _, laplacian_resized = resizedpyramids(
                gaussian_pyramid, laplacian_pyramid, self.num_levels,
                video_width, video_height)
            for j, lp in enumerate(laplacian_resized):
                # OpenCV decodes BGR; the normalisation stats are RGB.
                lp = cv2.cvtColor(lp, cv2.COLOR_BGR2RGB)
                transformed_video[i * (self.num_levels - 1) + j] = transform(lp)

        return transformed_video, video_length, vid_name


class VideoDataset_mp42(Dataset):
    """Read .mp4 videos and return their plain normalised RGB frames."""

    def __init__(self, database_name, vids_dir, num_levels=6):
        super(VideoDataset_mp42, self).__init__()
        self.database_name = database_name
        self.vids_dir = glob.glob(f'{vids_dir}/*.mp4')
        self.num_levels = num_levels

    def __len__(self):
        return len(self.vids_dir)

    def __getitem__(self, idx):
        """Return ``(transformed_video, video_length, vid_name)`` where
        ``transformed_video`` has shape ``[video_length, 3, H, W]``."""
        vid_path = self.vids_dir[idx]
        vid_name = _video_name(vid_path)

        video_chunk, video_width, video_height = _read_all_frames(vid_path)
        video_length = len(video_chunk)

        transform = _build_transform(video_width, video_height)
        example_frame = transform(video_chunk[0])
        transformed_video = torch.zeros(
            [video_length, 3, example_frame.size(1), example_frame.size(2)])

        for i, extract_frame in enumerate(video_chunk):
            # OpenCV decodes BGR; the normalisation stats are RGB.
            rgb = cv2.cvtColor(extract_frame, cv2.COLOR_BGR2RGB)
            transformed_video[i] = transform(rgb)

        return transformed_video, video_length, vid_name