# feat_extract/spatial_dataloader.py
# Spatial (Laplacian-pyramid) dataloaders for VQualA GenAI track-2
# feature extraction.
import glob
import torch
from torchvision import transforms
import torch.nn as nn
from torch.utils.data import Dataset
from PIL import Image
import os
import numpy as np
# import random
# from config import T2V_model
import cv2
def pyramidsGL(image, num_levels, dim=224):
    ''' Creates Gaussian (G) and Laplacian (L) pyramids of level "num_levels" from image im.
    G and L are list where G[i], L[i] stores the i-th level of Gaussian and Laplacian pyramid, respectively.

    Unlike a classic pyramid, levels do not halve: each level shrinks by a
    fixed linear step from the original size toward a size whose shorter
    side is `dim` pixels. Both returned lists contain `num_levels - 1`
    entries (the coarsest Gaussian level is popped before returning).

    Parameters
    ----------
    image : np.ndarray
        A single frame, shape (H, W, 3); BGR channel order per the cv2
        callers in this file — assumed, TODO confirm.
    num_levels : int
        Number of pyramid levels. Must be >= 2: the step computation below
        divides by (num_levels - 1).
    dim : int
        Target length of the shorter side at the coarsest level.
    '''
    # original frame size (cv2 arrays are (height, width, channels))
    o_width = image.shape[1]
    o_height = image.shape[0]
    # resize
    # Case 1: both sides comfortably larger than the target — shrink
    # linearly from the original size down to a size whose shorter side is
    # `dim`, across num_levels evenly spaced steps.
    # if both greater than
    if o_width > (dim+num_levels) and o_height > (dim+num_levels):
        if o_width > o_height:
            f_height = dim
            f_width = int((o_width*f_height)/o_height)
        elif o_height > o_width:
            f_width = dim
            f_height = int((o_height*f_width)/o_width)
        else:
            f_width = f_height = dim
        # Negative steps so range() counts down from the original to the
        # final size. The `dim+num_levels` margin above guarantees the
        # size difference exceeds (num_levels-1), so the step is never 0.
        height_step = int((o_height-f_height)/(num_levels-1))*(-1)
        width_step = int((o_width-f_width)/(num_levels-1))*(-1)
        # These lists may hold MORE than num_levels entries when the
        # division is uneven; only the first num_levels are used below.
        height_list = [i for i in range(o_height, f_height-1, height_step)]
        width_list = [i for i in range(o_width, f_width-1, width_step)]
    # Case 2: one side already equals the target — keep every level at the
    # original frame size (no per-level shrinking).
    # if both equal to
    elif o_width == dim or o_height == dim:
        height_list = [o_height for i in range(num_levels)]
        width_list = [o_width for i in range(num_levels)]
    # Case 3: frame close to or smaller than the target — resize once so
    # the shorter side is `dim`, then keep every level at that size.
    else:
        if o_width > o_height:
            f_height = dim
            f_width = int((o_width*f_height)/o_height)
        elif o_height > o_width:
            f_width = dim
            f_height = int((o_height*f_width)/o_width)
        else:
            f_width = f_height = dim
        image = cv2.resize(image, (f_width, f_height),
                           interpolation=cv2.INTER_CUBIC)
        height_list = [f_height for i in range(num_levels)]
        width_list = [f_width for i in range(num_levels)]
    layer = image.copy()
    gaussian_pyramid = [layer]  # Gaussian Pyramid
    # print(gaussian_pyramid[2])
    laplacian_pyramid = []  # Laplacian Pyramid
    for i in range(num_levels-1):
        # blur, then downsample to the next level's size
        blur = cv2.GaussianBlur(gaussian_pyramid[i], (5, 5), 5)
        layer = cv2.resize(
            blur, (width_list[i+1], height_list[i+1]), interpolation=cv2.INTER_CUBIC)
        gaussian_pyramid.append(layer)
        # Laplacian level = current Gaussian level minus the blurred image
        # re-expanded to the current level's size (cv2.subtract saturates,
        # it does not wrap).
        uplayer = cv2.resize(
            blur, (width_list[i], height_list[i]), interpolation=cv2.INTER_CUBIC)
        laplacian = cv2.subtract(gaussian_pyramid[i], uplayer)
        laplacian_pyramid.append(laplacian)
    # drop the coarsest Gaussian level so both lists have num_levels-1 entries
    gaussian_pyramid.pop(-1)
    return gaussian_pyramid, laplacian_pyramid
def resizedpyramids(gaussian_pyramid, laplacian_pyramid, num_levels, width, height):
    """Resize the first ``num_levels - 1`` Laplacian levels to (width, height).

    Gaussian levels are currently not resized: the first returned list is
    always empty and ``gaussian_pyramid`` is accepted only for interface
    symmetry with :func:`pyramidsGL`.
    """
    # Indexing (rather than slicing) keeps the original behavior of raising
    # IndexError if fewer than num_levels-1 levels were supplied.
    laplacian_resized = [
        cv2.resize(laplacian_pyramid[level], (width, height),
                   interpolation=cv2.INTER_CUBIC)
        for level in range(num_levels - 1)
    ]
    return [], laplacian_resized
class VideoDataset_mp4(Dataset):
    """Read .mp4 videos and emit per-frame Laplacian-pyramid stacks for
    spatial feature extraction.

    Each item is ``(transformed_video, video_length, vid_name)`` where
    ``transformed_video`` has shape
    ``[num_frames * (num_levels - 1), 3, H, W]`` — the resized Laplacian
    levels of every decoded frame, normalized with ImageNet statistics.
    Frames whose shorter side is >= 768 px are additionally downscaled so
    that side becomes 768.
    """

    def __init__(self, database_name, vids_dir, num_levels=6):
        super(VideoDataset_mp4, self).__init__()
        self.database_name = database_name  # kept for bookkeeping; unused here
        self.vids_dir = glob.glob(f'{vids_dir}/*.mp4')
        self.num_levels = num_levels

    def __len__(self):
        return len(self.vids_dir)

    def __getitem__(self, idx):
        vid_path = self.vids_dir[idx]
        # Basename without extension; os.path handles both '/' and '\\'
        # separators (the old split('/') broke on Windows paths).
        vid_name = os.path.splitext(os.path.basename(vid_path))[0]

        cap = cv2.VideoCapture(vid_path, cv2.CAP_FFMPEG)
        try:
            if int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) == 0:
                raise Exception('no frame in this vid')
            video_chunk = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                video_chunk.append(frame)
        finally:
            # Fix: the capture was previously never released, leaking an
            # FFmpeg decoder handle for every dataset item.
            cap.release()

        # Fix: size the output by the number of frames actually decoded.
        # CAP_PROP_FRAME_COUNT is container metadata and can disagree with
        # the decodable frame count (over-reporting left zero-padded rows;
        # under-reporting raised IndexError in the fill loop below).
        video_length = len(video_chunk)
        if video_length == 0:
            raise Exception('no frame in this vid')

        video_width = video_chunk[0].shape[1]
        video_height = video_chunk[0].shape[0]

        if video_width < 768 or video_height < 768:
            # Small videos: keep native resolution.
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
            out_height, out_width = video_height, video_width
        else:
            # Large videos: shrink so the shorter side becomes 768; probe
            # one frame through the transform to learn the output size.
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize(768),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
            example_frame = transform(video_chunk[0])
            out_height, out_width = example_frame.size(1), example_frame.size(2)

        transformed_video = torch.zeros(
            [video_length * (self.num_levels - 1), 3, out_height, out_width])

        for i, extract_frame in enumerate(video_chunk):
            gaussian_pyramid, laplacian_pyramid = pyramidsGL(
                extract_frame, self.num_levels)
            # Bring every Laplacian level back to the frame's native size.
            _, laplacian_pyramid_resized = resizedpyramids(
                gaussian_pyramid, laplacian_pyramid,
                self.num_levels, video_width, video_height)
            for j, lp in enumerate(laplacian_pyramid_resized):
                # cv2 decodes BGR; the normalization stats are RGB.
                lp = cv2.cvtColor(lp, cv2.COLOR_BGR2RGB)
                transformed_video[i * (self.num_levels - 1) + j] = transform(lp)

        return transformed_video, video_length, vid_name
class VideoDataset_mp42(Dataset):
    """Read .mp4 videos and emit the raw (non-pyramid) frame tensor for
    feature extraction.

    Each item is ``(transformed_video, video_length, vid_name)`` where
    ``transformed_video`` has shape ``[num_frames, 3, H, W]``: every decoded
    frame converted to RGB and normalized with ImageNet statistics. Frames
    whose shorter side is >= 768 px are additionally downscaled so that
    side becomes 768.
    """

    def __init__(self, database_name, vids_dir, num_levels=6):
        super(VideoDataset_mp42, self).__init__()
        self.database_name = database_name  # kept for bookkeeping; unused here
        self.vids_dir = glob.glob(f'{vids_dir}/*.mp4')
        self.num_levels = num_levels  # accepted for API parity; unused here

    def __len__(self):
        return len(self.vids_dir)

    def __getitem__(self, idx):
        vid_path = self.vids_dir[idx]
        # Basename without extension; os.path handles both '/' and '\\'
        # separators (the old split('/') broke on Windows paths).
        vid_name = os.path.splitext(os.path.basename(vid_path))[0]

        cap = cv2.VideoCapture(vid_path, cv2.CAP_FFMPEG)
        try:
            if int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) == 0:
                raise Exception('no frame in this vid')
            video_chunk = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                video_chunk.append(frame)
        finally:
            # Fix: the capture was previously never released, leaking an
            # FFmpeg decoder handle for every dataset item.
            cap.release()

        # Fix: size the output by the number of frames actually decoded.
        # CAP_PROP_FRAME_COUNT is container metadata and can disagree with
        # the decodable frame count (over-reporting left zero-padded rows;
        # under-reporting raised IndexError in the fill loop below).
        video_length = len(video_chunk)
        if video_length == 0:
            raise Exception('no frame in this vid')

        video_width = video_chunk[0].shape[1]
        video_height = video_chunk[0].shape[0]

        if video_width < 768 or video_height < 768:
            # Small videos: keep native resolution.
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
            out_height, out_width = video_height, video_width
        else:
            # Large videos: shrink so the shorter side becomes 768; probe
            # one frame through the transform to learn the output size.
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize(768),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
            example_frame = transform(video_chunk[0])
            out_height, out_width = example_frame.size(1), example_frame.size(2)

        transformed_video = torch.zeros(
            [video_length, 3, out_height, out_width])

        for i, extract_frame in enumerate(video_chunk):
            # cv2 decodes BGR; the normalization stats are RGB.
            rgb_frame = cv2.cvtColor(extract_frame, cv2.COLOR_BGR2RGB)
            transformed_video[i] = transform(rgb_frame)

        return transformed_video, video_length, vid_name