import os

import cv2
import imageio
import numpy as np
import torch
from PIL import Image


def all_file(file_dir):
    # collect image and video files under file_dir, recursively
    paths = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            ext = os.path.splitext(file)[1]
            if ext in ('.png', '.jpg', '.jpeg', '.JPG', '.mp4'):
                paths.append(os.path.join(root, file))
    return paths
def crop_img(img, mask):
    # bounding box of the nonzero mask region
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box vertically by 5% of its height on each side
    y = max(0, y - int(h * 0.05))
    y_max = min(img.shape[0], y_max + int(h * 0.05))
    return img[y:y_max, x:x_max]
def pad_img(img, color=[255, 255, 255]):
    # pad to a square whose side is a multiple of 16
    h, w = img.shape[:2]
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    padding_v = [top, bottom, left, right]
    return img, padding_v
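
# Usage sketch (illustrative, not part of the original module): pad a
# hypothetical 300x200 frame; the result is square with side a multiple of 16,
# and padding_v records how much was added on each side so it can be undone.
def _demo_pad_img():
    img = np.zeros((300, 200, 3), dtype=np.uint8)
    padded, (top, bottom, left, right) = pad_img(img)
    assert padded.shape[:2] == (304, 304)
    print(top, bottom, left, right)  # 2 2 52 52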
def extract_mask_sdc(img):
    # treat any sufficiently bright pixel as human
    mask = np.zeros_like(img[:, :, 0])
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # threshold at 10 rather than 0 to suppress compression noise
    mask[gray > 10] = 255
    return mask


def clean_mask(mask):
    # close small holes, then open to drop isolated specks
    se1 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    se2 = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, se1)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, se2)
    return mask
def crop_img_sdc(img, mask):
    # bounding box of the nonzero mask region
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box by 10% of its height vertically and 5% of its width horizontally
    pad_h = 0.1
    pad_w = 0.05
    y = max(0, y - int(h * pad_h))
    y_max = min(img.shape[0], y_max + int(h * pad_h))
    x = max(0, x - int(w * pad_w))
    x_max = min(img.shape[1], x_max + int(w * pad_w))
    return y, y_max, x, x_max
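
# Usage sketch (illustrative): the per-frame pipeline used by the crop helpers
# below is extract_mask_sdc -> clean_mask -> crop_img_sdc, shown here on a
# synthetic frame with a white blob standing in for a rendered human.
def _demo_frame_bbox():
    frame = np.zeros((512, 512, 3), dtype=np.uint8)
    frame[100:400, 200:300] = 255
    mask = clean_mask(extract_mask_sdc(frame))
    y, y_max, x, x_max = crop_img_sdc(frame, mask)
    crop = frame[y:y_max, x:x_max]
    print(crop.shape)  # roughly the blob plus the 10%/5% margins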
def crop_human(pose_images, vid_images, mask_images):
    # find one bbox that covers the human across all frames
    y = 10000
    y_max = 0
    x = 10000
    x_max = 0
    for pose_img in pose_images:
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    # crop every frame with the shared bbox
    frames_res = []
    vid_res = []
    mask_res = []
    for i, pose_img in enumerate(pose_images):
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        mask = np.array(mask_images[i])
        mask_res.append(Image.fromarray(mask[y:y_max, x:x_max]))
    return frames_res, vid_res, mask_res
def init_bbox():
    # [x, x_max, y, y_max] sentinel that any real bbox will shrink/grow past
    return [10000, 0, 10000, 0]


def bbox_div2(x, x_max, y, y_max):
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    return x, x_max, y, y_max


def bbox_pad(x, x_max, y, y_max, img):
    # grow the bbox toward a square whose side is a multiple of 16, clamped to the image
    w = x_max - x
    h = y_max - y
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    y = max(0, y - top)
    y_max = min(img.shape[0], y_max + bottom)
    x = max(0, x - left)
    x_max = min(img.shape[1], x_max + right)
    return x, x_max, y, y_max


def compute_area_ratio(bbox_frame, bbox_clip):
    x1, x2, y1, y2 = bbox_frame
    x1_clip, x2_clip, y1_clip, y2_clip = bbox_clip
    area_frame = (x2 - x1) * (y2 - y1)
    area_clip = (x2_clip - x1_clip) * (y2_clip - y1_clip)
    return area_frame / area_clip


def update_clip(bbox_clip, start_idx, i, bbox_max):
    # overwrite the per-frame bboxes of a finished clip with the clip-wide bbox
    x, x_max, y, y_max = bbox_max
    for j in range(start_idx, i):
        bbox_clip[j] = [x, x_max, y, y_max]
def crop_human_clip_auto_context(pose_images, vid_images, bk_images, overlay=4):
    # split the sequence into clips: a new clip starts whenever some frame's
    # bbox becomes small relative to the running clip-wide bbox
    bbox_clip = []
    bbox_perframe = []
    ratio_list = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    context_list = []
    bbox_clip_list = []
    areas = np.zeros(n_frame)
    start_idx = 0
    ROI_THE = 0.5  # split threshold on the frame/clip area ratio
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        bbox_max_prev = (x, x_max, y, y_max)
        # grow the running clip-wide bbox
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        bbox_max_cur = (x, x_max, y, y_max)
        # record the per-frame bbox
        bbox_cur = [x_, x_max_, y_, y_max_]
        bbox_perframe.append(bbox_cur)
        bbox_clip.append(bbox_cur)
        # frame and clip areas (scaled by 100; the scale cancels in the ratios)
        areas[i] = (x_max_ - x_) * (y_max_ - y_) / 100
        area_max = (y_max - y) * (x_max - x) / 100
        if area_max != 0:
            ratios = areas[start_idx:i] / area_max
        else:
            ratios = np.zeros(i - start_idx)
        if i == n_frame - 1:
            # flush the final clip
            i += 1
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                # prepend up to `overlay` frames of context from the previous clip
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_cur)
            update_clip(bbox_clip, start_idx, i, bbox_max_cur)
            start_idx = i
            continue
        elif np.any(ratios < ROI_THE) and ratios.sum() != 0:
            # the clip-wide bbox has outgrown some frame: close the clip at i
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_prev)
            update_clip(bbox_clip, start_idx, i, bbox_max_prev)
            x, x_max, y, y_max = bbox_cur
            start_idx = i
            continue
    # per-frame area ratios, kept for inspection
    for i in range(0, n_frame):
        bbox_frame_ = bbox_perframe[i]
        bbox_clip_ = bbox_clip[i]
        if np.array(bbox_clip_).sum() == 0:
            ratio = 0
        else:
            ratio = compute_area_ratio(bbox_frame_, bbox_clip_)
        ratio_list.append(ratio)
    # crop every frame of every context window with its clip bbox
    frames_res = []
    vid_res = []
    bk_res = []
    for k, context in enumerate(context_list):
        for i in context:
            frame = np.array(pose_images[i])
            x, x_max, y, y_max = bbox_clip_list[k]
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
            vid = np.array(vid_images[i])
            vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
            bk = np.array(bk_images[i])
            bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list
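
# Usage sketch (illustrative): with aligned lists of PIL frames, the function
# returns cropped pose/video/background frames plus the context windows and
# one bbox per clip. A static synthetic blob yields a single clip.
def _demo_auto_context():
    frame = np.zeros((256, 256, 3), dtype=np.uint8)
    frame[64:192, 96:160] = 255  # stand-in for a rendered human
    imgs = [Image.fromarray(frame)] * 8
    poses, vids, bks, bbox_clip, contexts, clip_bboxes = \
        crop_human_clip_auto_context(imgs, imgs, imgs, overlay=4)
    print(contexts)  # [[0, 1, ..., 7]]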
def crop_human_clip(pose_images, vid_images, bk_images, clip_length=1):
    # one shared bbox per fixed-length clip
    bbox_clip = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        if ((i + 1) % clip_length == 0) or (i == n_frame - 1):
            x, x_max, y, y_max = bbox_div2(x, x_max, y, y_max)
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            bbox_clip.append([x, x_max, y, y_max])
            x, x_max, y, y_max = init_bbox()
    # crop every frame with its clip's bbox
    frames_res = []
    vid_res = []
    bk_res = []
    for i, pose_img in enumerate(pose_images):
        x, x_max, y, y_max = bbox_clip[i // clip_length]
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        bk = np.array(bk_images[i])
        bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip
def init_bk(n_frame, h, w):
    # n_frame blank white frames
    return [Image.fromarray(np.ones((h, w, 3), dtype=np.uint8) * 255) for _ in range(n_frame)]
def pose_adjust(pose_image, width=512, height=784):
    # resize to the target height, then center-pad or center-crop to the target width
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    pose_img = np.array(pose_image)
    h, w, c = pose_img.shape
    nh, nw = height, int(w * height / h)
    pose_img = cv2.resize(pose_img, (nw, nh), interpolation=cv2.INTER_AREA)
    if nw < width:
        # pad symmetrically onto the black canvas
        pad = (width - nw) // 2
        canvas[:, pad:pad + nw, :] = pose_img
    else:
        # center crop
        crop = (nw - width) // 2
        canvas = pose_img[:, crop:crop + width, :]
    return Image.fromarray(canvas)
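
# Usage sketch (illustrative): a 960x540 pose frame is scaled to height 784
# (width 441), then centered on a 512-wide canvas.
def _demo_pose_adjust():
    pose = Image.fromarray(np.zeros((960, 540, 3), dtype=np.uint8))
    canvas = pose_adjust(pose, width=512, height=784)
    assert canvas.size == (512, 784)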
def load_pretrain_pose_guider(model, ckpt_path):
    # triple the conv_in input channels by padding the pretrained weights with
    # zeros, so the extra input channels start out as no-ops
    state_dict = torch.load(ckpt_path, map_location="cpu")
    weights = state_dict['conv_in.weight']
    weights = torch.cat((weights, torch.zeros_like(weights), torch.zeros_like(weights)), dim=1)
    state_dict['conv_in.weight'] = weights
    model.load_state_dict(state_dict, strict=True)
    return model
def refine_img_prepross(image, mask):
    # stack the mask onto the image as an extra channel
    im_ary = np.asarray(image).astype(np.float32)
    return np.concatenate([im_ary, mask[:, :, np.newaxis]], axis=-1)
mask_mode = {'up_down_left_right': 0, 'left_right_up': 1, 'left_right_down': 2, 'up_down_left': 3, 'up_down_right': 4,
             'left_right': 5, 'up_down': 6, 'left_up': 7, 'right_up': 8, 'left_down': 9, 'right_down': 10,
             'left': 11, 'right': 12, 'up': 13, 'down': 14, 'inner': 15}
def get_mask(mask_list, bbox, img):
    # pick the precomputed mask whose open sides match where the bbox spills
    # past the image borders
    w, h = img.size
    w_min, w_max, h_min, h_max = bbox
    if w_min <= 0 and w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_left_right'
    elif w_min <= 0 and w_max >= w and h_min <= 0:
        mode = 'left_right_up'
    elif w_min <= 0 and w_max >= w and h_max >= h:
        mode = 'left_right_down'
    elif w_min <= 0 and h_min <= 0 and h_max >= h:
        mode = 'up_down_left'
    elif w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_right'
    elif w_min <= 0 and w_max >= w:
        mode = 'left_right'
    elif h_min <= 0 and h_max >= h:
        mode = 'up_down'
    elif w_min <= 0 and h_min <= 0:
        mode = 'left_up'
    elif w_max >= w and h_min <= 0:
        mode = 'right_up'
    elif w_min <= 0 and h_max >= h:
        mode = 'left_down'
    elif w_max >= w and h_max >= h:
        mode = 'right_down'
    elif w_min <= 0:
        mode = 'left'
    elif w_max >= w:
        mode = 'right'
    elif h_min <= 0:
        mode = 'up'
    elif h_max >= h:
        mode = 'down'
    else:
        mode = 'inner'
    return mask_list[mask_mode[mode]]
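
# Usage sketch (illustrative): a bbox that spills past the left and right
# borders but not the top or bottom selects the 'left_right' mask.
def _demo_get_mask():
    img = Image.fromarray(np.zeros((128, 128, 3), dtype=np.uint8))
    masks = [np.full((128, 128), i, dtype=np.float32) for i in range(len(mask_mode))]
    m = get_mask(masks, (-4, 132, 20, 100), img)
    assert m[0, 0] == mask_mode['left_right']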
def load_mask_list(mask_path):
    # load one grayscale mask per mode, e.g. foo.png -> foo_left_right.png
    mask_list = []
    for key in mask_mode.keys():
        mask = cv2.imread(mask_path[:-4] + '_%s.png' % key)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
        mask_list.append(mask)
    return mask_list
def recover_bk(images, start_idx, end_idx, template_name=None):
    # blank out frames in [start_idx, end_idx) with white
    img = np.array(images[0])
    for i in range(start_idx, end_idx):
        if template_name == "dance_indoor_1":
            # whiten in place; this branch assumes the frames are numpy arrays,
            # since PIL images do not support this slicing
            images[i][:img.shape[0], :, :] = 255
        else:
            images[i] = Image.fromarray(np.ones_like(img) * 255)
    return images
def load_video_fixed_fps(vid_path, target_fps=30, target_speed=1):
    # resample a video to target_fps, optionally speeding it up by target_speed
    reader = imageio.get_reader(vid_path)
    fps = round(reader.get_meta_data()['fps'])
    # the ratio of source fps to target fps decides which frames to keep
    keep_ratio = target_speed * fps / target_fps
    n_frames = reader.count_frames()
    keep_frames_indices = np.arange(0, n_frames, keep_ratio).astype(int)
    frames = [Image.fromarray(reader.get_data(i)) for i in keep_frames_indices if i < n_frames]
    reader.close()
    return frames
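
# Usage sketch (illustrative; "input.mp4" is a placeholder path): load a clip
# resampled to 30 fps, then blank frames 10-19 with recover_bk.
def _demo_load_video():
    frames = load_video_fixed_fps("input.mp4", target_fps=30)
    frames = recover_bk(frames, 10, 20)
    print(len(frames), frames[0].size)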