import os

import cv2
import imageio
import numpy as np
import torch
from PIL import Image


def all_file(file_dir):
    # collect image and video files under file_dir, recursively
    paths = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            ext = os.path.splitext(file)[1]
            if ext in ('.png', '.jpg', '.jpeg', '.JPG', '.mp4'):
                paths.append(os.path.join(root, file))
    return paths
def crop_img(img, mask):
    # bounding box of the nonzero mask region
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box vertically by 5% of its height on each side
    y = max(0, y - int(h * 0.05))
    y_max = min(img.shape[0], y_max + int(h * 0.05))
    return img[y:y_max, x:x_max]
def pad_img(img, color=[255, 255, 255]):
    # pad to a square whose side is a multiple of 16
    h, w = img.shape[:2]
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    padding_v = [top, bottom, left, right]
    return img, padding_v
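
# Usage sketch (illustrative, not part of the original module): pad a
# hypothetical 300x200 frame; the result is square with side a multiple of 16,
# and padding_v records how much was added on each side so it can be undone.
def _demo_pad_img():
    img = np.zeros((300, 200, 3), dtype=np.uint8)
    padded, (top, bottom, left, right) = pad_img(img)
    assert padded.shape[:2] == (304, 304)
    print(top, bottom, left, right)  # 2 2 52 52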
def extract_mask_sdc(img):
    # treat any sufficiently bright pixel as human
    mask = np.zeros_like(img[:, :, 0])
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # threshold at 10 rather than 0 to suppress compression noise
    mask[gray > 10] = 255
    return mask


def clean_mask(mask):
    # close small holes, then open to drop isolated specks
    se1 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    se2 = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, se1)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, se2)
    return mask
def crop_img_sdc(img, mask):
    # bounding box of the nonzero mask region
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box by 10% of its height vertically and 5% of its width horizontally
    pad_h = 0.1
    pad_w = 0.05
    y = max(0, y - int(h * pad_h))
    y_max = min(img.shape[0], y_max + int(h * pad_h))
    x = max(0, x - int(w * pad_w))
    x_max = min(img.shape[1], x_max + int(w * pad_w))
    return y, y_max, x, x_max
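
# Usage sketch (illustrative): the per-frame pipeline used by the crop helpers
# below is extract_mask_sdc -> clean_mask -> crop_img_sdc, shown here on a
# synthetic frame with a white blob standing in for a rendered human.
def _demo_frame_bbox():
    frame = np.zeros((512, 512, 3), dtype=np.uint8)
    frame[100:400, 200:300] = 255
    mask = clean_mask(extract_mask_sdc(frame))
    y, y_max, x, x_max = crop_img_sdc(frame, mask)
    crop = frame[y:y_max, x:x_max]
    print(crop.shape)  # roughly the blob plus the 10%/5% margins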
def crop_human(pose_images, vid_images, mask_images):
    # find one bbox that covers the human across all frames
    y = 10000
    y_max = 0
    x = 10000
    x_max = 0
    for pose_img in pose_images:
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    # crop every frame with the shared bbox
    frames_res = []
    vid_res = []
    mask_res = []
    for i, pose_img in enumerate(pose_images):
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        mask = np.array(mask_images[i])
        mask_res.append(Image.fromarray(mask[y:y_max, x:x_max]))
    return frames_res, vid_res, mask_res
def init_bbox():
    # [x, x_max, y, y_max] sentinel that any real bbox will shrink/grow past
    return [10000, 0, 10000, 0]


def bbox_div2(x, x_max, y, y_max):
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    return x, x_max, y, y_max


def bbox_pad(x, x_max, y, y_max, img):
    # grow the bbox toward a square whose side is a multiple of 16, clamped to the image
    w = x_max - x
    h = y_max - y
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    y = max(0, y - top)
    y_max = min(img.shape[0], y_max + bottom)
    x = max(0, x - left)
    x_max = min(img.shape[1], x_max + right)
    return x, x_max, y, y_max


def compute_area_ratio(bbox_frame, bbox_clip):
    x1, x2, y1, y2 = bbox_frame
    x1_clip, x2_clip, y1_clip, y2_clip = bbox_clip
    area_frame = (x2 - x1) * (y2 - y1)
    area_clip = (x2_clip - x1_clip) * (y2_clip - y1_clip)
    return area_frame / area_clip


def update_clip(bbox_clip, start_idx, i, bbox_max):
    # overwrite the per-frame bboxes of a finished clip with the clip-wide bbox
    x, x_max, y, y_max = bbox_max
    for j in range(start_idx, i):
        bbox_clip[j] = [x, x_max, y, y_max]
def crop_human_clip_auto_context(pose_images, vid_images, bk_images, overlay=4):
    # split the sequence into clips: a new clip starts whenever some frame's
    # bbox becomes small relative to the running clip-wide bbox
    bbox_clip = []
    bbox_perframe = []
    ratio_list = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    context_list = []
    bbox_clip_list = []
    areas = np.zeros(n_frame)
    start_idx = 0
    ROI_THE = 0.5  # split threshold on the frame/clip area ratio
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        bbox_max_prev = (x, x_max, y, y_max)
        # grow the running clip-wide bbox
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        bbox_max_cur = (x, x_max, y, y_max)
        # record the per-frame bbox
        bbox_cur = [x_, x_max_, y_, y_max_]
        bbox_perframe.append(bbox_cur)
        bbox_clip.append(bbox_cur)
        # frame and clip areas (scaled by 100; the scale cancels in the ratios)
        areas[i] = (x_max_ - x_) * (y_max_ - y_) / 100
        area_max = (y_max - y) * (x_max - x) / 100
        if area_max != 0:
            ratios = areas[start_idx:i] / area_max
        else:
            ratios = np.zeros(i - start_idx)
        if i == n_frame - 1:
            # flush the final clip
            i += 1
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                # prepend up to `overlay` frames of context from the previous clip
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_cur)
            update_clip(bbox_clip, start_idx, i, bbox_max_cur)
            start_idx = i
            continue
        elif np.any(ratios < ROI_THE) and ratios.sum() != 0:
            # the clip-wide bbox has outgrown some frame: close the clip at i
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_prev)
            update_clip(bbox_clip, start_idx, i, bbox_max_prev)
            x, x_max, y, y_max = bbox_cur
            start_idx = i
            continue
    # per-frame area ratios, kept for inspection
    for i in range(0, n_frame):
        bbox_frame_ = bbox_perframe[i]
        bbox_clip_ = bbox_clip[i]
        if np.array(bbox_clip_).sum() == 0:
            ratio = 0
        else:
            ratio = compute_area_ratio(bbox_frame_, bbox_clip_)
        ratio_list.append(ratio)
    # crop every frame of every context window with its clip bbox
    frames_res = []
    vid_res = []
    bk_res = []
    for k, context in enumerate(context_list):
        for i in context:
            frame = np.array(pose_images[i])
            x, x_max, y, y_max = bbox_clip_list[k]
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
            vid = np.array(vid_images[i])
            vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
            bk = np.array(bk_images[i])
            bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list
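
# Usage sketch (illustrative): with aligned lists of PIL frames, the function
# returns cropped pose/video/background frames plus the context windows and
# one bbox per clip. A static synthetic blob yields a single clip.
def _demo_auto_context():
    frame = np.zeros((256, 256, 3), dtype=np.uint8)
    frame[64:192, 96:160] = 255  # stand-in for a rendered human
    imgs = [Image.fromarray(frame)] * 8
    poses, vids, bks, bbox_clip, contexts, clip_bboxes = \
        crop_human_clip_auto_context(imgs, imgs, imgs, overlay=4)
    print(contexts)  # [[0, 1, ..., 7]]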
def crop_human_clip(pose_images, vid_images, bk_images, clip_length=1):
    # one shared bbox per fixed-length clip
    bbox_clip = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        if ((i + 1) % clip_length == 0) or (i == n_frame - 1):
            x, x_max, y, y_max = bbox_div2(x, x_max, y, y_max)
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            bbox_clip.append([x, x_max, y, y_max])
            x, x_max, y, y_max = init_bbox()
    # crop every frame with its clip's bbox
    frames_res = []
    vid_res = []
    bk_res = []
    for i, pose_img in enumerate(pose_images):
        x, x_max, y, y_max = bbox_clip[i // clip_length]
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        bk = np.array(bk_images[i])
        bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip
def init_bk(n_frame, h, w):
    # n_frame blank white frames
    return [Image.fromarray(np.ones((h, w, 3), dtype=np.uint8) * 255) for _ in range(n_frame)]
def pose_adjust(pose_image, width=512, height=784):
    # resize to the target height, then center-pad or center-crop to the target width
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    pose_img = np.array(pose_image)
    h, w, c = pose_img.shape
    nh, nw = height, int(w * height / h)
    pose_img = cv2.resize(pose_img, (nw, nh), interpolation=cv2.INTER_AREA)
    if nw < width:
        # pad symmetrically onto the black canvas
        pad = (width - nw) // 2
        canvas[:, pad:pad + nw, :] = pose_img
    else:
        # center crop
        crop = (nw - width) // 2
        canvas = pose_img[:, crop:crop + width, :]
    return Image.fromarray(canvas)
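
# Usage sketch (illustrative): a 960x540 pose frame is scaled to height 784
# (width 441), then centered on a 512-wide canvas.
def _demo_pose_adjust():
    pose = Image.fromarray(np.zeros((960, 540, 3), dtype=np.uint8))
    canvas = pose_adjust(pose, width=512, height=784)
    assert canvas.size == (512, 784)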
def load_pretrain_pose_guider(model, ckpt_path):
    # triple the conv_in input channels by padding the pretrained weights with
    # zeros, so the extra input channels start out as no-ops
    state_dict = torch.load(ckpt_path, map_location="cpu")
    weights = state_dict['conv_in.weight']
    weights = torch.cat((weights, torch.zeros_like(weights), torch.zeros_like(weights)), dim=1)
    state_dict['conv_in.weight'] = weights
    model.load_state_dict(state_dict, strict=True)
    return model
def refine_img_prepross(image, mask):
    # stack the mask onto the image as an extra channel
    im_ary = np.asarray(image).astype(np.float32)
    return np.concatenate([im_ary, mask[:, :, np.newaxis]], axis=-1)
mask_mode = {'up_down_left_right': 0, 'left_right_up': 1, 'left_right_down': 2, 'up_down_left': 3, 'up_down_right': 4,
             'left_right': 5, 'up_down': 6, 'left_up': 7, 'right_up': 8, 'left_down': 9, 'right_down': 10,
             'left': 11, 'right': 12, 'up': 13, 'down': 14, 'inner': 15}
def get_mask(mask_list, bbox, img):
    # pick the precomputed mask whose open sides match where the bbox spills
    # past the image borders
    w, h = img.size
    w_min, w_max, h_min, h_max = bbox
    if w_min <= 0 and w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_left_right'
    elif w_min <= 0 and w_max >= w and h_min <= 0:
        mode = 'left_right_up'
    elif w_min <= 0 and w_max >= w and h_max >= h:
        mode = 'left_right_down'
    elif w_min <= 0 and h_min <= 0 and h_max >= h:
        mode = 'up_down_left'
    elif w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_right'
    elif w_min <= 0 and w_max >= w:
        mode = 'left_right'
    elif h_min <= 0 and h_max >= h:
        mode = 'up_down'
    elif w_min <= 0 and h_min <= 0:
        mode = 'left_up'
    elif w_max >= w and h_min <= 0:
        mode = 'right_up'
    elif w_min <= 0 and h_max >= h:
        mode = 'left_down'
    elif w_max >= w and h_max >= h:
        mode = 'right_down'
    elif w_min <= 0:
        mode = 'left'
    elif w_max >= w:
        mode = 'right'
    elif h_min <= 0:
        mode = 'up'
    elif h_max >= h:
        mode = 'down'
    else:
        mode = 'inner'
    return mask_list[mask_mode[mode]]
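
# Usage sketch (illustrative): a bbox that spills past the left and right
# borders but not the top or bottom selects the 'left_right' mask.
def _demo_get_mask():
    img = Image.fromarray(np.zeros((128, 128, 3), dtype=np.uint8))
    masks = [np.full((128, 128), i, dtype=np.float32) for i in range(len(mask_mode))]
    m = get_mask(masks, (-4, 132, 20, 100), img)
    assert m[0, 0] == mask_mode['left_right']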
def load_mask_list(mask_path):
    # load one grayscale mask per mode, e.g. foo.png -> foo_left_right.png
    mask_list = []
    for key in mask_mode.keys():
        mask = cv2.imread(mask_path[:-4] + '_%s.png' % key)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
        mask_list.append(mask)
    return mask_list
def recover_bk(images, start_idx, end_idx, template_name=None):
    # blank out frames in [start_idx, end_idx) with white
    img = np.array(images[0])
    for i in range(start_idx, end_idx):
        if template_name == "dance_indoor_1":
            # whiten in place; this branch assumes the frames are numpy arrays,
            # since PIL images do not support this slicing
            images[i][:img.shape[0], :, :] = 255
        else:
            images[i] = Image.fromarray(np.ones_like(img) * 255)
    return images
def load_video_fixed_fps(vid_path, target_fps=30, target_speed=1):
    # resample a video to target_fps, optionally speeding it up by target_speed
    reader = imageio.get_reader(vid_path)
    fps = round(reader.get_meta_data()['fps'])
    # the ratio of source fps to target fps decides which frames to keep
    keep_ratio = target_speed * fps / target_fps
    n_frames = reader.count_frames()
    keep_frames_indices = np.arange(0, n_frames, keep_ratio).astype(int)
    frames = [Image.fromarray(reader.get_data(i)) for i in keep_frames_indices if i < n_frames]
    reader.close()
    return frames
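
# Usage sketch (illustrative; "input.mp4" is a placeholder path): load a clip
# resampled to 30 fps, then blank frames 10-19 with recover_bk.
def _demo_load_video():
    frames = load_video_fixed_fps("input.mp4", target_fps=30)
    frames = recover_bk(frames, 10, 20)
    print(len(frames), frames[0].size)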