import cv2
import torch
import random
import numpy as np
import torch.nn.functional as F
from lib.core.config import cfg
from lib.utils.human_models import mano


def get_aug_config_contact():
    # Augmentation intensity factors
    scale_factor = 0.25
    rot_factor = 30
    color_factor = 0.2
    trans_factor = 0.1  # Translation range (recommended 0.1 to 0.2)
    noise_std = 0.02  # Gaussian noise strength
    motion_blur_prob = 0.15  # Probability of applying motion blur
    extreme_crop_prob = 0.1  # Probability of extreme cropping
    extreme_crop_lvl = 0.3  # Crop intensity (recommended 0.2 to 0.4)
    low_res_prob = 0.05  # Probability of applying low resolution
    low_res_scale_range = (0.15, 0.5)  # Range for low-res scaling

    # Scaling augmentation
    scale = np.clip(np.random.randn(), -1.0, 1.0) * scale_factor + 1.0

    # Rotation augmentation
    rot = np.clip(np.random.randn(), -2.0, 2.0) * rot_factor if random.random() <= 0.6 else 0

    # Color augmentation
    c_up = 1.0 + color_factor
    c_low = 1.0 - color_factor
    color_scale = np.array([
        random.uniform(c_low, c_up),
        random.uniform(c_low, c_up),
        random.uniform(c_low, c_up)
    ])

    # Flipping augmentation
    do_flip = random.random() <= 0.5

    # Translation augmentation
    tx = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor
    ty = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor

    # Extreme cropping augmentation
    do_extreme_crop = random.random() <= extreme_crop_prob

    # Noise augmentation (the standard deviation is used for Gaussian noise injection)
    add_noise = random.random() <= 0.3  # 30% chance of adding noise
    noise_std = noise_std if add_noise else 0.0

    # Motion blur augmentation
    apply_motion_blur = random.random() <= motion_blur_prob
    motion_blur_kernel_size = random.choice([3, 5, 7]) if apply_motion_blur else 0

    # Low-resolution augmentation
    apply_low_res = random.random() <= low_res_prob
    low_res_scale = random.uniform(*low_res_scale_range) if apply_low_res else 1.0

    return {
        'scale': scale,
        'rot': rot,
        'color_scale': color_scale,
        'do_flip': do_flip,
        'tx': tx,
        'ty': ty,
        'do_extreme_crop': do_extreme_crop,
        'extreme_crop_lvl': extreme_crop_lvl if do_extreme_crop else 0,
        'noise_std': noise_std,
        'motion_blur_kernel_size': motion_blur_kernel_size,
        'low_res_scale': low_res_scale
    }


def rotate_2d(pt_2d, rot_rad):
    x = pt_2d[0]
    y = pt_2d[1]
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    xx = x * cs - y * sn
    yy = x * sn + y * cs
    return np.array([xx, yy], dtype=np.float32)


def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
    # Augment size with scale
    src_w = src_width * scale
    src_h = src_height * scale
    src_center = np.array([c_x, c_y], dtype=np.float32)

    # Augment rotation
    rot_rad = np.pi * rot / 180
    src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
    src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)

    dst_w = dst_width
    dst_h = dst_height
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
    dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
    dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = src_center
    src[1, :] = src_center + src_downdir
    src[2, :] = src_center + src_rightdir

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_downdir
    dst[2, :] = dst_center + dst_rightdir

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
    trans = trans.astype(np.float32)
    return trans
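# A minimal sanity check for the transform above (illustrative only; this
# helper is not part of the original pipeline). With unit scale and no
# rotation, the bbox center should map to the center of the output patch.
def _demo_gen_trans_from_patch_cv():
    trans = gen_trans_from_patch_cv(320, 240, 200, 200, 256, 256, scale=1.0, rot=0.0)
    patch_xy = trans @ np.array([320, 240, 1], dtype=np.float32)  # homogeneous point
    assert np.allclose(patch_xy, [128, 128], atol=1e-3)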
def generate_patch_image_contact(cvimg, bbox, scale, rot, do_flip, out_shape, tx=0.0, ty=0.0, bkg_color='black'):
    img = cvimg.copy()
    img_height, img_width, img_channels = img.shape

    bb_c_x = float(bbox[0] + 0.5 * bbox[2])
    bb_c_y = float(bbox[1] + 0.5 * bbox[3])
    bb_width = float(bbox[2])
    bb_height = float(bbox[3])

    # Constant border fill: white or black background outside the warped patch
    borderMode = cv2.BORDER_CONSTANT
    borderValue = (255, 255, 255) if bkg_color == 'white' else (0, 0, 0)

    if do_flip:
        img = img[:, ::-1, :]
        bb_c_x = img_width - bb_c_x - 1

    # Add translation offset
    bb_c_x += tx * img_width
    bb_c_y += ty * img_height

    trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, out_shape[1], out_shape[0], scale, rot)
    img_patch = cv2.warpAffine(img, trans, (int(out_shape[1]), int(out_shape[0])),
                               flags=cv2.INTER_LINEAR, borderMode=borderMode, borderValue=borderValue)
    img_patch = img_patch.astype(np.float32)

    inv_trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, out_shape[1], out_shape[0], scale, rot, inv=True)
    return img_patch, trans, inv_trans


def augmentation_contact(img, bbox, data_split, enforce_flip=None, bkg_color='black'):
    if data_split == 'train':
        aug_params = get_aug_config_contact()
    else:
        aug_params = {
            'scale': 1.0,
            'rot': 0.0,
            'color_scale': np.array([1, 1, 1]),
            'do_flip': False,
            'tx': 0.0,
            'ty': 0.0,
            'do_extreme_crop': False,
            'extreme_crop_lvl': 0.0,
            'noise_std': 0.0,
            'motion_blur_kernel_size': 0,
            'low_res_scale': 1.0  # No low-res outside of training
        }

    # Enforce flip if specified
    if enforce_flip is not None:
        aug_params['do_flip'] = enforce_flip

    # Apply geometric augmentations (scaling, rotation, flipping, translation)
    img, trans, inv_trans = generate_patch_image_contact(
        img, bbox, aug_params['scale'], aug_params['rot'], aug_params['do_flip'],
        cfg.MODEL.input_img_shape, aug_params['tx'], aug_params['ty'], bkg_color
    )

    # Apply low-resolution augmentation (only when scaling down)
    if aug_params['low_res_scale'] < 1.0:
        img = apply_low_res(img, aug_params['low_res_scale'])

    # Apply color augmentation
    img = np.clip(img * aug_params['color_scale'][None, None, :], 0, 255)

    # Apply extreme cropping
    if aug_params['do_extreme_crop']:
        img = apply_extreme_crop(img, aug_params['extreme_crop_lvl'])

    # Apply noise augmentation
    if aug_params['noise_std'] > 0:
        img = add_gaussian_noise(img, aug_params['noise_std'])

    # Apply motion blur augmentation
    if aug_params['motion_blur_kernel_size'] > 0:
        img = apply_motion_blur(img, aug_params['motion_blur_kernel_size'])

    return img, trans, inv_trans, aug_params['rot'], aug_params['do_flip'], aug_params['color_scale']


def apply_extreme_crop(img, crop_lvl):
    """Extreme cropping: aggressively crop a square region and resize it back."""
    h, w = img.shape[:2]
    crop_size = max(1, int(min(h, w) * (1 - crop_lvl)))  # Prevent zero-size crops
    start_x = random.randint(0, max(0, w - crop_size))
    start_y = random.randint(0, max(0, h - crop_size))
    cropped_img = img[start_y:start_y + crop_size, start_x:start_x + crop_size]
    # Resize back to the original patch size (the square crop is stretched to
    # w x h, so the aspect ratio is distorted for non-square patches)
    return cv2.resize(cropped_img, (w, h), interpolation=cv2.INTER_LINEAR)


def add_gaussian_noise(img, noise_std):
    """Add Gaussian noise; `noise_std` is a fraction of the image's dynamic range."""
    noise = np.random.normal(0, noise_std, img.shape).astype(np.float32)
    if img.dtype == np.uint8:
        return np.clip(img.astype(np.float32) + noise * 255, 0, 255).astype(np.uint8)
    elif img.dtype in (np.float32, np.float64):
        # Float patches produced by the pipeline above are in [0, 255]; scale the
        # noise and the clipping range accordingly, falling back to [0, 1] inputs.
        max_val = 255.0 if img.max() > 1.0 else 1.0
        return np.clip(img + noise * max_val, 0.0, max_val).astype(img.dtype)
    else:
        raise TypeError("Unsupported image dtype. Expected uint8, float32, or float64.")
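# A small illustration of the noise helper (hypothetical usage, not called by
# the pipeline): noise_std=0.02 corresponds to a standard deviation of roughly
# 5 intensity levels on a 0-255 image.
def _demo_add_gaussian_noise():
    img = np.full((8, 8, 3), 128, dtype=np.uint8)
    noisy = add_gaussian_noise(img, noise_std=0.02)
    assert noisy.dtype == np.uint8 and noisy.shape == img.shape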
def apply_motion_blur(img, kernel_size):
    """Apply motion blur to the image with a random direction."""
    kernel = np.zeros((kernel_size, kernel_size))
    direction = random.choice(['horizontal', 'vertical', 'diagonal'])
    if direction == 'horizontal':
        kernel[(kernel_size - 1) // 2, :] = np.ones(kernel_size)
    elif direction == 'vertical':
        kernel[:, (kernel_size - 1) // 2] = np.ones(kernel_size)
    elif direction == 'diagonal':
        np.fill_diagonal(kernel, 1)
    kernel /= kernel_size  # Normalize the kernel (each variant has exactly kernel_size ones)
    return cv2.filter2D(img, -1, kernel, borderType=cv2.BORDER_REFLECT)


def apply_low_res(img, scale_factor=0.25):
    """Simulate a low-resolution effect by downsampling and upsampling."""
    if not (0 < scale_factor < 1):
        raise ValueError("scale_factor should be between 0 and 1.")
    h, w = img.shape[:2]

    # Calculate target dimensions for downsampling
    downsampled_size = (max(1, int(w * scale_factor)), max(1, int(h * scale_factor)))

    # Downsample with INTER_AREA for better quality under aggressive downsampling
    low_res_img = cv2.resize(img, downsampled_size, interpolation=cv2.INTER_AREA)

    # Upsample with INTER_NEAREST for a strong pixelation effect
    return cv2.resize(low_res_img, (w, h), interpolation=cv2.INTER_NEAREST).astype(img.dtype)
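# Illustrative usage of the helper above (hypothetical, not called by the
# pipeline): a 64x64 patch is downsampled to 16x16 and pixelated back up.
def _demo_apply_low_res():
    img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
    pixelated = apply_low_res(img, scale_factor=0.25)
    assert pixelated.shape == img.shape and pixelated.dtype == img.dtype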
def process_human_model_output_orig(human_model_param, cam_param):
    pose, shape, trans = human_model_param['pose'], human_model_param['shape'], human_model_param['trans']
    hand_type = human_model_param['hand_type']

    # MANO parameters (pose: 48 dimensions, shape: 10 dimensions) and translation vector
    pose = torch.FloatTensor(pose).view(-1, 3)
    shape = torch.FloatTensor(shape).view(1, -1)
    trans = torch.FloatTensor(trans).view(1, -1)

    # Apply camera extrinsic (rotation): merge root pose and camera rotation
    if 'R' in cam_param:
        R = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3)
        root_pose = pose[mano.orig_root_joint_idx, :].numpy()
        root_pose, _ = cv2.Rodrigues(root_pose)
        root_pose, _ = cv2.Rodrigues(np.dot(R, root_pose))
        pose[mano.orig_root_joint_idx] = torch.from_numpy(root_pose).view(3)

    # Get root joint coordinate
    root_pose = pose[mano.orig_root_joint_idx].view(1, 3)
    hand_pose = torch.cat((pose[:mano.orig_root_joint_idx, :], pose[mano.orig_root_joint_idx + 1:, :])).view(1, -1)
    with torch.no_grad():
        output = mano.layer[hand_type](betas=shape, hand_pose=hand_pose, global_orient=root_pose, transl=trans)
    mesh_coord = output.vertices[0].numpy()
    joint_coord = np.dot(mano.joint_regressor, mesh_coord)

    # Apply camera extrinsic (translation)
    # Compensate rotation (the translation from the origin to the root joint was not canceled)
    if 'R' in cam_param and 't' in cam_param:
        R = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3)
        t = np.array(cam_param['t'], dtype=np.float32).reshape(1, 3)
        root_coord = joint_coord[mano.root_joint_idx, None, :]
        joint_coord = joint_coord - root_coord + np.dot(R, root_coord.transpose(1, 0)).transpose(1, 0) + t
        mesh_coord = mesh_coord - root_coord + np.dot(R, root_coord.transpose(1, 0)).transpose(1, 0) + t

    joint_cam_orig = joint_coord.copy()
    mesh_cam_orig = mesh_coord.copy()
    pose_orig = torch.cat((root_pose, hand_pose), dim=-1)[0].detach().cpu().numpy()
    shape_orig = shape[0].detach().cpu().numpy()
    trans_orig = trans[0].detach().cpu().numpy()
    return mesh_cam_orig, joint_cam_orig, pose_orig, shape_orig, trans_orig


def mask2bbox(mask, expansion_factor=1.0):
    """Convert a binary mask to an (x_min, y_min, width, height) bounding box.

    With the default expansion_factor=1.0, the box grows by 50% of its
    width/height on each side, clamped to the image bounds.
    """
    # Find non-zero elements (object pixels)
    coords = np.argwhere(mask)
    if coords.size == 0:
        raise ValueError("mask2bbox received an empty mask.")

    # Extract bounding box coordinates
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)

    # Compute width and height
    width = x_max - x_min + 1
    height = y_max - y_min + 1

    # Expand the bounding box
    if expansion_factor > 0:
        x_min = max(0, int(x_min - width * expansion_factor / 2))
        y_min = max(0, int(y_min - height * expansion_factor / 2))
        x_max = min(mask.shape[1] - 1, int(x_max + width * expansion_factor / 2))
        y_max = min(mask.shape[0] - 1, int(y_max + height * expansion_factor / 2))

        # Recalculate width and height after expansion
        width = x_max - x_min + 1
        height = y_max - y_min + 1

    return (x_min, y_min, width, height)
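# A quick worked example for mask2bbox (hypothetical helper, not used by the
# pipeline): with no expansion, the box tightly encloses the foreground pixels.
def _demo_mask2bbox():
    mask = np.zeros((10, 10), dtype=np.uint8)
    mask[2:5, 3:8] = 1  # foreground spans rows 2-4, cols 3-7
    assert mask2bbox(mask, expansion_factor=0.0) == (3, 2, 5, 3)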