import cv2
import torch
import random
import numpy as np
from lib.core.config import cfg
from lib.utils.human_models import mano
def get_aug_config_contact():
# Augmentation intensity factors
scale_factor = 0.25
rot_factor = 30
color_factor = 0.2
trans_factor = 0.1 # Translation range (recommended 0.1 to 0.2)
noise_std = 0.02 # Gaussian noise strength
motion_blur_prob = 0.15 # Probability of applying motion blur
extreme_crop_prob = 0.1 # Probability for extreme cropping
extreme_crop_lvl = 0.3 # Crop intensity (recommended 0.2 to 0.4)
low_res_prob = 0.05 # Probability for applying low resolution
low_res_scale_range = (0.15, 0.5) # Range for low-res scaling
# Scaling augmentation
scale = np.clip(np.random.randn(), -1.0, 1.0) * scale_factor + 1.0
# Rotation augmentation
rot = np.clip(np.random.randn(), -2.0, 2.0) * rot_factor if random.random() <= 0.6 else 0
# Color augmentation
c_up = 1.0 + color_factor
c_low = 1.0 - color_factor
color_scale = np.array([
random.uniform(c_low, c_up),
random.uniform(c_low, c_up),
random.uniform(c_low, c_up)
])
# Flipping augmentation
do_flip = random.random() <= 0.5
# Translation augmentation
tx = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor
ty = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor
# Extreme cropping augmentation
do_extreme_crop = random.random() <= extreme_crop_prob
# Noise augmentation (returns standard deviation for Gaussian noise injection)
add_noise = random.random() <= 0.3 # 30% chance of adding noise
noise_std = noise_std if add_noise else 0.0
# Motion blur augmentation
apply_motion_blur = random.random() <= motion_blur_prob
motion_blur_kernel_size = random.choice([3, 5, 7]) if apply_motion_blur else 0
# Low-resolution augmentation
apply_low_res = random.random() <= low_res_prob
low_res_scale = random.uniform(*low_res_scale_range) if apply_low_res else 1.0
return {
'scale': scale,
'rot': rot,
'color_scale': color_scale,
'do_flip': do_flip,
'tx': tx,
'ty': ty,
'do_extreme_crop': do_extreme_crop,
'extreme_crop_lvl': extreme_crop_lvl if do_extreme_crop else 0,
'noise_std': noise_std,
'motion_blur_kernel_size': motion_blur_kernel_size,
        'low_res_scale': low_res_scale  # 1.0 means no low-res degradation applied
}
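# Minimal usage sketch for get_aug_config_contact (illustrative only; all keys
# printed below come straight from the dict returned above):
def _demo_aug_config():
    params = get_aug_config_contact()
    print({k: params[k] for k in ('scale', 'rot', 'do_flip', 'noise_std')})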
def rotate_2d(pt_2d, rot_rad):
x = pt_2d[0]
y = pt_2d[1]
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
xx = x * cs - y * sn
yy = x * sn + y * cs
return np.array([xx, yy], dtype=np.float32)
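# Worked example for rotate_2d: rotating the unit x-axis by pi/2 radians yields
# the unit y-axis, i.e. counter-clockwise rotation in a standard (x right, y up)
# frame; in image coordinates (y down) the same matrix acts clockwise.
def _demo_rotate_2d():
    print(rotate_2d(np.array([1.0, 0.0], dtype=np.float32), np.pi / 2))  # ~[0., 1.]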
def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
# augment size with scale
src_w = src_width * scale
src_h = src_height * scale
src_center = np.array([c_x, c_y], dtype=np.float32)
# augment rotation
rot_rad = np.pi * rot / 180
src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)
dst_w = dst_width
dst_h = dst_height
dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = src_center
src[1, :] = src_center + src_downdir
src[2, :] = src_center + src_rightdir
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = dst_center
dst[1, :] = dst_center + dst_downdir
dst[2, :] = dst_center + dst_rightdir
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
trans = trans.astype(np.float32)
return trans
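# Sketch: map a source-image point through the 2x3 affine returned by
# gen_trans_from_patch_cv and back through its inverse. The 200x200 box and
# 256x256 patch below are illustrative values, not values from this codebase.
def _demo_patch_transform():
    args = dict(c_x=320, c_y=240, src_width=200, src_height=200,
                dst_width=256, dst_height=256, scale=1.0, rot=0.0)
    trans = gen_trans_from_patch_cv(**args)
    inv_trans = gen_trans_from_patch_cv(**args, inv=True)
    pt_src = np.array([320.0, 240.0, 1.0], dtype=np.float32)  # homogeneous coords
    pt_dst = trans @ pt_src  # box center -> patch center (128, 128)
    print(pt_dst, inv_trans @ np.append(pt_dst, 1.0))  # round-trips to (320, 240)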
def generate_patch_image_contact(cvimg, bbox, scale, rot, do_flip, out_shape, tx=0.0, ty=0.0, bkg_color='black'):
img = cvimg.copy()
img_height, img_width, img_channels = img.shape
bb_c_x = float(bbox[0] + 0.5 * bbox[2])
bb_c_y = float(bbox[1] + 0.5 * bbox[3])
bb_width = float(bbox[2])
bb_height = float(bbox[3])
    borderMode = cv2.BORDER_CONSTANT
    borderValue = (255, 255, 255) if bkg_color == 'white' else (0, 0, 0)
if do_flip:
img = img[:, ::-1, :]
bb_c_x = img_width - bb_c_x - 1
# Add translation offset
bb_c_x += tx * img_width
bb_c_y += ty * img_height
trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height,
out_shape[1], out_shape[0], scale, rot)
img_patch = cv2.warpAffine(img, trans, (int(out_shape[1]), int(out_shape[0])), flags=cv2.INTER_LINEAR, borderMode=borderMode, borderValue=borderValue)
img_patch = img_patch.astype(np.float32)
inv_trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height,
out_shape[1], out_shape[0], scale, rot, inv=True)
return img_patch, trans, inv_trans
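# Sketch: crop a patch from a synthetic image. The random image, bbox, and
# 256x256 output shape are placeholders; in this codebase the output shape
# comes from cfg.MODEL.input_img_shape and the bbox from mask2bbox or a detector.
def _demo_generate_patch():
    img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
    bbox = (200, 100, 240, 240)  # (x_min, y_min, width, height)
    patch, trans, inv_trans = generate_patch_image_contact(
        img, bbox, scale=1.0, rot=0.0, do_flip=False, out_shape=(256, 256))
    print(patch.shape, trans.shape, inv_trans.shape)  # (256, 256, 3) (2, 3) (2, 3)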
def augmentation_contact(img, bbox, data_split, enforce_flip=None, bkg_color='black'):
if data_split == 'train':
aug_params = get_aug_config_contact()
else:
aug_params = {
'scale': 1.0,
'rot': 0.0,
'color_scale': np.array([1, 1, 1]),
'do_flip': False,
'tx': 0.0,
'ty': 0.0,
'do_extreme_crop': False,
'extreme_crop_lvl': 0.0,
'noise_std': 0.0,
'motion_blur_kernel_size': 0,
'low_res_scale': 1.0 # No low-res in non-training mode
}
# Enforce flip if specified
if enforce_flip is not None:
aug_params['do_flip'] = enforce_flip
# Apply geometric augmentations (scaling, rotation, flipping)
img, trans, inv_trans = generate_patch_image_contact(
img, bbox, aug_params['scale'], aug_params['rot'],
aug_params['do_flip'], cfg.MODEL.input_img_shape,
aug_params['tx'], aug_params['ty'], bkg_color
)
# Apply low-resolution augmentation
if aug_params['low_res_scale'] < 1.0: # Only apply if scaling down
img = apply_low_res(img, aug_params['low_res_scale'])
# Apply color augmentation
img = np.clip(img * aug_params['color_scale'][None, None, :], 0, 255)
# Apply extreme cropping
if aug_params['do_extreme_crop']:
img = apply_extreme_crop(img, aug_params['extreme_crop_lvl'])
# Apply noise augmentation
if aug_params['noise_std'] > 0:
img = add_gaussian_noise(img, aug_params['noise_std'])
# Apply motion blur augmentation
if aug_params['motion_blur_kernel_size'] > 0:
img = apply_motion_blur(img, aug_params['motion_blur_kernel_size'])
return img, trans, inv_trans, aug_params['rot'], aug_params['do_flip'], aug_params['color_scale']
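# Sketch: a full augmentation pass as a training dataloader would run it.
# cfg.MODEL.input_img_shape must already be configured, since it is read inside
# augmentation_contact; the image and bbox below are placeholders.
def _demo_augmentation():
    img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
    bbox = (200, 100, 240, 240)
    img_aug, trans, inv_trans, rot, do_flip, color_scale = augmentation_contact(
        img, bbox, data_split='train')
    print(img_aug.shape, rot, do_flip, color_scale)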
def apply_extreme_crop(img, crop_lvl):
"""Extreme cropping: Aggressively crop the image."""
h, w = img.shape[:2]
crop_size = max(1, int(min(h, w) * (1 - crop_lvl))) # Prevent zero-size crops
start_x = random.randint(0, max(0, w - crop_size))
start_y = random.randint(0, max(0, h - crop_size))
cropped_img = img[start_y:start_y + crop_size, start_x:start_x + crop_size]
    # Resize the square crop back to the original (w, h) patch size; note that
    # this stretches the crop (aspect ratio is not preserved unless w == h)
    return cv2.resize(cropped_img, (w, h), interpolation=cv2.INTER_LINEAR)
def add_gaussian_noise(img, noise_std):
    """Add Gaussian noise; noise_std is expressed as a fraction of the dynamic range."""
    noise = np.random.normal(0, noise_std, img.shape).astype(np.float32)
    if img.dtype == np.uint8:
        noisy_img = np.clip(img + noise * 255, 0, 255).astype(np.uint8)
    elif img.dtype in (np.float32, np.float64):
        # Float patches in this pipeline are kept in the [0, 255] range
        # (see augmentation_contact above), so scale and clip accordingly.
        noisy_img = np.clip(img + noise * 255, 0, 255).astype(img.dtype)
    else:
        raise TypeError("Unsupported image dtype. Expected uint8, float32, or float64.")
    return noisy_img
def apply_motion_blur(img, kernel_size):
"""Apply motion blur to the image with a random direction."""
kernel = np.zeros((kernel_size, kernel_size))
direction = random.choice(['horizontal', 'vertical', 'diagonal'])
if direction == 'horizontal':
kernel[(kernel_size - 1) // 2, :] = np.ones(kernel_size)
elif direction == 'vertical':
kernel[:, (kernel_size - 1) // 2] = np.ones(kernel_size)
elif direction == 'diagonal':
np.fill_diagonal(kernel, 1)
kernel /= kernel_size # Normalize the kernel
return cv2.filter2D(img, -1, kernel, borderType=cv2.BORDER_REFLECT)
def apply_low_res(img, scale_factor=0.25):
"""Simulate low-resolution effect by downsampling and upsampling."""
if not (0 < scale_factor < 1):
raise ValueError("scale_factor should be between 0 and 1.")
h, w = img.shape[:2]
# Calculate target dimensions for downsampling
downsampled_size = (max(1, int(w * scale_factor)), max(1, int(h * scale_factor)))
# Downsample using INTER_AREA for better quality in aggressive downsampling
low_res_img = cv2.resize(img, downsampled_size, interpolation=cv2.INTER_AREA)
# Upsample using INTER_NEAREST for strong pixelation effect
return cv2.resize(low_res_img, (w, h), interpolation=cv2.INTER_NEAREST).astype(img.dtype)
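# Sketch: chain the three photometric corruptions above on a float32 patch in
# the [0, 255] range that this pipeline uses; the parameter values mirror the
# defaults sampled in get_aug_config_contact.
def _demo_photometric_aug():
    patch = np.random.rand(256, 256, 3).astype(np.float32) * 255
    patch = add_gaussian_noise(patch, noise_std=0.02)
    patch = apply_motion_blur(patch, kernel_size=5)
    patch = apply_low_res(patch, scale_factor=0.25)
    print(patch.shape, patch.dtype, patch.min(), patch.max())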
def process_human_model_output_orig(human_model_param, cam_param):
    pose, shape, trans = human_model_param['pose'], human_model_param['shape'], human_model_param['trans']
    hand_type = human_model_param['hand_type']
    pose = torch.FloatTensor(pose).view(-1, 3)  # MANO pose parameters (48 dimensions: 16 joints x 3)
    shape = torch.FloatTensor(shape).view(1, -1)  # MANO shape parameters (10 dimensions)
    trans = torch.FloatTensor(trans).view(1, -1)  # translation vector
# apply camera extrinsic (rotation)
# merge root pose and camera rotation
if 'R' in cam_param:
R = np.array(cam_param['R'], dtype=np.float32).reshape(3,3)
root_pose = pose[mano.orig_root_joint_idx,:].numpy()
root_pose, _ = cv2.Rodrigues(root_pose)
root_pose, _ = cv2.Rodrigues(np.dot(R,root_pose))
pose[mano.orig_root_joint_idx] = torch.from_numpy(root_pose).view(3)
# get root joint coordinate
root_pose = pose[mano.orig_root_joint_idx].view(1,3)
hand_pose = torch.cat((pose[:mano.orig_root_joint_idx,:], pose[mano.orig_root_joint_idx+1:,:])).view(1,-1)
with torch.no_grad():
output = mano.layer[hand_type](betas=shape, hand_pose=hand_pose, global_orient=root_pose, transl=trans)
mesh_coord = output.vertices[0].numpy()
joint_coord = np.dot(mano.joint_regressor, mesh_coord)
    # apply camera extrinsic (translation)
    # compensate for rotation (the translation from origin to root joint was not canceled)
if 'R' in cam_param and 't' in cam_param:
R, t = np.array(cam_param['R'], dtype=np.float32).reshape(3,3), np.array(cam_param['t'], dtype=np.float32).reshape(1,3)
root_coord = joint_coord[mano.root_joint_idx,None,:]
joint_coord = joint_coord - root_coord + np.dot(R, root_coord.transpose(1,0)).transpose(1,0) + t
mesh_coord = mesh_coord - root_coord + np.dot(R, root_coord.transpose(1,0)).transpose(1,0) + t
joint_cam_orig = joint_coord.copy()
mesh_cam_orig = mesh_coord.copy()
    pose_orig = torch.cat((root_pose, hand_pose), dim=-1)[0].detach().cpu().numpy()
    shape_orig = shape[0].detach().cpu().numpy()
    trans_orig = trans[0].detach().cpu().numpy()
return mesh_cam_orig, joint_cam_orig, pose_orig, shape_orig, trans_orig
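# Sketch: decode a MANO annotation into camera-space mesh and joints. The zero
# parameters are placeholders, and keying mano.layer by 'right' assumes the
# layer-dict convention of lib.utils.human_models; 'R'/'t' follow the cam_param
# convention read above (3x3 rotation, 3-vector translation).
def _demo_process_mano():
    human_model_param = {
        'pose': np.zeros(48, dtype=np.float32),   # 16 joints x 3 axis-angle dims
        'shape': np.zeros(10, dtype=np.float32),  # MANO shape coefficients
        'trans': np.zeros(3, dtype=np.float32),
        'hand_type': 'right',
    }
    cam_param = {'R': np.eye(3, dtype=np.float32), 't': np.zeros(3, dtype=np.float32)}
    mesh, joints, pose, shape, trans = process_human_model_output_orig(human_model_param, cam_param)
    print(mesh.shape, joints.shape)  # (778, 3) MANO vertices and the regressed joints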
def mask2bbox(mask, expansion_factor=1.0):
    # Find non-zero elements (object pixels)
    coords = np.argwhere(mask)
    if coords.size == 0:
        raise ValueError("mask2bbox received an empty mask")
    # Extract bounding box coordinates
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
# Compute width and height
width = x_max - x_min + 1
height = y_max - y_min + 1
# Expand bounding box
if expansion_factor > 0:
x_min = max(0, int(x_min - width * expansion_factor / 2))
y_min = max(0, int(y_min - height * expansion_factor / 2))
x_max = min(mask.shape[1] - 1, int(x_max + width * expansion_factor / 2))
y_max = min(mask.shape[0] - 1, int(y_max + height * expansion_factor / 2))
# Recalculate width and height after expansion
width = x_max - x_min + 1
height = y_max - y_min + 1
return (x_min, y_min, width, height)
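# Sketch: recover an expanded bbox from a toy binary mask. With the default
# expansion_factor=1.0 the box grows by half its width/height on each side,
# clamped to the mask bounds.
def _demo_mask2bbox():
    mask = np.zeros((100, 100), dtype=np.uint8)
    mask[40:60, 30:70] = 1
    print(mask2bbox(mask))                        # expanded box
    print(mask2bbox(mask, expansion_factor=0.0))  # tight box: (30, 40, 40, 20)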