import cv2
import torch
import random
import numpy as np
import torch.nn.functional as F

from lib.core.config import cfg
from lib.utils.human_models import mano


def get_aug_config_contact():
    """Randomly sample augmentation parameters for training."""
    # Augmentation intensity factors
    scale_factor = 0.25
    rot_factor = 30
    color_factor = 0.2
    trans_factor = 0.1 # Translation range (recommended 0.1 to 0.2)
    noise_std = 0.02 # Gaussian noise strength
    motion_blur_prob = 0.15 # Probability of applying motion blur
    extreme_crop_prob = 0.1 # Probability for extreme cropping
    extreme_crop_lvl = 0.3 # Crop intensity (recommended 0.2 to 0.4)
    low_res_prob = 0.05 # Probability for applying low resolution
    low_res_scale_range = (0.15, 0.5) # Range for low-res scaling

    # Scaling augmentation
    scale = np.clip(np.random.randn(), -1.0, 1.0) * scale_factor + 1.0

    # Rotation augmentation
    rot = np.clip(np.random.randn(), -2.0, 2.0) * rot_factor if random.random() <= 0.6 else 0

    # Color augmentation
    c_up = 1.0 + color_factor
    c_low = 1.0 - color_factor
    color_scale = np.array([
        random.uniform(c_low, c_up),
        random.uniform(c_low, c_up),
        random.uniform(c_low, c_up)
    ])

    # Flipping augmentation
    do_flip = random.random() <= 0.5

    # Translation augmentation
    tx = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor
    ty = np.clip(np.random.randn(), -1.0, 1.0) * trans_factor

    # Extreme cropping augmentation
    do_extreme_crop = random.random() <= extreme_crop_prob

    # Noise augmentation (noise_std is the Gaussian standard deviation; 0.0 disables it)
    add_noise = random.random() <= 0.3  # 30% chance of adding noise
    noise_std = noise_std if add_noise else 0.0

    # Motion blur augmentation
    apply_motion_blur = random.random() <= motion_blur_prob
    motion_blur_kernel_size = random.choice([3, 5, 7]) if apply_motion_blur else 0

    # Low-resolution augmentation
    apply_low_res = random.random() <= low_res_prob
    low_res_scale = random.uniform(*low_res_scale_range) if apply_low_res else 1.0

    return {
        'scale': scale,
        'rot': rot,
        'color_scale': color_scale,
        'do_flip': do_flip,
        'tx': tx,
        'ty': ty,
        'do_extreme_crop': do_extreme_crop,
        'extreme_crop_lvl': extreme_crop_lvl if do_extreme_crop else 0,
        'noise_std': noise_std,
        'motion_blur_kernel_size': motion_blur_kernel_size,
        'low_res_scale': low_res_scale  # 1.0 disables the low-res simulation
    }


def rotate_2d(pt_2d, rot_rad):
    x = pt_2d[0]
    y = pt_2d[1]
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    xx = x * cs - y * sn
    yy = x * sn + y * cs
    return np.array([xx, yy], dtype=np.float32)


def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
    """Build the 2x3 affine transform that maps the scaled, rotated source box onto the destination patch."""
    # augment size with scale
    src_w = src_width * scale
    src_h = src_height * scale
    src_center = np.array([c_x, c_y], dtype=np.float32)

    # augment rotation
    rot_rad = np.pi * rot / 180
    src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
    src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)

    dst_w = dst_width
    dst_h = dst_height
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
    dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
    dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = src_center
    src[1, :] = src_center + src_downdir
    src[2, :] = src_center + src_rightdir

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_downdir
    dst[2, :] = dst_center + dst_rightdir
    
    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    trans = trans.astype(np.float32)
    return trans
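

# Minimal illustrative helper (a sketch, not part of the original module): the 2x3 affine
# matrices produced by gen_trans_from_patch_cv can also be applied to 2D points, e.g. to
# map joint coordinates into the cropped patch (use the inverse transform to map back).
def _apply_affine_to_points(points_2d, trans):
    # points_2d: (N, 2) float array; trans: (2, 3) affine matrix
    points_h = np.concatenate([points_2d, np.ones_like(points_2d[:, :1])], axis=1)  # (N, 3)
    return points_h @ trans.T  # (N, 2)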


def generate_patch_image_contact(cvimg, bbox, scale, rot, do_flip, out_shape, tx=0.0, ty=0.0, bkg_color='black'):
    """Crop the bbox region into an out_shape patch, applying scale, rotation, flip, and translation."""
    img = cvimg.copy()
    img_height, img_width, img_channels = img.shape

    bb_c_x = float(bbox[0] + 0.5 * bbox[2])
    bb_c_y = float(bbox[1] + 0.5 * bbox[3])
    bb_width = float(bbox[2])
    bb_height = float(bbox[3])

    borderMode = cv2.BORDER_CONSTANT
    borderValue = (255, 255, 255) if bkg_color == 'white' else (0, 0, 0)

    if do_flip:
        img = img[:, ::-1, :]
        bb_c_x = img_width - bb_c_x - 1

    # Add translation offset
    bb_c_x += tx * img_width
    bb_c_y += ty * img_height

    trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, 
                                    out_shape[1], out_shape[0], scale, rot)
    img_patch = cv2.warpAffine(img, trans, (int(out_shape[1]), int(out_shape[0])), flags=cv2.INTER_LINEAR, borderMode=borderMode, borderValue=borderValue)
    img_patch = img_patch.astype(np.float32)
    inv_trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, 
                                        out_shape[1], out_shape[0], scale, rot, inv=True)

    return img_patch, trans, inv_trans


def augmentation_contact(img, bbox, data_split, enforce_flip=None, bkg_color='black'):
    """Apply the full augmentation pipeline (geometric, color, noise, blur, low-res) to an image crop."""
    if data_split == 'train':
        aug_params = get_aug_config_contact()
    else:
        aug_params = {
            'scale': 1.0,
            'rot': 0.0,
            'color_scale': np.array([1, 1, 1]),
            'do_flip': False,
            'tx': 0.0,
            'ty': 0.0,
            'do_extreme_crop': False,
            'extreme_crop_lvl': 0.0,
            'noise_std': 0.0,
            'motion_blur_kernel_size': 0,
            'low_res_scale': 1.0  # No low-res in non-training mode
        }
    
    # Enforce flip if specified
    if enforce_flip is not None:
        aug_params['do_flip'] = enforce_flip

    # Apply geometric augmentations (scaling, rotation, flipping)
    img, trans, inv_trans = generate_patch_image_contact(
        img, bbox, aug_params['scale'], aug_params['rot'], 
        aug_params['do_flip'], cfg.MODEL.input_img_shape, 
        aug_params['tx'], aug_params['ty'], bkg_color
    )

    # Apply low-resolution augmentation
    if aug_params['low_res_scale'] < 1.0:  # Only apply if scaling down
        img = apply_low_res(img, aug_params['low_res_scale'])

    # Apply color augmentation
    img = np.clip(img * aug_params['color_scale'][None, None, :], 0, 255)

    # Apply extreme cropping
    if aug_params['do_extreme_crop']:
        img = apply_extreme_crop(img, aug_params['extreme_crop_lvl'])

    # Apply noise augmentation
    if aug_params['noise_std'] > 0:
        img = add_gaussian_noise(img, aug_params['noise_std'])

    # Apply motion blur augmentation
    if aug_params['motion_blur_kernel_size'] > 0:
        img = apply_motion_blur(img, aug_params['motion_blur_kernel_size'])

    return img, trans, inv_trans, aug_params['rot'], aug_params['do_flip'], aug_params['color_scale']
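

# Example usage (a sketch; the file name and bbox values are hypothetical, and bbox
# follows the [x, y, w, h] pixel format expected by generate_patch_image_contact):
#   img = cv2.imread('hand.jpg')
#   bbox = [100.0, 120.0, 200.0, 200.0]
#   patch, trans, inv_trans, rot, do_flip, color_scale = augmentation_contact(
#       img, bbox, data_split='train')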


def apply_extreme_crop(img, crop_lvl):
    """Extreme cropping: Aggressively crop the image."""
    h, w = img.shape[:2]
    crop_size = max(1, int(min(h, w) * (1 - crop_lvl)))  # Prevent zero-size crops
    start_x = random.randint(0, max(0, w - crop_size))
    start_y = random.randint(0, max(0, h - crop_size))
    cropped_img = img[start_y:start_y + crop_size, start_x:start_x + crop_size]
    
    # Resize back to the original resolution (the square crop is stretched if the image is not square)
    return cv2.resize(cropped_img, (w, h), interpolation=cv2.INTER_LINEAR)


def add_gaussian_noise(img, noise_std):
    """Add Gaussian noise to the image with proper scaling for data type."""
    noise = np.random.normal(0, noise_std, img.shape).astype(np.float32)
    
    if img.dtype == np.uint8:
        noisy_img = np.clip(img + noise * 255, 0, 255).astype(np.uint8)
    elif img.dtype in (np.float32, np.float64):
        # Float images may be in [0, 1] or [0, 255] (the patches produced above are floats in
        # [0, 255]), so scale the noise and clip to the image's actual value range.
        max_val = 255.0 if img.max() > 1.0 else 1.0
        noisy_img = np.clip(img + noise * max_val, 0.0, max_val).astype(img.dtype)
    else:
        raise TypeError("Unsupported image dtype. Expected uint8, float32, or float64.")
        
    return noisy_img


def apply_motion_blur(img, kernel_size):
    """Apply motion blur to the image with a random direction."""
    kernel = np.zeros((kernel_size, kernel_size))
    direction = random.choice(['horizontal', 'vertical', 'diagonal'])

    if direction == 'horizontal':
        kernel[(kernel_size - 1) // 2, :] = np.ones(kernel_size)
    elif direction == 'vertical':
        kernel[:, (kernel_size - 1) // 2] = np.ones(kernel_size)
    elif direction == 'diagonal':
        np.fill_diagonal(kernel, 1)
    
    kernel /= kernel_size  # Normalize the kernel
    return cv2.filter2D(img, -1, kernel, borderType=cv2.BORDER_REFLECT)


def apply_low_res(img, scale_factor=0.25):
    """Simulate low-resolution effect by downsampling and upsampling."""
    if not (0 < scale_factor < 1):
        raise ValueError("scale_factor should be between 0 and 1.")

    h, w = img.shape[:2]

    # Calculate target dimensions for downsampling
    downsampled_size = (max(1, int(w * scale_factor)), max(1, int(h * scale_factor)))

    # Downsample using INTER_AREA for better quality in aggressive downsampling
    low_res_img = cv2.resize(img, downsampled_size, interpolation=cv2.INTER_AREA)

    # Upsample using INTER_NEAREST for strong pixelation effect
    return cv2.resize(low_res_img, (w, h), interpolation=cv2.INTER_NEAREST).astype(img.dtype)


def process_human_model_output_orig(human_model_param, cam_param):
    """Run the MANO layer and return the mesh, joints, and parameters in the camera coordinate frame."""
    pose, shape, trans = human_model_param['pose'], human_model_param['shape'], human_model_param['trans']
    hand_type = human_model_param['hand_type']
    pose = torch.FloatTensor(pose).view(-1, 3)  # mano pose parameters (48 dimensions)
    shape = torch.FloatTensor(shape).view(1, -1)  # mano shape parameters (10 dimensions)
    trans = torch.FloatTensor(trans).view(1, -1)  # translation vector

    # apply camera extrinsic (rotation)
    # merge root pose and camera rotation 
    if 'R' in cam_param:
        R = np.array(cam_param['R'], dtype=np.float32).reshape(3,3)
        root_pose = pose[mano.orig_root_joint_idx,:].numpy()
        root_pose, _ = cv2.Rodrigues(root_pose)
        root_pose, _ = cv2.Rodrigues(np.dot(R,root_pose))
        pose[mano.orig_root_joint_idx] = torch.from_numpy(root_pose).view(3)
    
    # get root joint coordinate
    root_pose = pose[mano.orig_root_joint_idx].view(1,3)
    hand_pose = torch.cat((pose[:mano.orig_root_joint_idx,:], pose[mano.orig_root_joint_idx+1:,:])).view(1,-1)
    with torch.no_grad():
        output = mano.layer[hand_type](betas=shape, hand_pose=hand_pose, global_orient=root_pose, transl=trans)
    mesh_coord = output.vertices[0].numpy()
    joint_coord = np.dot(mano.joint_regressor, mesh_coord)
    
    # apply camera extrinsic (translation)
    # compensate rotation (the translation from origin to the root joint was not canceled)
    if 'R' in cam_param and 't' in cam_param:
        R, t = np.array(cam_param['R'], dtype=np.float32).reshape(3,3), np.array(cam_param['t'], dtype=np.float32).reshape(1,3)
        root_coord = joint_coord[mano.root_joint_idx,None,:]
        joint_coord = joint_coord - root_coord + np.dot(R, root_coord.transpose(1,0)).transpose(1,0) + t
        mesh_coord = mesh_coord - root_coord + np.dot(R, root_coord.transpose(1,0)).transpose(1,0) + t

    
    joint_cam_orig = joint_coord.copy()
    mesh_cam_orig = mesh_coord.copy()
    pose_orig, shape_orig, trans_orig = torch.cat((root_pose, hand_pose), dim=-1)[0].detach().cpu().numpy(), shape[0].detach().cpu().numpy(), trans[0].detach().cpu().numpy()

    return mesh_cam_orig, joint_cam_orig, pose_orig, shape_orig, trans_orig
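

# Example (a sketch; variable names are illustrative): MANO annotations typically provide a
# 48-dim pose, 10-dim shape, and 3-dim translation, plus a hand type, while cam_param may
# optionally carry a world-to-camera rotation 'R' (3x3) and translation 't' (3,):
#   human_model_param = {'pose': pose_48, 'shape': shape_10, 'trans': trans_3, 'hand_type': 'right'}
#   cam_param = {'R': R_world2cam, 't': t_world2cam}
#   mesh_cam, joint_cam, pose, shape, trans = process_human_model_output_orig(
#       human_model_param, cam_param)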


def mask2bbox(mask, expansion_factor=1.0):
    """Compute an (x_min, y_min, width, height) bbox around the non-zero mask pixels,
    expanded by expansion_factor (1.0 grows width and height by 100% in total)."""
    # Find non-zero elements (object pixels)
    coords = np.argwhere(mask)
    
    # Extract bounding box coordinates
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
    
    # Compute width and height
    width = x_max - x_min + 1
    height = y_max - y_min + 1

    # Expand bounding box
    if expansion_factor > 0:
        x_min = max(0, int(x_min - width * expansion_factor / 2))
        y_min = max(0, int(y_min - height * expansion_factor / 2))
        x_max = min(mask.shape[1] - 1, int(x_max + width * expansion_factor / 2))
        y_max = min(mask.shape[0] - 1, int(y_max + height * expansion_factor / 2))

        # Recalculate width and height after expansion
        width = x_max - x_min + 1
        height = y_max - y_min + 1

    return (x_min, y_min, width, height)
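

# Example (a sketch; HAND_LABEL and the 256x256 output size are assumptions): derive a crop
# region from a binary segmentation mask and cut the corresponding patch.
#   mask = (seg == HAND_LABEL)  # hypothetical (H, W) boolean mask
#   bbox = mask2bbox(mask, expansion_factor=1.0)
#   patch, trans, inv_trans = generate_patch_image_contact(
#       img, bbox, scale=1.0, rot=0.0, do_flip=False, out_shape=(256, 256))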