# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math

import cv2
import numpy as np


def gaussian_blur(heatmaps: np.ndarray, kernel: int = 11) -> np.ndarray:
    """Modulate heatmap distribution with Gaussian.

    The operation is performed in-place on ``heatmaps``.

    Note:
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmaps (np.ndarray[K, H, W]): model predicted heatmaps.
        kernel (int): Gaussian kernel size (K) for modulation, which should
            match the heatmap gaussian sigma when training. K=17 for sigma=3
            and k=11 for sigma=2. Must be odd.

    Returns:
        np.ndarray ([K, H, W]): Modulated heatmap distribution.
    """
    assert kernel % 2 == 1

    border = (kernel - 1) // 2
    K, H, W = heatmaps.shape

    for k in range(K):
        origin_max = np.max(heatmaps[k])
        # Blur on a zero-padded copy so the Gaussian does not darken pixels
        # near the heatmap border.
        dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
        dr[border:-border, border:-border] = heatmaps[k].copy()
        dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
        heatmaps[k] = dr[border:-border, border:-border].copy()
        # Rescale so the peak value is preserved after blurring. Guard
        # against an all-zero channel, which would otherwise produce a
        # 0/0 division and fill the channel with NaNs.
        cur_max = np.max(heatmaps[k])
        if cur_max > 0:
            heatmaps[k] *= origin_max / cur_max
    return heatmaps


def get_heatmap_maximum(heatmaps: np.ndarray):
    """Get maximum response location and value from heatmaps.

    Note:
        batch_size: B
        num_keypoints: K
        heatmap height: H
        heatmap width: W

    Args:
        heatmaps (np.ndarray): Heatmaps in shape (K, H, W) or (B, K, H, W)

    Returns:
        tuple:
        - locs (np.ndarray): locations of maximum heatmap responses in shape
            (K, 2) or (B, K, 2), as (x, y) coordinates
        - vals (np.ndarray): values of maximum heatmap responses in shape
            (K,) or (B, K)
    """
    assert isinstance(heatmaps,
                      np.ndarray), ('heatmaps should be numpy.ndarray')
    assert heatmaps.ndim == 3 or heatmaps.ndim == 4, (
        f'Invalid shape {heatmaps.shape}')

    if heatmaps.ndim == 3:
        K, H, W = heatmaps.shape
        B = None
        heatmaps_flatten = heatmaps.reshape(K, -1)
    else:
        B, K, H, W = heatmaps.shape
        heatmaps_flatten = heatmaps.reshape(B * K, -1)

    y_locs, x_locs = np.unravel_index(
        np.argmax(heatmaps_flatten, axis=1), shape=(H, W))
    locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
    vals = np.amax(heatmaps_flatten, axis=1)
    # A non-positive peak means "no detection"; flag its location with -1.
    locs[vals <= 0.] = -1

    # NOTE: must compare against None, not truthiness, so that a degenerate
    # batch size of 0 still reshapes to the documented (B, K, ...) layout.
    if B is not None:
        locs = locs.reshape(B, K, 2)
        vals = vals.reshape(B, K)

    return locs, vals


def refine_keypoints_dark_udp(keypoints: np.ndarray, heatmaps: np.ndarray,
                              blur_kernel_size: int) -> np.ndarray:
    """Refine keypoint predictions using distribution aware coordinate
    decoding for UDP. See `UDP`_ for details. The operation is in-place.

    Note:
        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - heatmap size: [W, H]

    Args:
        keypoints (np.ndarray): The keypoint coordinates in shape (N, K, D)
        heatmaps (np.ndarray): The heatmaps in shape (K, H, W)
        blur_kernel_size (int): The Gaussian blur kernel size of the heatmap
            modulation

    Returns:
        np.ndarray: Refine keypoint coordinates in shape (N, K, D)

    .. _`UDP`: https://arxiv.org/abs/1911.07524
    """
    N, K = keypoints.shape[:2]
    H, W = heatmaps.shape[1:]

    # Modulate the heatmaps and move to log space so that a local quadratic
    # (Taylor) expansion around the peak approximates the Gaussian response.
    heatmaps = gaussian_blur(heatmaps, blur_kernel_size)
    np.clip(heatmaps, 1e-3, 50., heatmaps)
    np.log(heatmaps, heatmaps)

    # Pad 1 pixel on each side so finite differences at the border are valid.
    heatmaps_pad = np.pad(
        heatmaps, ((0, 0), (1, 1), (1, 1)), mode='edge').flatten()

    for n in range(N):
        # Flat index of each keypoint's peak in the padded (K, H+2, W+2)
        # array; the +1 offsets account for the padding.
        index = keypoints[n, :, 0] + 1 + (keypoints[n, :, 1] + 1) * (W + 2)
        index += (W + 2) * (H + 2) * np.arange(0, K)
        index = index.astype(int).reshape(-1, 1)
        # Log-heatmap samples around the peak for finite differences.
        i_ = heatmaps_pad[index]
        ix1 = heatmaps_pad[index + 1]
        iy1 = heatmaps_pad[index + W + 2]
        ix1y1 = heatmaps_pad[index + W + 3]
        ix1_y1_ = heatmaps_pad[index - W - 3]
        ix1_ = heatmaps_pad[index - 1]
        iy1_ = heatmaps_pad[index - 2 - W]

        # First derivatives (gradient) via central differences.
        dx = 0.5 * (ix1 - ix1_)
        dy = 0.5 * (iy1 - iy1_)
        derivative = np.concatenate([dx, dy], axis=1)
        derivative = derivative.reshape(K, 2, 1)

        # Second derivatives (Hessian).
        dxx = ix1 - 2 * i_ + ix1_
        dyy = iy1 - 2 * i_ + iy1_
        dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
        hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)
        hessian = hessian.reshape(K, 2, 2)
        # Small eps keeps the matrix invertible at flat peaks.
        hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
        # Newton step: x* = x - H^-1 * grad, applied per keypoint.
        keypoints[n] -= np.einsum('imn,ink->imk', hessian,
                                  derivative).squeeze()

    return keypoints


def udp_decode(heatmaps, input_size, heatmap_size,
               blur_kernel_size=11) -> np.ndarray:
    """UDP decoding for keypoint location refinement.

    Note:
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        heatmaps (np.ndarray[K, H, W]): model predicted heatmaps.
        input_size (tuple): Size ([w, h]) of the model input, used to map
            heatmap coordinates back to the input image space.
        heatmap_size (tuple): Size ([W, H]) of the heatmaps.
        blur_kernel_size (int): Gaussian kernel size (K) for modulation, which
            should match the heatmap gaussian sigma when training. K=17 for
            sigma=3 and k=11 for sigma=2.

    Returns:
        tuple:
        - keypoints (np.ndarray[1, K, 2]): refined keypoint locations in
            input-image coordinates.
        - scores (np.ndarray[1, K]): peak heatmap responses.
    """
    keypoints, scores = get_heatmap_maximum(heatmaps)
    # unsqueeze the instance dimension for single-instance results
    keypoints = keypoints[None]
    scores = scores[None]

    keypoints = refine_keypoints_dark_udp(
        keypoints, heatmaps, blur_kernel_size=blur_kernel_size)

    # UDP maps [0, W-1] x [0, H-1] linearly onto the input size (unbiased
    # coordinate transform).
    W, H = heatmap_size
    keypoints = (keypoints / [W - 1, H - 1]) * input_size
    return keypoints, scores


def get_udp_warp_matrix(
    center: np.ndarray,
    scale: np.ndarray,
    rot: float,
    output_size,
) -> np.ndarray:
    """Calculate the affine transformation matrix under the unbiased
    constraint. See `UDP (CVPR 2020)`_ for details.

    Note:
        - The bbox number: N

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        scale (np.ndarray[2, ]): Scale of the bounding box
            wrt [width, height].
        rot (float): Rotation angle (degree).
        output_size (tuple): Size ([w, h]) of the output image

    Returns:
        np.ndarray: A 2x3 transformation matrix

    .. _`UDP (CVPR 2020)`: https://arxiv.org/abs/1911.07524
    """
    assert len(center) == 2
    assert len(scale) == 2
    assert len(output_size) == 2

    input_size = center * 2
    rot_rad = np.deg2rad(rot)
    warp_mat = np.zeros((2, 3), dtype=np.float32)
    # Unbiased scaling uses (size - 1) so that pixel centers at both ends of
    # the image map exactly onto each other.
    scale_x = (output_size[0] - 1) / scale[0]
    scale_y = (output_size[1] - 1) / scale[1]
    # Rotation + scaling, with translation chosen so that `center` maps to
    # the center of the output image.
    warp_mat[0, 0] = math.cos(rot_rad) * scale_x
    warp_mat[0, 1] = -math.sin(rot_rad) * scale_x
    warp_mat[0, 2] = scale_x * (-0.5 * input_size[0] * math.cos(rot_rad) +
                                0.5 * input_size[1] * math.sin(rot_rad) +
                                0.5 * scale[0])
    warp_mat[1, 0] = math.sin(rot_rad) * scale_y
    warp_mat[1, 1] = math.cos(rot_rad) * scale_y
    warp_mat[1, 2] = scale_y * (-0.5 * input_size[0] * math.sin(rot_rad) -
                                0.5 * input_size[1] * math.cos(rot_rad) +
                                0.5 * scale[1])
    return warp_mat


def top_down_affine_transform(img, bbox, padding=1.25):
    """Crop and warp `img` to the model input space using a UDP-unbiased
    affine transform centered on `bbox`.

    Args:
        img (np.ndarray): Image to be transformed.
        bbox (np.ndarray): Bounding box in (x1, y1, x2, y2) format, shape
            (4,) or (N, 4).
        padding (float): Scale padding factor applied to the box size.
            Default: 1.25.

    Returns:
        tuple:
        - img (np.ndarray): Transformed image.
        - center (list): Box center(s) used for the warp.
        - scale (list): Padded, aspect-corrected box size(s) used for
            the warp.
    """
    dim = bbox.ndim
    if dim == 1:
        bbox = bbox[None, :]

    # xyxy -> center + size (the size is inflated by `padding`).
    x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
    center = np.hstack([x1 + x2, y1 + y2]) * 0.5
    scale = np.hstack([x2 - x1, y2 - y1]) * padding
    if dim == 1:
        center = center[0]
        scale = scale[0]

    h, w = img.shape[:2]
    warp_size = (int(w), int(h))
    aspect_ratio = w / h

    # Expand the shorter box side so the crop matches the image aspect ratio
    # without distorting the content.
    box_w, box_h = np.hsplit(scale, [1])
    scale = np.where(box_w > box_h * aspect_ratio,
                     np.hstack([box_w, box_w / aspect_ratio]),
                     np.hstack([box_h * aspect_ratio, box_h]))

    rot = 0.
    warp_mat = get_udp_warp_matrix(center, scale, rot, output_size=(w, h))
    img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
    return img, [center], [scale]


def nms(dets: np.ndarray, thr: float):
    """Greedily select boxes with high confidence and overlap <= thr.

    Args:
        dets (np.ndarray): [[x1, y1, x2, y2, score]].
        thr (float): Retain overlap < thr.

    Returns:
        list: Indexes to keep.
    """
    if len(dets) == 0:
        return []

    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    # Areas use the +1 inclusive-pixel convention, matching the IoU below.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # Process boxes in descending score order.
    order = scores.argsort()[::-1]

    keep = []
    while len(order) > 0:
        i = order[0]
        keep.append(i)
        # Intersection of the current best box with all remaining boxes.
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # Keep only boxes whose overlap with the selected box is <= thr.
        inds = np.where(ovr <= thr)[0]
        order = order[inds + 1]

    return keep