|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from copy import deepcopy |
|
|
from typing import Dict, List, Optional, Sequence, Tuple, Union |
|
|
import cv2 |
|
|
import numpy as np |
|
|
from mmcv.transforms import BaseTransform |
|
|
from mmcv.image import imflip |
|
|
from mmengine import is_seq_of |
|
|
from scipy.stats import truncnorm |
|
|
from mmpose.structures.bbox import bbox_xyxy2cs, flip_bbox |
|
|
from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness |
|
|
from mmpose.registry import TRANSFORMS |
|
|
from mmpose.structures.keypoint import flip_keypoints_custom_center |
|
|
from mmpose.structures.keypoint import flip_keypoints |
|
|
from mmpose.structures.bbox import get_udp_warp_matrix, get_warp_matrix |
|
|
from .formatting import PackPoseInputs |
|
|
from mmpose.utils.typing import MultiConfig |
|
|
from scipy.stats import norm |
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class Pose3dGenerateTarget(BaseTransform):
    """Lift 2D keypoints to camera-space 3D targets using keypoint depth.

    When ``keypoints_depth`` is absent from ``results`` the transform writes
    all-zero placeholders for ``pose3d`` / ``pose3d_visible`` and an identity
    ``K``. Otherwise the keypoints of the first instance (index ``0``) are
    back-projected through the pinhole intrinsics stored in ``results['K']``.

    Required keys:
        - transformed_keypoints
        - keypoints_depth, K, img, keypoints_visible (depth branch only)

    Added/modified keys:
        - pose3d
        - pose3d_visible
        - K
    """

    def __init__(self) -> None:
        super().__init__()

    def transform(self, results: Dict) -> Optional[dict]:
        """Populate ``pose3d``, ``pose3d_visible`` and ``K`` in ``results``.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The same dict, updated in place.
        """
        if 'keypoints_depth' not in results:
            # No depth annotation: emit empty targets so downstream
            # consumers always find the same keys.
            joint_count = results['transformed_keypoints'].shape[1]
            results['pose3d'] = np.zeros(
                (joint_count, 3)).astype(np.float32)
            results['pose3d_visible'] = np.zeros(joint_count, dtype=bool)
            results['K'] = np.eye(3).astype(np.float32)
            return results

        assert 'K' in results
        intrinsics = results['K'].astype(np.float32)
        results['K'] = intrinsics

        img_h, img_w = results['img'].shape[:2]

        # Only the first instance is used.
        uv = results['transformed_keypoints'][0]
        visible = results['keypoints_visible'][0]
        depth = results['keypoints_depth'][0, :, 0]

        focal_x, focal_y = intrinsics[0, 0], intrinsics[1, 1]
        center_x, center_y = intrinsics[0, 2], intrinsics[1, 2]

        # Pinhole back-projection of each pixel at its depth.
        cam_x = (uv[:, 0] - center_x) * depth / focal_x
        cam_y = (uv[:, 1] - center_y) * depth / focal_y
        points_3d = np.stack([cam_x, cam_y, depth], axis=-1)

        # Re-project to pixels; the epsilon guards against zero depth.
        reprojected = np.dot(intrinsics, points_3d.T).T
        reprojected = reprojected[:, :2] / (reprojected[:, 2:] + 1e-8)

        # A keypoint stays valid only if its re-projection is inside the
        # image.
        u, v = reprojected[:, 0], reprojected[:, 1]
        visible = visible * (u >= 0) * (u < img_w) * (v >= 0) * (v < img_h)

        hidden = visible == 0
        points_3d[hidden] = 0
        reprojected[hidden] = 0

        results['pose3d'] = points_3d.astype(np.float32)
        results['pose3d_visible'] = visible.astype(bool)

        return results

    def __repr__(self) -> str:
        """Return the class name (the transform has no parameters)."""
        return self.__class__.__name__
|
|
|
|
|
@TRANSFORMS.register_module()
class PackPose3dInputs(PackPoseInputs):
    """Pack pose inputs and attach 3D pose targets to ``gt_instances``.

    Extends :class:`PackPoseInputs`: keypoints whose transformed location
    lies outside ``input_size`` are marked invisible and zeroed before
    packing, and the ``pose3d`` / ``pose3d_visible`` (and optional
    ``depth_heatmap``) fields are added to the packed data sample.
    """

    def __init__(self, **kwargs):
        # All options are forwarded untouched to the parent packer.
        super().__init__(**kwargs)

    def transform(self, results: dict) -> dict:
        """Pack ``results`` and add the 3D pose fields.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The packed results produced by the parent class, with the
            extra 3D fields set on ``data_samples.gt_instances``.
        """
        # Without visibility flags there is nothing to filter; defer
        # entirely to the parent implementation.
        if 'keypoints_visible' not in results:
            return super().transform(results)

        visibility = results['keypoints_visible']
        image_width, image_height = results['input_size']
        warped = results['transformed_keypoints']

        xs = warped[:, :, 0]
        ys = warped[:, :, 1]

        # Keep a keypoint only if it falls inside the network input.
        visibility = visibility * (xs >= 0) * (xs < image_width) \
            * (ys >= 0) * (ys < image_height)
        results['keypoints_visible'] = visibility

        # Zero out the coordinates of every dropped keypoint (in place).
        results['transformed_keypoints'][visibility == 0] = 0

        packed = super().transform(results)

        if 'pose3d' not in results:
            # Provide empty 3D targets so the packed sample always carries
            # the same fields.
            joint_count = results['transformed_keypoints'].shape[1]
            results['pose3d'] = np.zeros(
                (joint_count, 3)).astype(np.float32)
            results['pose3d_visible'] = np.zeros(joint_count, dtype=bool)
            results['K'] = np.eye(3).astype(np.float32)

        gt_instances = packed['data_samples'].gt_instances
        gt_instances.set_field(
            results['pose3d'].reshape(1, -1, 3), 'pose3d')
        gt_instances.set_field(
            results['pose3d_visible'].reshape(1, -1), 'pose3d_visible')

        if 'depth_heatmap' in results:
            gt_instances.set_field(
                results['depth_heatmap'][np.newaxis, ...], 'depth_heatmap')

        return packed
|
|
|
|
|
@TRANSFORMS.register_module()
class Pose3dRandomBBoxTransform(BaseTransform):
    """Randomly shift and rescale the bounding box (no rotation).

    Required keys:
        - bbox_center
        - bbox_scale

    Modified keys:
        - bbox_center
        - bbox_scale

    Args:
        shift_factor (float): Multiplier applied to the truncated-normal
            sample (in ``[-1, 1]``) to obtain the shift, expressed as a
            fraction of the bbox scale. Defaults to 0.16.
        shift_prob (float): Probability of applying the shift.
            Defaults to 0.3.
        scale_factor (Tuple[float, float]): ``(min, max)`` range of the
            random scaling factor. Defaults to (0.5, 1.5).
        scale_prob (float): Probability of applying the scaling.
            Defaults to 1.0.
    """

    def __init__(self,
                 shift_factor: float = 0.16,
                 shift_prob: float = 0.3,
                 scale_factor: Tuple[float, float] = (0.5, 1.5),
                 scale_prob: float = 1.0) -> None:
        super().__init__()

        self.shift_factor = shift_factor
        self.shift_prob = shift_prob
        self.scale_factor = scale_factor
        self.scale_prob = scale_prob

    @staticmethod
    def _truncnorm(low: float = -1.,
                   high: float = 1.,
                   size: tuple = ()) -> np.ndarray:
        """Sample from a truncated normal distribution."""
        return truncnorm.rvs(low, high, size=size).astype(np.float32)

    @cache_randomness
    def _get_transform_params(self, num_bboxes: int) -> Tuple:
        """Get random transform parameters.

        Args:
            num_bboxes (int): The number of bboxes

        Returns:
            tuple:
            - offset (np.ndarray): Offset factor of each bbox in shape (n, 2)
            - scale (np.ndarray): Scaling factor of each bbox in shape (n, 1)
        """
        # Shift: truncated normal in [-shift_factor, shift_factor], applied
        # per bbox with probability ``shift_prob`` (zero offset otherwise).
        offset = self._truncnorm(size=(num_bboxes, 2)) * self.shift_factor
        offset = np.where(
            np.random.rand(num_bboxes, 1) < self.shift_prob, offset, 0.)

        # Scale: truncated normal mapped onto [scale_min, scale_max],
        # applied per bbox with probability ``scale_prob`` (1.0 otherwise).
        scale_min, scale_max = self.scale_factor
        mu = (scale_max + scale_min) * 0.5
        sigma = (scale_max - scale_min) * 0.5
        scale = self._truncnorm(size=(num_bboxes, 1)) * sigma + mu
        scale = np.where(
            np.random.rand(num_bboxes, 1) < self.scale_prob, scale, 1.)

        return offset, scale

    def transform(self, results: Dict) -> Optional[dict]:
        """The transform function of :class:`Pose3dRandomBBoxTransform`.

        See ``transform()`` method of :class:`BaseTransform` for details.

        Args:
            results (dict): The result dict

        Returns:
            dict: The result dict.
        """
        bbox_scale = results['bbox_scale']
        num_bboxes = bbox_scale.shape[0]

        offset, scale = self._get_transform_params(num_bboxes)

        # The shift is relative to the *original* scale, so the center must
        # be updated before the scale is modified in place.
        results['bbox_center'] += offset * bbox_scale
        results['bbox_scale'] *= scale

        return results

    def __repr__(self) -> str:
        """print the basic information of the transform.

        Returns:
            str: Formatted string.
        """
        repr_str = self.__class__.__name__
        repr_str += f'(shift_prob={self.shift_prob}, '
        repr_str += f'shift_factor={self.shift_factor}, '
        repr_str += f'scale_prob={self.scale_prob}, '
        # Close the parenthesis; the last field previously ended in ', '.
        repr_str += f'scale_factor={self.scale_factor})'
        return repr_str
|
|
|
|
|
@TRANSFORMS.register_module()
class Pose3dRandomFlip(BaseTransform):
    """Randomly flip the image, bbox, keypoints and camera intrinsics.

    Required keys:
        - img
        - img_shape (used when ``input_size`` is absent)
        - flip_indices (when ``keypoints`` is present)

    Modified keys (when present):
        - img, img_mask
        - bbox, bbox_center
        - keypoints, keypoints_visible
        - K

    Added keys:
        - flip
        - flip_direction

    Args:
        prob (float | list[float]): The flipping probability. If a list is
            given, it must have one entry per direction and sum to at most
            1. Defaults to 0.5.
        direction (str | list[str]): The flipping direction(s). Options are
            ``'horizontal'``, ``'vertical'`` and ``'diagonal'``. Defaults
            to ``'horizontal'``.
    """

    def __init__(self,
                 prob: Union[float, List[float]] = 0.5,
                 direction: Union[str, List[str]] = 'horizontal') -> None:
        super().__init__()

        if isinstance(prob, list):
            # ``is_seq_of`` is the helper this module imports.
            assert is_seq_of(prob, float)
            assert 0 <= sum(prob) <= 1
        elif isinstance(prob, float):
            assert 0 <= prob <= 1
        else:
            raise ValueError('probs must be float or list of float, but '
                             f'got `{type(prob)}`.')
        self.prob = prob

        valid_directions = ['horizontal', 'vertical', 'diagonal']
        if isinstance(direction, str):
            assert direction in valid_directions
        elif isinstance(direction, list):
            assert is_seq_of(direction, str)
            assert set(direction).issubset(set(valid_directions))
        else:
            raise ValueError('direction must be either str or list of str, '
                             f'but got `{type(direction)}`.')
        self.direction = direction

        if isinstance(prob, list):
            assert len(prob) == len(self.direction)

    @cache_randomness
    def _choose_direction(self) -> str:
        """Choose the flip direction according to `prob` and `direction`"""
        # ``None`` is appended as the "do not flip" outcome.
        if isinstance(self.direction, list):
            direction_list: list = list(self.direction) + [None]
        elif isinstance(self.direction, str):
            direction_list = [self.direction, None]

        if isinstance(self.prob, list):
            non_prob: float = 1 - sum(self.prob)
            prob_list = self.prob + [non_prob]
        elif isinstance(self.prob, float):
            non_prob = 1. - self.prob
            # A scalar probability is split evenly across the directions.
            single_ratio = self.prob / (len(direction_list) - 1)
            prob_list = [single_ratio] * (len(direction_list) - 1) + [non_prob]

        cur_dir = np.random.choice(direction_list, p=prob_list)

        return cur_dir

    def transform(self, results: dict) -> dict:
        """Flip the sample in ``results`` along a randomly chosen direction.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict, with ``flip`` / ``flip_direction`` set
            and the flipped fields updated.
        """
        flip_dir = self._choose_direction()

        if flip_dir is None:
            results['flip'] = False
            results['flip_direction'] = None
            return results

        results['flip'] = True
        results['flip_direction'] = flip_dir

        # NOTE(review): ``input_size`` is stored as (w, h) by
        # Pose3dTopdownAffine in this file, yet it is unpacked here as
        # (h, w) like ``img_shape``. Kept as-is; confirm intended ordering
        # if this transform ever runs after the affine.
        h, w = results.get('input_size', results['img_shape'])

        # Actual pixel size of the image, needed to mirror the principal
        # point. ``img`` may be a single array or a list of arrays.
        if isinstance(results['img'], list):
            img_h, img_w = results['img'][0].shape[:2]
            results['img'] = [
                imflip(img, direction=flip_dir) for img in results['img']
            ]
        else:
            img_h, img_w = results['img'].shape[:2]
            results['img'] = imflip(results['img'], direction=flip_dir)

        if 'img_mask' in results:
            results['img_mask'] = imflip(
                results['img_mask'], direction=flip_dir)

        # flip bboxes
        if results.get('bbox', None) is not None:
            results['bbox'] = flip_bbox(
                results['bbox'],
                image_size=(w, h),
                bbox_format='xyxy',
                direction=flip_dir)

        if results.get('bbox_center', None) is not None:
            results['bbox_center'] = flip_bbox(
                results['bbox_center'],
                image_size=(w, h),
                bbox_format='center',
                direction=flip_dir)

        # flip keypoints
        if results.get('keypoints', None) is not None:
            keypoints, keypoints_visible = flip_keypoints(
                results['keypoints'],
                results.get('keypoints_visible', None),
                image_size=(w, h),
                flip_indices=results['flip_indices'],
                direction=flip_dir)

            results['keypoints'] = keypoints
            results['keypoints_visible'] = keypoints_visible

        # Mirror the principal point along every flipped axis so that K
        # stays consistent with the flipped pixels (x -> W - 1 - x,
        # y -> H - 1 - y).
        if 'K' in results:
            if flip_dir in ('horizontal', 'diagonal'):
                results['K'][0, 2] = img_w - results['K'][0, 2] - 1
            if flip_dir in ('vertical', 'diagonal'):
                results['K'][1, 2] = img_h - results['K'][1, 2] - 1

        return results

    def __repr__(self) -> str:
        """print the basic information of the transform.

        Returns:
            str: Formatted string.
        """
        repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob}, '
        repr_str += f'direction={self.direction})'
        return repr_str
|
|
|
|
|
@TRANSFORMS.register_module()
class RandomFlipAroundRoot(BaseTransform):
    """Data augmentation with random horizontal joint flip around a root joint.

    Args:
        keypoints_flip_cfg (dict): Configurations of the
            ``flip_keypoints_custom_center`` function for ``keypoints``. Please
            refer to the docstring of the ``flip_keypoints_custom_center``
            function for more details.
        target_flip_cfg (dict): Configurations of the
            ``flip_keypoints_custom_center`` function for ``lifting_target``.
            Please refer to the docstring of the
            ``flip_keypoints_custom_center`` function for more details.
        flip_prob (float): Probability of flip. Default: 0.5.
        flip_camera (bool): Whether to flip horizontal distortion coefficients.
            Default: ``False``.

    Required keys:
        keypoints
        lifting_target

    Modified keys:
        (keypoints, keypoints_visible, lifting_target, lifting_target_visible,
        camera_param)
    """

    def __init__(self,
                 keypoints_flip_cfg,
                 target_flip_cfg,
                 flip_prob=0.5,
                 flip_camera=False):
        super().__init__()
        self.keypoints_flip_cfg = keypoints_flip_cfg
        self.target_flip_cfg = target_flip_cfg
        self.flip_prob = flip_prob
        self.flip_camera = flip_camera

    def transform(self, results: Dict) -> dict:
        """The transform function of :class:`RandomFlipAroundRoot`.

        See ``transform()`` method of :class:`BaseTransform` for details.

        Args:
            results (dict): The result dict

        Returns:
            dict: The result dict.
        """

        keypoints = results['keypoints']
        if 'keypoints_visible' in results:
            keypoints_visible = results['keypoints_visible']
        else:
            # Treat every keypoint as visible when no flags are provided.
            keypoints_visible = np.ones(keypoints.shape[:-1], dtype=np.float32)
        lifting_target = results['lifting_target']
        if 'lifting_target_visible' in results:
            lifting_target_visible = results['lifting_target_visible']
        else:
            lifting_target_visible = np.ones(
                lifting_target.shape[:-1], dtype=np.float32)

        if np.random.rand() <= self.flip_prob:
            if 'flip_indices' not in results:
                # Identity mapping (no left/right swap). The keypoint count
                # is taken from the second-to-last axis, matching the
                # ``shape[:-1]`` visibility layout used above.
                flip_indices = list(range(keypoints.shape[-2]))
            else:
                flip_indices = results['flip_indices']

            # flip joint coordinates
            keypoints, keypoints_visible = flip_keypoints_custom_center(
                keypoints, keypoints_visible, flip_indices,
                **self.keypoints_flip_cfg)
            lifting_target, lifting_target_visible = flip_keypoints_custom_center(  # noqa
                lifting_target, lifting_target_visible, flip_indices,
                **self.target_flip_cfg)

            results['keypoints'] = keypoints
            results['keypoints_visible'] = keypoints_visible
            results['lifting_target'] = lifting_target
            results['lifting_target_visible'] = lifting_target_visible

            # flip horizontal distortion coefficients
            if self.flip_camera:
                assert 'camera_param' in results, \
                    'Camera parameters are missing.'
                # Work on a copy so the values are negated exactly once
                # before being written back.
                _camera_param = deepcopy(results['camera_param'])

                assert 'c' in _camera_param
                _camera_param['c'][0] *= -1

                if 'p' in _camera_param:
                    _camera_param['p'][0] *= -1

                results['camera_param'].update(_camera_param)

        return results
|
|
|
|
|
@TRANSFORMS.register_module()
class Pose3dTopdownAffine(BaseTransform):
    """Crop the bbox region to the model input size and update intrinsics.

    The image (or list of images) is warped with an affine transform built
    from the bbox center/scale; keypoints and the camera matrix ``K`` are
    transformed with the same affine so that they stay consistent with the
    warped pixels.

    Required keys:
        - img
        - bbox_center
        - bbox_scale

    Modified keys:
        - img
        - bbox_scale
        - K (when present)

    Added keys:
        - input_size
        - transformed_keypoints (when ``keypoints`` is present)

    Args:
        input_size (Tuple[int, int]): The model input size as (w, h).
        use_udp (bool): Whether to build the warp with
            ``get_udp_warp_matrix`` instead of ``get_warp_matrix``.
            Defaults to ``False``.
    """

    def __init__(self,
                 input_size: Tuple[int, int],
                 use_udp: bool = False) -> None:
        super().__init__()

        assert is_seq_of(input_size, int) and len(input_size) == 2, (
            f'Invalid input_size {input_size}')

        self.input_size = input_size
        self.use_udp = use_udp

    @staticmethod
    def _fix_aspect_ratio(bbox_scale: np.ndarray, aspect_ratio: float):
        """Grow the bbox along one axis so that w / h equals the given ratio.

        Args:
            bbox_scale (np.ndarray): Bbox scales (w, h), split column-wise
                into width and height.
            aspect_ratio (float): Target ratio of w / h.

        Returns:
            np.ndarray: The adjusted bbox scales.
        """
        w, h = np.hsplit(bbox_scale, [1])
        bbox_scale = np.where(w > h * aspect_ratio,
                              np.hstack([w, w / aspect_ratio]),
                              np.hstack([h * aspect_ratio, h]))
        return bbox_scale

    def transform(self, results: Dict) -> Optional[dict]:
        """Warp image, keypoints and intrinsics into the input frame.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        w, h = self.input_size
        warp_size = (int(w), int(h))

        # Reshape the bbox to the input aspect ratio.
        results['bbox_scale'] = self._fix_aspect_ratio(
            results['bbox_scale'], aspect_ratio=w / h)

        assert results['bbox_center'].shape[0] == 1, (
            'Top-down heatmap only supports single instance. Got invalid '
            f'shape of bbox_center {results["bbox_center"].shape}.')

        center = results['bbox_center'][0]
        scale = results['bbox_scale'][0]
        rot = 0.  # rotation augmentation is not applied in this transform

        if self.use_udp:
            warp_mat = get_udp_warp_matrix(
                center, scale, rot, output_size=(w, h))
        else:
            warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))

        if isinstance(results['img'], list):
            results['img'] = [
                cv2.warpAffine(
                    img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
                for img in results['img']
            ]
        else:
            results['img'] = cv2.warpAffine(
                results['img'], warp_mat, warp_size, flags=cv2.INTER_LINEAR)

        if results.get('keypoints', None) is not None:
            transformed_keypoints = results['keypoints'].copy()
            # Only x, y are warped; any extra channels are kept as copied.
            transformed_keypoints[..., :2] = cv2.transform(
                results['keypoints'][..., :2], warp_mat)
            results['transformed_keypoints'] = transformed_keypoints

        results['input_size'] = (w, h)

        if 'K' in results:
            K = results['K']

            # Apply the very same affine to the intrinsics. With rot == 0
            # the warp is x' = sx * x + tx, y' = sy * y + ty, so focal
            # lengths scale by (sx, sy) and the principal point maps like
            # any other pixel. Reading the terms from ``warp_mat`` keeps K
            # consistent for both the UDP and non-UDP warps.
            scale_factor_x = warp_mat[0, 0]
            scale_factor_y = warp_mat[1, 1]
            translation_x = warp_mat[0, 2]
            translation_y = warp_mat[1, 2]

            c_x_new = K[0, 2] * scale_factor_x + translation_x
            c_y_new = K[1, 2] * scale_factor_y + translation_y

            f_x_new = K[0, 0] * scale_factor_x
            f_y_new = K[1, 1] * scale_factor_y

            results['K'] = np.array([
                [f_x_new, 0, c_x_new],
                [0, f_y_new, c_y_new],
                [0, 0, 1]
            ])

        return results

    def __repr__(self) -> str:
        """print the basic information of the transform.

        Returns:
            str: Formatted string.
        """
        repr_str = self.__class__.__name__
        repr_str += f'(input_size={self.input_size}, '
        repr_str += f'use_udp={self.use_udp})'
        return repr_str
|
|
|