|
|
|
|
|
from typing import Dict, List, Optional, Tuple, Union
|
|
|
|
|
|
import numpy as np
|
|
|
import scipy
|
|
|
from mmcv.transforms import BaseTransform, KeyMapper
|
|
|
from mmengine.dataset import Compose
|
|
|
from packaging import version as pv
|
|
|
from scipy.stats import mode
|
|
|
from torch.nn.modules.utils import _pair
|
|
|
|
|
|
from mmaction.registry import TRANSFORMS
|
|
|
from .loading import DecordDecode, DecordInit
|
|
|
from .processing import _combine_quadruple
|
|
|
|
|
|
# Since SciPy 1.11, `scipy.stats.mode` defaults to ``keepdims=False``.
# Pin the legacy (keepdims=True) result shape on newer SciPy versions so
# that callers can keep indexing the result as ``get_mode(x)[-1][0]``.
if pv.parse(scipy.__version__) >= pv.parse('1.11.0'):
    from functools import partial
    get_mode = partial(mode, keepdims=True)
else:
    get_mode = mode
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class DecompressPose(BaseTransform):
    """Load Compressed Pose.

    Decompress a flat list of per-person pose annotations (one entry per
    detected person per frame) into dense ``[num_person, total_frames, V]``
    style keypoint / score arrays.

    Required Keys:

        - frame_inds
        - total_frames
        - keypoint
        - anno_inds (optional)

    Modified Keys:

        - keypoint
        - frame_inds

    Added Keys:

        - keypoint_score
        - num_person

    Args:
        squeeze (bool): Whether to remove frames with no human pose.
            Defaults to True.
        max_person (int): The max number of persons in a frame. Defaults to 10.
    """

    def __init__(self, squeeze: bool = True, max_person: int = 10) -> None:
        self.squeeze = squeeze
        self.max_person = max_person

    def transform(self, results: Dict) -> Dict:
        """Perform the pose decoding.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.

        Returns:
            dict: The dict with decompressed pose annotations.
        """
        required_keys = ['total_frames', 'frame_inds', 'keypoint']
        for k in required_keys:
            assert k in results

        total_frames = results['total_frames']
        # Each entry of `keypoint` (shape: num_annos x V x 3, i.e. x, y,
        # score) is the pose of one person in the frame indexed by the
        # matching `frame_inds` entry.
        frame_inds = results.pop('frame_inds')
        keypoint = results['keypoint']

        if 'anno_inds' in results:
            # Keep only the annotations selected by the boolean mask.
            frame_inds = frame_inds[results['anno_inds']]
            keypoint = keypoint[results['anno_inds']]

        assert np.all(np.diff(frame_inds) >= 0), \
            'frame_inds should be monotonical increasing'

        def mapinds(inds):
            # Remap the (possibly sparse) frame indices to a dense 0..K-1
            # range, preserving the original ordering.
            uni = np.unique(inds)
            map_ = {x: i for i, x in enumerate(uni)}
            inds = [map_[x] for x in inds]
            return np.array(inds, dtype=np.int16)

        if self.squeeze:
            # Drop frames that have no pose annotation at all.
            frame_inds = mapinds(frame_inds)
            total_frames = np.max(frame_inds) + 1

        results['total_frames'] = total_frames

        num_joints = keypoint.shape[1]
        # The count of the most frequent frame index equals the maximum
        # number of co-occurring persons in any single frame.
        num_person = get_mode(frame_inds)[-1][0]

        new_kp = np.zeros([num_person, total_frames, num_joints, 2],
                          dtype=np.float16)
        new_kpscore = np.zeros([num_person, total_frames, num_joints],
                               dtype=np.float16)
        # Number of persons already filled in for each frame.
        nperson_per_frame = np.zeros([total_frames], dtype=np.int16)

        for frame_ind, kp in zip(frame_inds, keypoint):
            person_ind = nperson_per_frame[frame_ind]
            new_kp[person_ind, frame_ind] = kp[:, :2]
            new_kpscore[person_ind, frame_ind] = kp[:, 2]
            nperson_per_frame[frame_ind] += 1

        if num_person > self.max_person:
            for i in range(total_frames):
                nperson = nperson_per_frame[i]
                val = new_kpscore[:nperson, i]
                score_sum = val.sum(-1)

                # Rank persons in this frame by total keypoint score so
                # that only the most confident ones survive the crop below.
                inds = sorted(range(nperson), key=lambda x: -score_sum[x])
                new_kpscore[:nperson, i] = new_kpscore[inds, i]
                new_kp[:nperson, i] = new_kp[inds, i]
            num_person = self.max_person

        # Always record `num_person`, as promised by "Added Keys" above
        # (not only when the person count exceeded `max_person`).
        results['num_person'] = num_person

        results['keypoint'] = new_kp[:num_person]
        results['keypoint_score'] = new_kpscore[:num_person]
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'squeeze={self.squeeze}, '
                    f'max_person={self.max_person})')
        return repr_str
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class GeneratePoseTarget(BaseTransform):
    """Generate pseudo heatmaps based on joint coordinates and confidence.

    Required Keys:

        - keypoint
        - keypoint_score (optional)
        - img_shape

    Added Keys:

        - imgs (optional)
        - heatmap_imgs (optional)

    Args:
        sigma (float): The sigma of the generated gaussian map.
            Defaults to 0.6.
        use_score (bool): Use the confidence score of keypoints as the maximum
            of the gaussian maps. Defaults to True.
        with_kp (bool): Generate pseudo heatmaps for keypoints.
            Defaults to True.
        with_limb (bool): Generate pseudo heatmaps for limbs. At least one of
            'with_kp' and 'with_limb' should be True. Defaults to False.
        skeletons (tuple[tuple]): The definition of human skeletons.
            Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7),
            (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13),
            (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``,
            which is the definition of COCO-17p skeletons.
        double (bool): Output both original heatmaps and flipped heatmaps.
            Defaults to False.
        left_kp (tuple[int]): Indexes of left keypoints, which is used when
            flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15),
            which is left keypoints in COCO-17p.
        right_kp (tuple[int]): Indexes of right keypoints, which is used when
            flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16),
            which is right keypoints in COCO-17p.
        left_limb (tuple[int]): Indexes of left limbs, which is used when
            flipping heatmaps. Defaults to (0, 2, 4, 5, 6, 10, 11, 12),
            which is left limbs of skeletons we defined for COCO-17p.
        right_limb (tuple[int]): Indexes of right limbs, which is used when
            flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15),
            which is right limbs of skeletons we defined for COCO-17p.
        scaling (float): The ratio to scale the heatmaps. Defaults to 1.
    """

    def __init__(self,
                 sigma: float = 0.6,
                 use_score: bool = True,
                 with_kp: bool = True,
                 with_limb: bool = False,
                 skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3),
                                                 (2, 4), (0, 5), (5, 7),
                                                 (7, 9), (0, 6), (6, 8),
                                                 (8, 10), (5, 11), (11, 13),
                                                 (13, 15), (6, 12), (12, 14),
                                                 (14, 16), (11, 12)),
                 double: bool = False,
                 left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15),
                 right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16),
                 left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12),
                 right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15),
                 scaling: float = 1.) -> None:

        self.sigma = sigma
        self.use_score = use_score
        self.with_kp = with_kp
        self.with_limb = with_limb
        self.double = double

        # Keypoints/limbs whose confidence is below `eps` are treated as
        # missing and skipped when drawing heatmaps.
        self.eps = 1e-4

        assert self.with_kp or self.with_limb, (
            'At least one of "with_limb" '
            'and "with_kp" should be set as True.')
        self.left_kp = left_kp
        self.right_kp = right_kp
        self.skeletons = skeletons
        self.left_limb = left_limb
        self.right_limb = right_limb
        self.scaling = scaling

    def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray,
                           max_values: np.ndarray) -> None:
        """Generate pseudo heatmap for one keypoint in one frame.

        The heatmap is accumulated into ``arr`` in place via element-wise
        maximum, so multiple persons can share one channel.

        Args:
            arr (np.ndarray): The array to store the generated heatmaps.
                Shape: img_h * img_w.
            centers (np.ndarray): The coordinates of corresponding keypoints
                (of multiple persons). Shape: M * 2.
            max_values (np.ndarray): The max values of each keypoint. Shape: M.
        """

        sigma = self.sigma
        img_h, img_w = arr.shape

        for center, max_value in zip(centers, max_values):
            # Skip keypoints with (near-)zero confidence.
            if max_value < self.eps:
                continue

            mu_x, mu_y = center[0], center[1]
            # Only evaluate the gaussian inside a 3-sigma window around the
            # keypoint, clipped to the image bounds.
            st_x = max(int(mu_x - 3 * sigma), 0)
            ed_x = min(int(mu_x + 3 * sigma) + 1, img_w)
            st_y = max(int(mu_y - 3 * sigma), 0)
            ed_y = min(int(mu_y + 3 * sigma) + 1, img_h)
            x = np.arange(st_x, ed_x, 1, np.float32)
            y = np.arange(st_y, ed_y, 1, np.float32)

            # The keypoint (and thus its whole window) may fall outside the
            # image, in which case there is nothing to draw.
            if not (len(x) and len(y)):
                continue
            y = y[:, None]

            # 2D gaussian centered at the keypoint, peaking at `max_value`.
            patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2)
            patch = patch * max_value
            # Merge with existing content (other persons) by maximum.
            arr[st_y:ed_y, st_x:ed_x] = \
                np.maximum(arr[st_y:ed_y, st_x:ed_x], patch)

    def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray,
                                ends: np.ndarray, start_values: np.ndarray,
                                end_values: np.ndarray) -> None:
        """Generate pseudo heatmap for one limb in one frame.

        The heatmap value of each pixel decays as a gaussian of its distance
        to the limb segment; results are merged into ``arr`` in place.

        Args:
            arr (np.ndarray): The array to store the generated heatmaps.
                Shape: img_h * img_w.
            starts (np.ndarray): The coordinates of one keypoint in the
                corresponding limbs. Shape: M * 2.
            ends (np.ndarray): The coordinates of the other keypoint in the
                corresponding limbs. Shape: M * 2.
            start_values (np.ndarray): The max values of one keypoint in the
                corresponding limbs. Shape: M.
            end_values (np.ndarray): The max values of the other keypoint
                in the corresponding limbs. Shape: M.
        """

        sigma = self.sigma
        img_h, img_w = arr.shape

        for start, end, start_value, end_value in zip(starts, ends,
                                                      start_values,
                                                      end_values):
            # A limb is only as confident as its weaker endpoint.
            value_coeff = min(start_value, end_value)
            if value_coeff < self.eps:
                continue

            min_x, max_x = min(start[0], end[0]), max(start[0], end[0])
            min_y, max_y = min(start[1], end[1]), max(start[1], end[1])

            # 3-sigma window around the limb's bounding box, clipped to the
            # image bounds.
            min_x = max(int(min_x - 3 * sigma), 0)
            max_x = min(int(max_x + 3 * sigma) + 1, img_w)
            min_y = max(int(min_y - 3 * sigma), 0)
            max_y = min(int(max_y + 3 * sigma) + 1, img_h)

            x = np.arange(min_x, max_x, 1, np.float32)
            y = np.arange(min_y, max_y, 1, np.float32)

            if not (len(x) and len(y)):
                continue

            y = y[:, None]
            x_0 = np.zeros_like(x)
            y_0 = np.zeros_like(y)

            # Squared distance from each pixel to the start endpoint.
            d2_start = ((x - start[0])**2 + (y - start[1])**2)

            # Squared distance from each pixel to the end endpoint.
            d2_end = ((x - end[0])**2 + (y - end[1])**2)

            # Squared limb length.
            d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2)

            # Degenerate (near zero-length) limb: draw it as a keypoint.
            if d2_ab < 1:
                self.generate_a_heatmap(arr, start[None], start_value[None])
                continue

            # Normalized projection parameter of each pixel onto the
            # start->end line (0 at start, 1 at end).
            coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab

            # Per-pixel masks: closest to start, closest to end, or closest
            # to the segment interior, respectively.
            a_dominate = coeff <= 0
            b_dominate = coeff >= 1
            seg_dominate = 1 - a_dominate - b_dominate

            # `x + y_0` / `y + x_0` broadcast the 1D ranges into a 2D grid.
            position = np.stack([x + y_0, y + x_0], axis=-1)
            projection = start + np.stack([coeff, coeff], axis=-1) * (
                end - start)
            d2_line = position - projection
            d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2
            # Squared distance from each pixel to the limb segment.
            d2_seg = (
                a_dominate * d2_start + b_dominate * d2_end +
                seg_dominate * d2_line)

            patch = np.exp(-d2_seg / 2. / sigma**2)
            patch = patch * value_coeff

            # Merge into the canvas by maximum.
            arr[min_y:max_y, min_x:max_x] = \
                np.maximum(arr[min_y:max_y, min_x:max_x], patch)

    def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray,
                         max_values: np.ndarray) -> None:
        """Generate pseudo heatmap for all keypoints and limbs in one frame (if
        needed).

        Args:
            arr (np.ndarray): The array to store the generated heatmaps.
                Shape: V * img_h * img_w.
            kps (np.ndarray): The coordinates of keypoints in this frame.
                Shape: M * V * 2.
            max_values (np.ndarray): The confidence score of each keypoint.
                Shape: M * V.
        """

        if self.with_kp:
            num_kp = kps.shape[1]
            # One channel per keypoint, aggregated over all persons.
            for i in range(num_kp):
                self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i])

        if self.with_limb:
            # NOTE(review): when both `with_kp` and `with_limb` are enabled,
            # limb heatmaps are written into channels 0..len(skeletons)-1,
            # which are the same channels the keypoint loop above used —
            # confirm whether the combined mode is intended to be used.
            for i, limb in enumerate(self.skeletons):
                start_idx, end_idx = limb
                starts = kps[:, start_idx]
                ends = kps[:, end_idx]

                start_values = max_values[:, start_idx]
                end_values = max_values[:, end_idx]
                self.generate_a_limb_heatmap(arr[i], starts, ends,
                                             start_values, end_values)

    def gen_an_aug(self, results: Dict) -> np.ndarray:
        """Generate pseudo heatmaps for all frames.

        Args:
            results (dict): The dictionary that contains all info of a sample.

        Returns:
            np.ndarray: The generated pseudo heatmaps.
        """

        all_kps = results['keypoint'].astype(np.float32)
        kp_shape = all_kps.shape

        if 'keypoint_score' in results:
            all_kpscores = results['keypoint_score']
        else:
            # Without explicit scores, every keypoint is fully confident.
            all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32)

        img_h, img_w = results['img_shape']

        # Scale the canvas and the keypoint coordinates together.
        img_h = int(img_h * self.scaling + 0.5)
        img_w = int(img_w * self.scaling + 0.5)
        all_kps[..., :2] *= self.scaling

        num_frame = kp_shape[1]
        # One output channel per keypoint and/or per limb.
        num_c = 0
        if self.with_kp:
            num_c += all_kps.shape[2]
        if self.with_limb:
            num_c += len(self.skeletons)

        ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32)

        for i in range(num_frame):

            # All persons' keypoints for frame i (shape: M * V * 2).
            kps = all_kps[:, i]

            kpscores = all_kpscores[:, i] if self.use_score else \
                np.ones_like(all_kpscores[:, i])

            self.generate_heatmap(ret[i], kps, kpscores)
        return ret

    def transform(self, results: Dict) -> Dict:
        """Generate pseudo heatmaps based on joint coordinates and confidence.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        heatmap = self.gen_an_aug(results)
        # Store under 'heatmap_imgs' if RGB frames are already present.
        key = 'heatmap_imgs' if 'imgs' in results else 'imgs'

        if self.double:
            # Append a horizontally flipped copy. Left/right channels must
            # be swapped so flipped heatmaps stay semantically consistent.
            # NOTE(review): only one of the kp/limb index sets is used here;
            # confirm behavior when `with_kp` and `with_limb` are both True.
            indices = np.arange(heatmap.shape[1], dtype=np.int64)
            left, right = (self.left_kp, self.right_kp) if self.with_kp else (
                self.left_limb, self.right_limb)
            for l, r in zip(left, right):
                indices[l] = r
                indices[r] = l
            heatmap_flip = heatmap[..., ::-1][:, indices]
            heatmap = np.concatenate([heatmap, heatmap_flip])
        results[key] = heatmap
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'sigma={self.sigma}, '
                    f'use_score={self.use_score}, '
                    f'with_kp={self.with_kp}, '
                    f'with_limb={self.with_limb}, '
                    f'skeletons={self.skeletons}, '
                    f'double={self.double}, '
                    f'left_kp={self.left_kp}, '
                    f'right_kp={self.right_kp}, '
                    f'left_limb={self.left_limb}, '
                    f'right_limb={self.right_limb}, '
                    f'scaling={self.scaling})')
        return repr_str
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class PoseCompact(BaseTransform):
    """Convert the coordinates of keypoints to make it more compact.
    Specifically, it first find a tight bounding box that surrounds all joints
    in each frame, then we expand the tight box by a given padding ratio. For
    example, if 'padding == 0.25', then the expanded box has unchanged center,
    and 1.25x width and height.

    Required Keys:

        - keypoint
        - img_shape

    Modified Keys:

        - img_shape
        - keypoint

    Added Keys:

        - crop_quadruple

    Args:
        padding (float): The padding size. Defaults to 0.25.
        threshold (int): The threshold for the tight bounding box. If the width
            or height of the tight bounding box is smaller than the threshold,
            we do not perform the compact operation. Defaults to 10.
        hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded
            box. Float indicates the specific ratio and tuple indicates a
            ratio range. If set as None, it means there is no requirement on
            hw_ratio. Defaults to None.
        allow_imgpad (bool): Whether to allow expanding the box outside the
            image to meet the hw_ratio requirement. Defaults to True.
    """

    def __init__(self,
                 padding: float = 0.25,
                 threshold: int = 10,
                 hw_ratio: Optional[Union[float, Tuple[float]]] = None,
                 allow_imgpad: bool = True) -> None:

        self.padding = padding
        self.threshold = threshold
        if hw_ratio is not None:
            # Normalize a scalar ratio to a (min, max) pair.
            hw_ratio = _pair(hw_ratio)

        self.hw_ratio = hw_ratio

        self.allow_imgpad = allow_imgpad
        assert self.padding >= 0

    def transform(self, results: Dict) -> Dict:
        """Convert the coordinates of keypoints to make it more compact.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        img_shape = results['img_shape']
        h, w = img_shape
        kp = results['keypoint']

        # Make NaN coordinates zero: zero is treated as "missing" below.
        kp[np.isnan(kp)] = 0.
        kp_x = kp[..., 0]
        kp_y = kp[..., 1]

        # Tight bounding box over all valid (non-zero) joints. Fix: use
        # `np.inf` — the `np.Inf` alias was removed in NumPy 2.0.
        min_x = np.min(kp_x[kp_x != 0], initial=np.inf)
        min_y = np.min(kp_y[kp_y != 0], initial=np.inf)
        max_x = np.max(kp_x[kp_x != 0], initial=-np.inf)
        max_y = np.max(kp_y[kp_y != 0], initial=-np.inf)

        # Box too small (or no valid joint at all): skip the compaction.
        if max_x - min_x < self.threshold or max_y - min_y < self.threshold:
            return results

        center = ((max_x + min_x) / 2, (max_y + min_y) / 2)
        half_width = (max_x - min_x) / 2 * (1 + self.padding)
        half_height = (max_y - min_y) / 2 * (1 + self.padding)

        if self.hw_ratio is not None:
            # Enlarge (never shrink) the box to satisfy the h/w ratio range.
            half_height = max(self.hw_ratio[0] * half_width, half_height)
            half_width = max(1 / self.hw_ratio[1] * half_height, half_width)

        min_x, max_x = center[0] - half_width, center[0] + half_width
        min_y, max_y = center[1] - half_height, center[1] + half_height

        # Clip the box to the image if padding beyond it is not allowed.
        if not self.allow_imgpad:
            min_x, min_y = int(max(0, min_x)), int(max(0, min_y))
            max_x, max_y = int(min(w, max_x)), int(min(h, max_y))
        else:
            min_x, min_y = int(min_x), int(min_y)
            max_x, max_y = int(max_x), int(max_y)

        # Shift valid joints into the new (cropped) coordinate frame.
        kp_x[kp_x != 0] -= min_x
        kp_y[kp_y != 0] -= min_y

        new_shape = (max_y - min_y, max_x - min_x)
        results['img_shape'] = new_shape

        # Accumulate this crop into the running crop_quadruple, expressed
        # as (x_offset, y_offset, width, height) relative to the original.
        crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.))
        new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w,
                              (max_y - min_y) / h)
        crop_quadruple = _combine_quadruple(crop_quadruple, new_crop_quadruple)
        results['crop_quadruple'] = crop_quadruple
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '
                    f'threshold={self.threshold}, '
                    f'hw_ratio={self.hw_ratio}, '
                    f'allow_imgpad={self.allow_imgpad})')
        return repr_str
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class PreNormalize3D(BaseTransform):
    """PreNormalize for NTURGB+D 3D keypoints (x, y, z).

    PreNormalize3D first subtracts the coordinates of each joint
    from the coordinates of the 'spine' (joint #1 in ntu) of the first person
    in the first frame. Subsequently, it performs a 3D rotation to fix the Z
    axis parallel to the 3D vector from the 'hip' (joint #0) and the 'spine'
    (joint #1) and the X axis toward the 3D vector from the 'right shoulder'
    (joint #8) and the 'left shoulder' (joint #4). Codes adapted from
    https://github.com/lshiwjx/2s-AGCN.

    Required Keys:

        - keypoint
        - total_frames (optional)

    Modified Keys:

        - keypoint

    Added Keys:

        - body_center

    Args:
        zaxis (list[int]): The target Z axis for the 3D rotation.
            Defaults to ``[0, 1]``.
        xaxis (list[int]): The target X axis for the 3D rotation.
            Defaults to ``[8, 4]``.
        align_spine (bool): Whether to perform a 3D rotation to
            align the spine. Defaults to True.
        align_shoulder (bool): Whether to perform a 3D rotation
            to align the shoulder. Defaults to True.
        align_center (bool): Whether to align the body center.
            Defaults to True.
    """

    def __init__(self,
                 zaxis: List[int] = [0, 1],
                 xaxis: List[int] = [8, 4],
                 align_spine: bool = True,
                 align_shoulder: bool = True,
                 align_center: bool = True) -> None:
        self.zaxis = zaxis
        self.xaxis = xaxis
        self.align_center = align_center
        self.align_spine = align_spine
        self.align_shoulder = align_shoulder

    def unit_vector(self, vector: np.ndarray) -> np.ndarray:
        """Returns the unit vector of the vector."""
        return vector / np.linalg.norm(vector)

    def angle_between(self, v1: np.ndarray, v2: np.ndarray) -> float:
        """Returns the angle in radians between vectors 'v1' and 'v2'."""
        # A (near-)zero vector has no direction; define the angle as 0 so
        # the downstream rotation becomes the identity.
        if np.abs(v1).sum() < 1e-6 or np.abs(v2).sum() < 1e-6:
            return 0
        v1_u = self.unit_vector(v1)
        v2_u = self.unit_vector(v2)
        # Clip to avoid NaNs from floating point drift outside [-1, 1].
        return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))

    def rotation_matrix(self, axis: np.ndarray, theta: float) -> np.ndarray:
        """Returns the rotation matrix associated with counterclockwise
        rotation about the given axis by theta radians."""
        # Degenerate axis or negligible angle: identity rotation.
        if np.abs(axis).sum() < 1e-6 or np.abs(theta) < 1e-6:
            return np.eye(3)
        axis = np.asarray(axis)
        axis = axis / np.sqrt(np.dot(axis, axis))
        # Rotation matrix built from quaternion components
        # (Euler-Rodrigues formula).
        a = np.cos(theta / 2.0)
        b, c, d = -axis * np.sin(theta / 2.0)
        aa, bb, cc, dd = a * a, b * b, c * c, d * d
        bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
        return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
                         [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
                         [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PreNormalize3D`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        skeleton = results['keypoint']
        total_frames = results.get('total_frames', skeleton.shape[1])

        M, T, V, C = skeleton.shape
        assert T == total_frames
        if skeleton.sum() == 0:
            # Fully empty skeleton: nothing to normalize.
            return results

        # Frames in which person #0 has at least one non-zero joint.
        index0 = [
            i for i in range(T) if not np.all(np.isclose(skeleton[0, i], 0))
        ]

        assert M in [1, 2]
        if M == 2:
            index1 = [
                i for i in range(T)
                if not np.all(np.isclose(skeleton[1, i], 0))
            ]
            if len(index0) < len(index1):
                # Person #1 is visible in more frames: promote it to be the
                # primary person and crop to its visible frames.
                skeleton = skeleton[:, np.array(index1)]
                skeleton = skeleton[[1, 0]]
            else:
                skeleton = skeleton[:, np.array(index0)]
        else:
            skeleton = skeleton[:, np.array(index0)]

        T_new = skeleton.shape[1]

        # Bugfix: `main_body_center` used to be assigned only inside the
        # `align_center` branch while being read unconditionally below, so
        # `align_center=False` raised a NameError. Default to a zero center.
        main_body_center = np.zeros(C, dtype=skeleton.dtype)
        if self.align_center:
            if skeleton.shape[2] == 25:
                # NTU 25-joint layout: joint #1 is the spine (body center).
                main_body_center = skeleton[0, 0, 1].copy()
            else:
                main_body_center = skeleton[0, 0, -1].copy()
            # Shift only valid (non-zero) joints, keeping padding at zero.
            mask = ((skeleton != 0).sum(-1) > 0)[..., None]
            skeleton = (skeleton - main_body_center) * mask

        if self.align_spine:
            # Rotate so the bottom->top spine vector aligns with the Z axis.
            joint_bottom = skeleton[0, 0, self.zaxis[0]]
            joint_top = skeleton[0, 0, self.zaxis[1]]
            axis = np.cross(joint_top - joint_bottom, [0, 0, 1])
            angle = self.angle_between(joint_top - joint_bottom, [0, 0, 1])
            matrix_z = self.rotation_matrix(axis, angle)
            skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_z)

        if self.align_shoulder:
            # Rotate so the shoulder line aligns with the X axis.
            joint_rshoulder = skeleton[0, 0, self.xaxis[0]]
            joint_lshoulder = skeleton[0, 0, self.xaxis[1]]
            axis = np.cross(joint_rshoulder - joint_lshoulder, [1, 0, 0])
            angle = self.angle_between(joint_rshoulder - joint_lshoulder,
                                       [1, 0, 0])
            matrix_x = self.rotation_matrix(axis, angle)
            skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_x)

        results['keypoint'] = skeleton
        results['total_frames'] = T_new
        results['body_center'] = main_body_center
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'zaxis={self.zaxis}, '
                    f'xaxis={self.xaxis}, '
                    f'align_center={self.align_center}, '
                    f'align_spine={self.align_spine}, '
                    f'align_shoulder={self.align_shoulder})')
        return repr_str
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class PreNormalize2D(BaseTransform):
    """Normalize the range of keypoint values.

    Required Keys:

        - keypoint
        - img_shape (optional)

    Modified Keys:

        - keypoint

    Args:
        img_shape (tuple[int, int]): The resolution of the original video.
            Defaults to ``(1080, 1920)``.
    """

    def __init__(self, img_shape: Tuple[int, int] = (1080, 1920)) -> None:
        self.img_shape = img_shape

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PreNormalize2D`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        height, width = results.get('img_shape', self.img_shape)
        half_w = width / 2
        half_h = height / 2
        keypoint = results['keypoint']
        # Shift the origin to the image center, then scale each axis by
        # half the image size, mapping coordinates into [-1, 1] in place.
        keypoint[..., 0] = (keypoint[..., 0] - half_w) / half_w
        keypoint[..., 1] = (keypoint[..., 1] - half_h) / half_h
        return results

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'img_shape={self.img_shape})')
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class JointToBone(BaseTransform):
    """Convert the joint information to bone information.

    Required Keys:

        - keypoint

    Modified Keys:

        - keypoint

    Args:
        dataset (str): Define the type of dataset: 'nturgb+d', 'openpose',
            'coco'. Defaults to ``'nturgb+d'``.
        target (str): The target key for the bone information.
            Defaults to ``'keypoint'``.
    """

    def __init__(self,
                 dataset: str = 'nturgb+d',
                 target: str = 'keypoint') -> None:
        self.dataset = dataset
        self.target = target
        if self.dataset not in ['nturgb+d', 'openpose', 'coco']:
            raise ValueError(
                f'The dataset type {self.dataset} is not supported')
        # (child, parent) joint index pairs for each skeleton layout.
        pair_table = {
            'nturgb+d': [(0, 1), (1, 20), (2, 20), (3, 2), (4, 20), (5, 4),
                         (6, 5), (7, 6), (8, 20), (9, 8), (10, 9), (11, 10),
                         (12, 0), (13, 12), (14, 13), (15, 14), (16, 0),
                         (17, 16), (18, 17), (19, 18), (21, 22), (20, 20),
                         (22, 7), (23, 24), (24, 11)],
            'openpose': ((0, 0), (1, 0), (2, 1), (3, 2), (4, 3), (5, 1),
                         (6, 5), (7, 6), (8, 2), (9, 8), (10, 9), (11, 5),
                         (12, 11), (13, 12), (14, 0), (15, 0), (16, 14),
                         (17, 15)),
            'coco': ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), (6, 0),
                     (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), (12, 0),
                     (13, 11), (14, 12), (15, 13), (16, 14)),
        }
        self.pairs = pair_table[self.dataset]

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`JointToBone`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        keypoint = results['keypoint']
        M, T, V, C = keypoint.shape
        bone = np.zeros((M, T, V, C), dtype=np.float32)

        assert C in [2, 3]
        # For 2D datasets with scores, channel 2 holds confidence instead
        # of a z coordinate.
        has_score = C == 3 and self.dataset in ['openpose', 'coco']
        for child, parent in self.pairs:
            bone[..., child, :] = \
                keypoint[..., child, :] - keypoint[..., parent, :]
            if has_score:
                # A bone's confidence is the mean of its endpoint scores.
                bone[..., child, 2] = (
                    keypoint[..., child, 2] + keypoint[..., parent, 2]) / 2

        results[self.target] = bone
        return results

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'dataset={self.dataset}, '
                f'target={self.target})')
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class ToMotion(BaseTransform):
    """Convert the joint information or bone information to corresponding
    motion information.

    Required Keys:

        - keypoint

    Added Keys:

        - motion

    Args:
        dataset (str): Define the type of dataset: 'nturgb+d', 'openpose',
            'coco'. Defaults to ``'nturgb+d'``.
        source (str): The source key for the joint or bone information.
            Defaults to ``'keypoint'``.
        target (str): The target key for the motion information.
            Defaults to ``'motion'``.
    """

    def __init__(self,
                 dataset: str = 'nturgb+d',
                 source: str = 'keypoint',
                 target: str = 'motion') -> None:
        self.dataset = dataset
        self.source = source
        self.target = target

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`ToMotion`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        feats = results[self.source]
        M, T, V, C = feats.shape
        motion = np.zeros_like(feats)

        assert C in [2, 3]
        # Frame-to-frame displacement; the final frame stays all-zero.
        motion[:, :T - 1] = np.diff(feats, axis=1)
        if C == 3 and self.dataset in ['openpose', 'coco']:
            # Channel 2 is a confidence score for these 2D datasets, so
            # replace the diffed scores with the mean of adjacent frames.
            motion[:, :T - 1, :, 2] = (
                feats[:, :T - 1, :, 2] + feats[:, 1:, :, 2]) / 2

        results[self.target] = motion

        return results

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'dataset={self.dataset}, '
                f'source={self.source}, '
                f'target={self.target})')
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class MergeSkeFeat(BaseTransform):
    """Merge multi-stream features.

    Args:
        feat_list (list[str]): The list of the keys of features.
            Defaults to ``['keypoint']``.
        target (str): The target key for the merged multi-stream information.
            Defaults to ``'keypoint'``.
        axis (int): The axis along which the features will be joined.
            Defaults to -1.
    """

    def __init__(self,
                 feat_list: List[str] = ['keypoint'],
                 target: str = 'keypoint',
                 axis: int = -1) -> None:
        self.feat_list = feat_list
        self.target = target
        self.axis = axis

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MergeSkeFeat`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        # Pop each source stream from the dict, then join them into one
        # array under the target key.
        streams = [results.pop(key) for key in self.feat_list]
        results[self.target] = np.concatenate(streams, axis=self.axis)
        return results

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'feat_list={self.feat_list}, '
                f'target={self.target}, '
                f'axis={self.axis})')
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class GenSkeFeat(BaseTransform):
    """Unified interface for generating multi-stream skeleton features.

    Required Keys:

        - keypoint
        - keypoint_score (optional)

    Args:
        dataset (str): Define the type of dataset: 'nturgb+d', 'openpose',
            'coco'. Defaults to ``'nturgb+d'``.
        feats (list[str]): The list of the keys of features.
            Defaults to ``['j']``.
        axis (int): The axis along which the features will be joined.
            Defaults to -1.
    """

    def __init__(self,
                 dataset: str = 'nturgb+d',
                 feats: List[str] = ['j'],
                 axis: int = -1) -> None:
        self.dataset = dataset
        self.feats = feats
        self.axis = axis
        # Build an internal sub-pipeline that derives every requested
        # stream ('j' joints, 'b' bones, 'jm'/'bm' motions) and merges them.
        pipeline = []
        if 'b' in feats or 'bm' in feats:
            pipeline.append(JointToBone(dataset=dataset, target='b'))
        pipeline.append(KeyMapper(remapping={'keypoint': 'j'}))
        if 'jm' in feats:
            pipeline.append(ToMotion(dataset=dataset, source='j', target='jm'))
        if 'bm' in feats:
            pipeline.append(ToMotion(dataset=dataset, source='b', target='bm'))
        pipeline.append(MergeSkeFeat(feat_list=feats, axis=axis))
        self.ops = Compose(pipeline)

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`GenSkeFeat`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        if 'keypoint_score' in results and 'keypoint' in results:
            assert self.dataset != 'nturgb+d'
            assert results['keypoint'].shape[
                -1] == 2, 'Only 2D keypoints have keypoint_score. '
            # Fold the per-keypoint score in as a third channel so the
            # sub-pipeline operates on a single array.
            kp = results.pop('keypoint')
            score = results.pop('keypoint_score')
            results['keypoint'] = np.concatenate([kp, score[..., None]], -1)
        return self.ops(results)

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'dataset={self.dataset}, '
                f'feats={self.feats}, '
                f'axis={self.axis})')
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class UniformSampleFrames(BaseTransform):
    """Uniformly sample frames from the video.

    To sample an n-frame clip from the video. UniformSampleFrames basically
    divide the video into n segments of equal length and randomly sample one
    frame from each segment. To make the testing results reproducible, a
    random seed is set during testing, to make the sampling results
    deterministic.

    Required Keys:

        - total_frames
        - start_index (optional)

    Added Keys:

        - frame_inds
        - frame_interval
        - num_clips
        - clip_len

    Args:
        clip_len (int): Frames of each sampled output clip.
        num_clips (int): Number of clips to be sampled. Defaults to 1.
        test_mode (bool): Store True when building test or validation dataset.
            Defaults to False.
        seed (int): The random seed used during test time. Defaults to 255.
    """

    def __init__(self,
                 clip_len: int,
                 num_clips: int = 1,
                 test_mode: bool = False,
                 seed: int = 255) -> None:
        self.clip_len = clip_len
        self.num_clips = num_clips
        self.test_mode = test_mode
        self.seed = seed

    def _get_train_clips(self, num_frames: int, clip_len: int) -> np.ndarray:
        """Uniformly sample indices for training clips.

        Args:
            num_frames (int): The number of frames.
            clip_len (int): The length of the clip.

        Returns:
            np.ndarray: The sampled indices for training clips.
        """
        all_inds = []
        for clip_idx in range(self.num_clips):
            if num_frames < clip_len:
                # Fewer frames than needed: pick a random start; the
                # out-of-range indices are wrapped later via `np.mod` in
                # `transform`.
                start = np.random.randint(0, num_frames)
                inds = np.arange(start, start + clip_len)
            elif clip_len <= num_frames < 2 * clip_len:
                # Between 1x and 2x clip_len: start from consecutive indices
                # and randomly advance past `num_frames - clip_len` of the
                # `clip_len + 1` gap positions, keeping the sampled indices
                # sorted and unique.
                basic = np.arange(clip_len)
                inds = np.random.choice(
                    clip_len + 1, num_frames - clip_len, replace=False)
                offset = np.zeros(clip_len + 1, dtype=np.int32)
                offset[inds] = 1
                offset = np.cumsum(offset)
                inds = basic + offset[:-1]
            else:
                # At least 2x clip_len: split the video into `clip_len`
                # near-equal segments and randomly pick one frame from each.
                bids = np.array(
                    [i * num_frames // clip_len for i in range(clip_len + 1)])
                bsize = np.diff(bids)
                bst = bids[:clip_len]
                offset = np.random.randint(bsize)
                inds = bst + offset

            all_inds.append(inds)

        return np.concatenate(all_inds)

    def _get_test_clips(self, num_frames: int, clip_len: int) -> np.ndarray:
        """Uniformly sample indices for testing clips.

        Args:
            num_frames (int): The number of frames.
            clip_len (int): The length of the clip.

        Returns:
            np.ndarray: The sampled indices for testing clips.
        """

        # Fixed seed so that test-time sampling is deterministic.
        np.random.seed(self.seed)
        all_inds = []
        for i in range(self.num_clips):
            if num_frames < clip_len:
                # Spread clip start points across the (short) video; wrapped
                # later via `np.mod` in `transform`.
                start_ind = i if num_frames < self.num_clips \
                    else i * num_frames // self.num_clips
                inds = np.arange(start_ind, start_ind + clip_len)
            elif clip_len <= num_frames < clip_len * 2:
                # Same sorted-unique sampling scheme as in training.
                # NOTE(review): uses dtype int64 here vs int32 in
                # `_get_train_clips` — values are 0/1 flags, so no overflow.
                basic = np.arange(clip_len)
                inds = np.random.choice(
                    clip_len + 1, num_frames - clip_len, replace=False)
                offset = np.zeros(clip_len + 1, dtype=np.int64)
                offset[inds] = 1
                offset = np.cumsum(offset)
                inds = basic + offset[:-1]
            else:
                # One frame per near-equal segment, as in training.
                bids = np.array(
                    [i * num_frames // clip_len for i in range(clip_len + 1)])
                bsize = np.diff(bids)
                bst = bids[:clip_len]
                offset = np.random.randint(bsize)
                inds = bst + offset

            all_inds.append(inds)

        return np.concatenate(all_inds)

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`UniformSampleFrames`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        num_frames = results['total_frames']

        if self.test_mode:
            inds = self._get_test_clips(num_frames, self.clip_len)
        else:
            inds = self._get_train_clips(num_frames, self.clip_len)

        # Wrap indices that ran past the end of the video (short videos).
        inds = np.mod(inds, num_frames)
        start_index = results.get('start_index', 0)
        inds = inds + start_index

        if 'keypoint' in results:
            kp = results['keypoint']
            assert num_frames == kp.shape[1]
            num_person = kp.shape[0]
            # Per-frame person count: trailing all-zero (padded) person
            # entries do not count.
            num_persons = [num_person] * num_frames
            for i in range(num_frames):
                j = num_person - 1
                while j >= 0 and np.all(np.abs(kp[j, i]) < 1e-5):
                    j -= 1
                num_persons[i] = j + 1
            # Mark frames adjacent to a change in person count as
            # "transitional".
            transitional = [False] * num_frames
            for i in range(1, num_frames - 1):
                if num_persons[i] != num_persons[i - 1]:
                    transitional[i] = transitional[i - 1] = True
                if num_persons[i] != num_persons[i + 1]:
                    transitional[i] = transitional[i + 1] = True
            inds_int = inds.astype(np.int64)
            coeff = np.array([transitional[i] for i in inds_int])
            # NOTE(review): since `inds` is already integral here, this blend
            # reduces to a float32 cast of `inds`; confirm whether the
            # transitional blend is intentional.
            inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32)

        results['frame_inds'] = inds.astype(np.int32)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = None
        results['num_clips'] = self.num_clips
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'clip_len={self.clip_len}, '
                    f'num_clips={self.num_clips}, '
                    f'test_mode={self.test_mode}, '
                    f'seed={self.seed})')
        return repr_str
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class PadTo(BaseTransform):
    """Sample frames from the video.

    To sample an n-frame clip from the video, PadTo samples
    the frames from zero index, and loop or zero pad the frames
    if the length of video frames is less than the value of `length`.

    Required Keys:

        - keypoint
        - total_frames
        - start_index (optional)

    Modified Keys:

        - keypoint
        - total_frames

    Args:
        length (int): The maximum length of the sampled output clip.
        mode (str): The padding mode. Defaults to ``'loop'``.
    """

    def __init__(self, length: int, mode: str = 'loop') -> None:
        assert mode in ['loop', 'zero']
        self.length = length
        self.mode = mode

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PadTo`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        num_frames = results['total_frames']
        assert num_frames <= self.length
        start = results.get('start_index', 0)
        # Looping indices: wrap around the available frames.
        sample_inds = np.mod(np.arange(start, start + self.length),
                             num_frames)

        padded = results['keypoint'][:, sample_inds].copy()
        if self.mode == 'zero':
            # Zero out the padded tail instead of looping the clip.
            padded[:, num_frames:] = 0

        results['keypoint'] = padded
        results['total_frames'] = self.length
        return results

    def __repr__(self) -> str:
        """Return a readable summary of the transform configuration."""
        return f'{type(self).__name__}(length={self.length}, mode={self.mode})'
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class PoseDecode(BaseTransform):
    """Load and decode pose with given indices.

    Required Keys:

        - keypoint
        - total_frames (optional)
        - frame_inds (optional)
        - offset (optional)
        - keypoint_score (optional)

    Modified Keys:

        - keypoint
        - keypoint_score (optional)
    """

    @staticmethod
    def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray:
        """Load keypoints according to sampled indexes."""
        selected = kp[:, frame_inds]
        return selected.astype(np.float32)

    @staticmethod
    def _load_kpscore(kpscore: np.ndarray,
                      frame_inds: np.ndarray) -> np.ndarray:
        """Load keypoint scores according to sampled indexes."""
        selected = kpscore[:, frame_inds]
        return selected.astype(np.float32)

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PoseDecode`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        # Fill in defaults for the optional keys.
        if 'total_frames' not in results:
            results['total_frames'] = results['keypoint'].shape[1]
        if 'frame_inds' not in results:
            results['frame_inds'] = np.arange(results['total_frames'])
        if results['frame_inds'].ndim != 1:
            results['frame_inds'] = np.squeeze(results['frame_inds'])

        # Shift the sampled indices by the clip offset (if any).
        frame_inds = results['frame_inds'] + results.get('offset', 0)

        if 'keypoint_score' in results:
            results['keypoint_score'] = self._load_kpscore(
                results['keypoint_score'], frame_inds)
        results['keypoint'] = self._load_kp(results['keypoint'], frame_inds)

        return results

    def __repr__(self) -> str:
        """Return a readable summary of the transform configuration."""
        return f'{type(self).__name__}()'
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class MMUniformSampleFrames(UniformSampleFrames):
    """Uniformly sample frames from the multi-modal data."""

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMUniformSampleFrames`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        num_frames = results['total_frames']
        sampler = self._get_test_clips if self.test_mode \
            else self._get_train_clips
        sampled_modalities = []
        # `self.clip_len` is a dict mapping each modality to its clip length.
        for modality, clip_len in self.clip_len.items():
            inds = np.mod(sampler(num_frames, clip_len), num_frames)
            results[f'{modality}_inds'] = inds.astype(np.int32)
            sampled_modalities.append(modality)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = None
        results['num_clips'] = self.num_clips
        if not isinstance(results['modality'], list):
            # Override the scalar modality with the full sampled list.
            results['modality'] = sampled_modalities
        return results
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class MMDecode(DecordInit, DecordDecode, PoseDecode):
    """Decode RGB videos and skeletons."""

    def __init__(self, io_backend: str = 'disk', **kwargs) -> None:
        # Initialize the decord reader/decoder mix-ins explicitly.
        DecordInit.__init__(self, io_backend=io_backend, **kwargs)
        DecordDecode.__init__(self)
        self.io_backend = io_backend
        self.kwargs = kwargs
        self.file_client = None

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMDecode`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        for modality in results['modality']:
            ind_key = f'{modality}_inds'
            if results[ind_key].ndim != 1:
                results[ind_key] = np.squeeze(results[ind_key])
            frame_inds = results[ind_key]
            if modality == 'RGB':
                if 'filename' not in results:
                    results['filename'] = results['frame_dir'] + '.mp4'
                video_reader = self._get_video_reader(results['filename'])
                results['imgs'] = self._decord_load_frames(
                    video_reader, frame_inds)
                # Release the decoder handle as soon as frames are loaded.
                del video_reader
            elif modality == 'Pose':
                assert 'keypoint' in results
                if 'keypoint_score' not in results:
                    # No confidence available: assume every joint is valid.
                    results['keypoint_score'] = np.stack([
                        np.ones(kp.shape[:-1], dtype=np.float32)
                        for kp in results['keypoint']
                    ])
                results['keypoint'] = self._load_kp(results['keypoint'],
                                                    frame_inds)
                results['keypoint_score'] = self._load_kpscore(
                    results['keypoint_score'], frame_inds)
            else:
                raise NotImplementedError(
                    f'MMDecode: Modality {modality} not supported')

        # If the decoded frames differ in size from the annotated shape,
        # rescale the keypoint coordinates accordingly.
        if 'imgs' in results and 'keypoint' in results:
            decoded_shape = results['imgs'][0].shape[:2]
            if decoded_shape != results['img_shape']:
                oh, ow = results['img_shape']
                nh, nw = decoded_shape
                assert results['keypoint'].shape[-1] in [2, 3]
                results['keypoint'][..., 0] *= (nw / ow)
                results['keypoint'][..., 1] *= (nh / oh)
                results['img_shape'] = decoded_shape
                results['original_shape'] = decoded_shape

        return results

    def __repr__(self) -> str:
        """Return a readable summary of the transform configuration."""
        return f'{type(self).__name__}(io_backend={self.io_backend})'
|
|
|
|
|
|
|
|
|
@TRANSFORMS.register_module()
class MMCompact(BaseTransform):
    """Convert the coordinates of keypoints and crop the images to make them
    more compact.

    Required Keys:

        - imgs
        - keypoint
        - img_shape

    Modified Keys:

        - imgs
        - keypoint
        - img_shape

    Args:
        padding (float): The padding size. Defaults to 0.25.
        threshold (int): The threshold for the tight bounding box. If the width
            or height of the tight bounding box is smaller than the threshold,
            we do not perform the compact operation. Defaults to 10.
        hw_ratio (float | tuple[float]): The hw_ratio of the expanded
            box. Float indicates the specific ratio and tuple indicates a
            ratio range. If set as None, it means there is no requirement on
            hw_ratio. Defaults to 1.
        allow_imgpad (bool): Whether to allow expanding the box outside the
            image to meet the hw_ratio requirement. Defaults to True.
    """

    def __init__(self,
                 padding: float = 0.25,
                 threshold: int = 10,
                 hw_ratio: Union[float, Tuple[float]] = 1,
                 allow_imgpad: bool = True) -> None:

        self.padding = padding
        self.threshold = threshold
        if hw_ratio is not None:
            # Normalize a scalar ratio into a (min, max) pair.
            hw_ratio = _pair(hw_ratio)
        self.hw_ratio = hw_ratio
        self.allow_imgpad = allow_imgpad
        assert self.padding >= 0

    def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple:
        """Calculate the bounding box surrounding all joints in the frames.

        Args:
            keypoint (np.ndarray): Keypoint coordinates with x at channel 0
                and y at channel 1. Zero coordinates are treated as invalid
                and ignored.
            img_shape (tuple[int]): The image shape as (h, w).

        Returns:
            tuple: The expanded box as (min_x, min_y, max_x, max_y).
        """
        h, w = img_shape

        kp_x = keypoint[..., 0]
        kp_y = keypoint[..., 1]

        # `np.Inf` was removed in NumPy 2.0; use the lowercase `np.inf`.
        # The `initial` argument makes min/max well-defined even when every
        # coordinate is zero (empty selection): the box check below then
        # fails and the whole frame is returned.
        min_x = np.min(kp_x[kp_x != 0], initial=np.inf)
        min_y = np.min(kp_y[kp_y != 0], initial=np.inf)
        max_x = np.max(kp_x[kp_x != 0], initial=-np.inf)
        max_y = np.max(kp_y[kp_y != 0], initial=-np.inf)

        # Tight box too small (or no valid joints): skip the compact op.
        if max_x - min_x < self.threshold or max_y - min_y < self.threshold:
            return 0, 0, w, h

        center = ((max_x + min_x) / 2, (max_y + min_y) / 2)
        half_width = (max_x - min_x) / 2 * (1 + self.padding)
        half_height = (max_y - min_y) / 2 * (1 + self.padding)

        if self.hw_ratio is not None:
            # Expand the box so h/w stays within [hw_ratio[0], hw_ratio[1]].
            half_height = max(self.hw_ratio[0] * half_width, half_height)
            half_width = max(1 / self.hw_ratio[1] * half_height, half_width)

        min_x, max_x = center[0] - half_width, center[0] + half_width
        min_y, max_y = center[1] - half_height, center[1] + half_height

        if not self.allow_imgpad:
            # Clamp the box to the image boundary.
            min_x, min_y = int(max(0, min_x)), int(max(0, min_y))
            max_x, max_y = int(min(w, max_x)), int(min(h, max_y))
        else:
            # The box may extend outside the image; `_compact_images` pads.
            min_x, min_y = int(min_x), int(min_y)
            max_x, max_y = int(max_x), int(max_y)
        return min_x, min_y, max_x, max_y

    def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int],
                        box: Tuple[int]) -> List:
        """Crop the images according to the bounding box.

        Args:
            imgs (list[np.ndarray]): The frames to crop, each (h, w, c).
            img_shape (tuple[int]): The image shape as (h, w).
            box (tuple[int]): The crop box as (min_x, min_y, max_x, max_y);
                may extend beyond the image, in which case frames are
                zero-padded first.

        Returns:
            list: The cropped (and possibly padded) frames.
        """
        h, w = img_shape
        min_x, min_y, max_x, max_y = box
        pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0
        # Translate an out-of-image box into per-side padding amounts and a
        # box in padded-image coordinates.
        if min_x < 0:
            pad_l = -min_x
            min_x, max_x = 0, max_x + pad_l
            w += pad_l
        if min_y < 0:
            pad_u = -min_y
            min_y, max_y = 0, max_y + pad_u
            h += pad_u
        if max_x > w:
            pad_r = max_x - w
            w = max_x
        if max_y > h:
            pad_d = max_y - h
            h = max_y

        if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0:
            imgs = [
                np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0)))
                for img in imgs
            ]
        imgs = [img[min_y:max_y, min_x:max_x] for img in imgs]
        return imgs

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMCompact`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        img_shape = results['img_shape']
        kp = results['keypoint']
        # NaN keypoints are treated as invalid (zero) coordinates.
        kp[np.isnan(kp)] = 0.
        min_x, min_y, max_x, max_y = self._get_box(kp, img_shape)

        # Shift valid (non-zero) keypoints into the cropped coordinate frame.
        kp_x, kp_y = kp[..., 0], kp[..., 1]
        kp_x[kp_x != 0] -= min_x
        kp_y[kp_y != 0] -= min_y

        new_shape = (max_y - min_y, max_x - min_x)
        results['img_shape'] = new_shape
        results['imgs'] = self._compact_images(results['imgs'], img_shape,
                                               (min_x, min_y, max_x, max_y))
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '
                    f'threshold={self.threshold}, '
                    f'hw_ratio={self.hw_ratio}, '
                    f'allow_imgpad={self.allow_imgpad})')
        return repr_str
|
|
|
|