Spaces:

ttxskk
/

AiOS

Runtime error

File size: 38,411 Bytes

d7e58f0

import json

import cv2
import h5py
import numpy as np
import torch
import tqdm

from detrsmpl.models.body_models.builder import build_body_model
from detrsmpl.models.body_models.utils import batch_transform_to_camera_frame


class SMCReader:
    def __init__(self, file_path, body_model=None):
        """Open a SenseMocapFile (``.smc``) and cache its top-level metadata.

        Format description:
        https://github.com/open-mmlab/detrsmpl/blob/main/docs/smc.md

        Args:
            file_path (str):
                Path to an SMC file.
            body_model (nn.Module or dict, optional):
                Only looked at when the file contains an 'SMPL' group.
                nn.Module: kept as the body model instance
                dict: treated as a config and passed to build_body_model
                anything else: no body model is created here and a default
                SMPL config is stored instead
                Defaults to None.
        """
        self.smc = h5py.File(file_path, 'r')
        self.__calibration_dict__ = None

        # attributes stored on the file itself
        file_attrs = self.smc.attrs
        self.action_id = file_attrs['action_id']
        self.actor_id = file_attrs['actor_id']
        self.datetime_str = file_attrs['datetime_str']

        # Kinect metadata; resolutions are taken from device 0
        kinect_attrs = self.smc['Kinect'].attrs
        self.kinect_num_frames = kinect_attrs['num_frame']
        self.num_kinects = kinect_attrs['num_device']
        self.kinect_color_resolution = self.get_kinect_color_resolution(0)
        self.kinect_depth_resolution = self.get_kinect_depth_resolution(0)

        # optional iPhone metadata (resolutions are stored un-rotated)
        self.iphone_exists = 'iPhone' in self.smc.keys()
        self.num_iphones = 1
        if self.iphone_exists:
            iphone_attrs = self.smc['iPhone'].attrs
            self.iphone_num_frames = iphone_attrs['num_frame']
            self.iphone_color_resolution = iphone_attrs['color_resolution']
            self.iphone_depth_resolution = iphone_attrs['depth_resolution']

        # optional 3D keypoint metadata
        self.keypoint_exists = 'Keypoints3D' in self.smc.keys()
        if self.keypoint_exists:
            keypoint_attrs = self.smc['Keypoints3D'].attrs
            self.keypoints_num_frames = keypoint_attrs['num_frame']
            self.keypoints_convention = keypoint_attrs['convention']
            self.keypoints_created_time = keypoint_attrs['created_time']

        # optional SMPL metadata and body model
        self.smpl_exists = 'SMPL' in self.smc.keys()
        if self.smpl_exists:
            smpl_attrs = self.smc['SMPL'].attrs
            self.smpl_num_frames = smpl_attrs['num_frame']
            self.smpl_created_time = smpl_attrs['created_time']

            if isinstance(body_model, torch.nn.Module):
                self.body_model = body_model
            elif isinstance(body_model, dict):
                self.body_model = build_body_model(body_model)
            else:
                # Most readers only fetch images, so building a body model
                # up front would be wasted work; keep a default config
                # around instead.
                self.body_model = None
                self.default_body_model_config = {
                    'type': 'SMPL',
                    'gender': 'neutral',
                    'num_betas': 10,
                    'keypoint_src': 'smpl_45',
                    'keypoint_dst': 'smpl_45',
                    'model_path': 'data/body_models/smpl',
                    'batch_size': 1,
                }

    def get_kinect_color_extrinsics(self, kinect_id, homogeneous=True):
        """Get extrinsics(cam2world) of a kinect RGB camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.
            homogeneous (bool, optional):
                If true, returns rotation and translation in
                one 4x4 matrix. Defaults to True.

        Returns:
            homogeneous is True
                ndarray: A 4x4 matrix of rotation and translation(cam2world).
            homogeneous is False
                dict: A dict of rotation and translation,
                        keys are R and T,
                        each value is an ndarray.
        """
        R = np.asarray(self.calibration_dict[str(kinect_id * 2)]['R']).reshape(
            3, 3)
        T = np.asarray(self.calibration_dict[str(kinect_id *
                                                 2)]['T']).reshape(3)
        if homogeneous:
            extrinsics = np.identity(4, dtype=float)
            extrinsics[:3, :3] = R
            extrinsics[:3, 3] = T
            return extrinsics
        else:
            return {'R': R, 'T': T}

    @property
    def calibration_dict(self):
        """Get the dict of calibration.

        Returns:
            dict:
                A dict of calibrated extrinsics.
        """
        if self.__calibration_dict__ is not None:
            return self.__calibration_dict__
        else:
            return json.loads(self.smc['Extrinsics'][()])

    def get_kinect_depth_extrinsics(self, kinect_id, homogeneous=True):
        """Get extrinsics(cam2world) of a kinect depth camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.
            homogeneous (bool, optional):
                If true, returns rotation and translation in
                one 4x4 matrix. Defaults to True.

        Returns:
            homogeneous is True
                ndarray: A 4x4 matrix of rotation and translation(cam2world).
            homogeneous is False
                dict: A dict of rotation and translation,
                        keys are R and T,
                        each value is an ndarray.
        """
        R = np.asarray(self.calibration_dict[str(kinect_id * 2 +
                                                 1)]['R']).reshape(3, 3)
        T = np.asarray(self.calibration_dict[str(kinect_id * 2 +
                                                 1)]['T']).reshape(3)
        if homogeneous:
            extrinsics = np.identity(4, dtype=float)
            extrinsics[:3, :3] = R
            extrinsics[:3, 3] = T
            return extrinsics
        else:
            return {'R': R, 'T': T}

    def get_kinect_color_intrinsics(self, kinect_id):
        """Get intrinsics of a kinect RGB camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.

        Returns:
            ndarray: A 3x3 matrix.
        """
        kinect_dict = self.smc['Kinect'][str(kinect_id)]
        intrinsics = \
            kinect_dict['Calibration']['Color']['Intrinsics'][()]
        cx, cy, fx, fy = intrinsics[:4]
        intrinsics = \
            np.asarray([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
        return intrinsics

    def get_kinect_color_resolution(self, kinect_id):
        """Get resolution of a kinect RGB camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.

        Returns:
            ndarray:
                An ndarray of (width, height), shape=[2, ].
        """
        kinect_dict = self.smc['Kinect'][str(kinect_id)]
        resolution = \
            kinect_dict['Calibration']['Color']['Resolution'][()]
        return resolution

    def get_kinect_depth_resolution(self, kinect_id):
        """Get resolution of a kinect depth camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.

        Returns:
            ndarray:
                An ndarray of (width, height), shape=[2, ].
        """
        kinect_dict = self.smc['Kinect'][str(kinect_id)]
        resolution = \
            kinect_dict['Calibration']['Depth']['Resolution'][()]
        return resolution

    def get_kinect_depth_intrinsics(self, kinect_id):
        """Get intrinsics of a kinect depth camera by kinect id.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.

        Returns:
            ndarray: A 3x3 matrix.
        """
        kinect_dict = self.smc['Kinect'][str(kinect_id)]
        intrinsics = \
            kinect_dict['Calibration']['Depth']['Intrinsics'][()]
        cx, cy, fx, fy = intrinsics[:4]
        intrinsics = \
            np.asarray([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
        return intrinsics

    def get_iphone_intrinsics(self, iphone_id=0, frame_id=0, vertical=True):
        """Get intrinsics of an iPhone RGB camera by iPhone id.

        Args:
            iphone_id (int, optional):
                ID of an iPhone, starts from 0.
                Defaults to 0.
            frame_id (int, optional):
                int: frame id of one selected frame
                Defaults to 0.
            vertical (bool, optional):
                iPhone assumes landscape orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            ndarray: A 3x3 matrix.
        """
        camera_info = self.smc['iPhone'][str(iphone_id)]['CameraInfo'][str(
            frame_id)]
        camera_info = json.loads(camera_info[()])
        intrinsics = np.asarray(camera_info['cameraIntrinsics']).transpose()

        # Intrinsics have to be adjusted to achieve rotation
        #   1. swapping fx, fy
        #   2. cx -> image height - cy; cy -> cx
        if vertical:
            fx, fy = intrinsics[0, 0], intrinsics[1, 1]
            cx, cy = intrinsics[0, 2], intrinsics[1, 2]
            W, H = self.get_iphone_color_resolution(vertical=False)
            intrinsics = np.eye(3)
            intrinsics[0, 0], intrinsics[1, 1] = fy, fx
            intrinsics[0, 2], intrinsics[1, 2] = H - cy, cx

        return intrinsics

    def get_iphone_extrinsics(self,
                              iphone_id=0,
                              homogeneous=True,
                              vertical=True):
        """Get extrinsics(cam2world) of an iPhone RGB camera by iPhone id.

        Args:
            iphone_id (int, optional):
                ID of an iPhone, starts from 0.
                Defaults to 0.
            homogeneous (bool, optional):
                If true, returns rotation and translation in
                one 4x4 matrix. Defaults to True.
            vertical (bool, optional):
                iPhone assumes landscape orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            homogeneous is True
                ndarray: A 4x4 transformation matrix(cam2world).
            homogeneous is False
                dict: A dict of rotation and translation,
                    keys are R and T,
                    each value is an ndarray.
        """
        if iphone_id != 0:
            raise KeyError('Currently only one iPhone.')
        R = np.asarray(self.calibration_dict['iPhone']['R']).reshape(3, 3)
        T = np.asarray(self.calibration_dict['iPhone']['T']).reshape(3)

        # cam2world
        extrinsics = np.identity(4, dtype=float)
        extrinsics[:3, :3] = R
        extrinsics[:3, 3] = T

        # Extrinsics have to be adjusted to achieve rotation
        # A rotation matrix is applied on the extrinsics
        if vertical:
            # 90-degree clockwise rotation around z-axis
            R = np.eye(4)
            R[:2, :2] = np.array([[0, -1], [1, 0]])
            # Note the extrinsics is cam2world
            # world2cam_adjusted = R @ world2cam
            # => cam2world_adjusted = cam2world @ inv(R)
            extrinsics = extrinsics @ np.linalg.inv(R)
            R = extrinsics[:3, :3]
            T = extrinsics[:3, 3]

        if homogeneous:
            return extrinsics
        else:
            return {'R': R, 'T': T}

    def get_iphone_color_resolution(self, iphone_id=0, vertical=True):
        """Get color image resolution of an iPhone RGB camera by iPhone id.

        Args:
            iphone_id (int, optional):
                ID of an iPhone, starts from 0.
                Defaults to 0.
            vertical (bool, optional):
                iPhone assumes landscape orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            ndarray:get_iphone_keypoints2d
                An ndarray of (width, height), shape=[2, ].
        """
        if iphone_id != 0:
            raise KeyError('Currently only one iPhone.')
        if vertical:
            W_horizontal, H_horizontal = self.iphone_color_resolution
            W_vertical, H_vertical = H_horizontal, W_horizontal
            return np.array([W_vertical, H_vertical])
        else:
            return self.iphone_color_resolution

    def get_kinect_color(self, kinect_id, frame_id=None, disable_tqdm=True):
        """Get several frames captured by a kinect RGB camera.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            disable_tqdm (bool, optional):
                Whether to disable the entire progressbar wrapper.
                Defaults to True.

        Raises:
            TypeError:
                frame_id is neither an int, a list nor None.

        Returns:
            ndarray:
                An ndarray in shape [frame_number, height, width, channels].
        """
        if frame_id is None:
            selected_ids = range(self.get_kinect_num_frames())
        elif isinstance(frame_id, list):
            selected_ids = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_kinect_num_frames(),\
                'Index out of range...'
            selected_ids = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        # every stored frame is an encoded image that is decoded to RGB
        decoded_frames = [
            self.__read_color_from_bytes__(
                self.smc['Kinect'][str(kinect_id)]['Color'][str(idx)][()])
            for idx in tqdm.tqdm(selected_ids, disable=disable_tqdm)
        ]
        return np.stack(decoded_frames, axis=0)

    def get_kinect_rgbd(self,
                        kinect_id,
                        frame_id,
                        mode='color2depth',
                        threshold=0):
        if mode == 'color2depth':
            mapped_color = \
                self.__map_color_to_depth__(
                    kinect_id, frame_id, threshold=threshold
                )
            depth = self.get_kinect_depth(kinect_id, frame_id)[0]
            return mapped_color, depth
        else:
            print('Model {} is not supported...'.format(mode))

    def get_kinect_depth(self, kinect_id, frame_id=None, disable_tqdm=True):
        """Get several frames captured by a kinect depth camera.

        Args:
            kinect_id (int):
                ID of a kinect, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            disable_tqdm (bool, optional):
                Whether to disable the entire progressbar wrapper.
                Defaults to True.

        Returns:
            ndarray:
                An ndarray in shape [frame_number, height, width, channels].
        """
        frames = []
        frame_list = []
        if frame_id is None or type(frame_id) == list:
            frame_list = range(self.get_kinect_num_frames())
            if frame_id:
                frame_list = frame_id
        else:
            assert frame_id < self.get_kinect_num_frames(),\
                'Index out of range...'
            frame_list.append(frame_id)
        for i in tqdm.tqdm(frame_list, disable=disable_tqdm):
            frames.append(
                self.smc['Kinect'][str(kinect_id)]['Depth'][str(i)][()])
        return np.stack(frames, axis=0)

    def __read_color_from_bytes__(self, color_array):
        """Decode an encoded image buffer and convert it from BGR to RGB."""
        bgr_image = cv2.imdecode(color_array, cv2.IMREAD_COLOR)
        return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    def get_num_kinect(self):
        """Get the number of Kinect devices.

        Returns:
            int:
                Number of Kinect devices.
        """
        return self.num_kinects

    def get_kinect_num_frames(self):
        """Get the number of frames recorded by one Kinect RGB camera.

        Returns:
            int:
                Number of frames.
        """
        return self.kinect_num_frames

    def get_iphone_num_frames(self):
        """Get the number of frames recorded by one iPhone RGB camera.

        Returns:
            int:
                Number of frames.
        """
        return self.iphone_num_frames

    def get_depth_mask(self, device_id, frame_id):
        return self.smc['Kinect'][str(device_id)]['Mask'][str(frame_id)][()]

    def get_kinect_mask(self, device_id, frame_id):
        kinect_dict = self.smc['Kinect'][str(device_id)]
        return kinect_dict['Mask_k4abt'][str(frame_id)][()]

    def get_num_iphone(self):
        """Get the number of iPhone devices.

        Returns:
            int:
                Number of iPhone devices.
        """
        return self.num_iphones

    def get_iphone_color(self,
                         iphone_id=0,
                         frame_id=None,
                         disable_tqdm=True,
                         vertical=True):
        """Get several frames captured by an iPhone RGB camera.

        Args:
            iphone_id (int):
                ID of an iPhone, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            disable_tqdm (bool, optional):
                Whether to disable the entire progressbar wrapper.
                Defaults to True.
            vertical (bool, optional):
                iPhone assumes horizontal orientation
                if True, every frame is rotated 90 degrees clockwise
                Defaults to True.

        Raises:
            TypeError:
                frame_id is neither an int, a list nor None.

        Returns:
            frames:
                An ndarray in shape [frame_number, height, width, channels].
        """
        if frame_id is None:
            selected_ids = range(self.get_iphone_num_frames())
        elif isinstance(frame_id, list):
            selected_ids = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_iphone_num_frames(),\
                'Index out of range...'
            selected_ids = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        collected = []
        for idx in tqdm.tqdm(selected_ids, disable=disable_tqdm):
            encoded = self.smc['iPhone'][str(iphone_id)]['Color'][str(idx)][()]
            image = self.__read_color_from_bytes__(encoded)
            collected.append(
                cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE
                           ) if vertical else image)
        return np.stack(collected, axis=0)

    def get_iphone_depth(self,
                         iphone_id=0,
                         frame_id=None,
                         disable_tqdm=True,
                         vertical=True):
        """Get several depth frames stored for an iPhone.

        Args:
            iphone_id (int):
                ID of an iPhone, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            disable_tqdm (bool, optional):
                Whether to disable the entire progressbar wrapper.
                Defaults to True.
            vertical (bool, optional):
                iPhone assumes horizontal orientation
                if True, every frame is rotated 90 degrees clockwise
                Defaults to True.

        Raises:
            TypeError:
                frame_id is neither an int, a list nor None.

        Returns:
            frames:
                The selected depth frames stacked along a new first axis.
        """
        if frame_id is None:
            selected_ids = range(self.get_iphone_num_frames())
        elif isinstance(frame_id, list):
            selected_ids = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_iphone_num_frames(),\
                'Index out of range...'
            selected_ids = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        collected = []
        for idx in tqdm.tqdm(selected_ids, disable=disable_tqdm):
            depth = self.smc['iPhone'][str(iphone_id)]['Depth'][str(idx)][()]
            collected.append(
                cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE
                           ) if vertical else depth)
        return np.stack(collected, axis=0)

    def get_kinect_transformation_depth_to_color(self, device_id):
        """Get transformation matrix from depth to color from a single kinect.

        Args:
            kinect_id (int, optional):
                ID of a Kinect, starts from 0.

        Returns:
            ndarray: A 4x4 transformation matrix.
        """
        return np.linalg.inv(self.get_kinect_color_extrinsics(
            device_id)) @ self.get_kinect_depth_extrinsics(device_id)

    def get_kinect_transformation_color_to_depth(self, device_id):
        """Get transformation matrix from color to depth from a single kinect.

        Args:
            kinect_id (int, optional):
                ID of a Kinect, starts from 0.

        Returns:
            ndarray: A 4x4 transformation matrix.
        """
        return np.linalg.inv(self.get_kinect_depth_extrinsics(
            device_id)) @ self.get_kinect_color_extrinsics(device_id)

    def __map_color_to_depth__(self, device_id, frame_id, threshold=100):
        """Map a kinect color image onto the pixel grid of its depth image.

        Depth pixels whose mask value is >= ``threshold`` are back-projected
        with the depth intrinsics and their depth value, moved into the
        color camera frame with the depth-to-color transformation and
        projected with the color intrinsics. The color found at the
        (rounded, clipped) projected position is copied to the depth pixel.
        All other output pixels stay white (255).

        Args:
            device_id (int):
                ID of a kinect, starts from 0.
            frame_id (int):
                Frame to process; only the first frame returned by the
                color/depth getters is used.
            threshold (int, optional):
                Minimum value in the mask from get_depth_mask for a depth
                pixel to be mapped. Defaults to 100.

        Returns:
            ndarray:
                uint8 image of shape (depth_height, depth_width, 3).
                Special case: when ``threshold == 1`` the clipped integer
                (u, v) color-image coordinates of the selected pixels are
                returned instead of the image.
        """
        color_image = self.get_kinect_color(device_id, frame_id)[0]
        depth_image = self.get_kinect_depth(device_id, frame_id)[0]
        color_intrinsic = self.get_kinect_color_intrinsics(device_id)
        depth_intrinsic = self.get_kinect_depth_intrinsics(device_id)

        # assumes the mask has the same 2-D shape as the depth image, since
        # it is used below to index depth-image-shaped arrays - TODO confirm
        mask = self.get_depth_mask(device_id, frame_id)

        Td2c = self.get_kinect_transformation_depth_to_color(device_id)

        # pixel coordinates (column = u, row = v) of every selected pixel
        colidx = np.arange(depth_image.shape[1])
        rowidx = np.arange(depth_image.shape[0])
        colidx_map, rowidx_map = np.meshgrid(colidx, rowidx)
        col_indices = colidx_map[mask >= threshold]
        row_indices = rowidx_map[mask >= threshold]

        # homogeneous pixel coordinates (u, v, 1), one row per pixel
        homo_padding = \
            np.ones((col_indices.shape[0], 1), dtype=np.float32)
        homo_indices = \
            np.concatenate(
                (col_indices[..., None], row_indices[..., None], homo_padding),
                axis=1
            )

        # back-project: inv(K_depth) @ (u, v, 1), broadcast over all pixels
        depth_intrinsic_inv = np.linalg.inv(depth_intrinsic)
        normalized_points = \
            depth_intrinsic_inv[None, ...] @ homo_indices[..., None]

        # scale each ray by its depth value; the division by 1000 is
        # presumably a millimetre -> metre conversion - TODO confirm
        # NOTE(review): squeeze() removes every length-1 axis, so with
        # exactly one selected pixel the pixel axis is dropped as well and
        # the column indexing further down looks like it would fail - verify
        z_values = (depth_image / 1000)[mask >= threshold]
        valid_points = \
            normalized_points.squeeze() * z_values[..., None]

        # move the 3D points from the depth camera frame to the color frame
        R = Td2c[:3, :3]
        T = Td2c[:3, 3]
        valid_points = \
            R[None, ...] @ valid_points[..., None] + T[None, ..., None]
        # project with the color intrinsics and divide by z; `@` and `/`
        # have equal precedence and associate left-to-right, so this is
        # (K_color @ p) / z
        valid_uvs = \
            color_intrinsic[None, ...] @\
            valid_points / valid_points[:, 2][..., None]
        # round to the nearest pixel and clamp into the color image
        valid_uvs = np.int32(valid_uvs.squeeze()[..., :2] + 0.5)
        valid_uvs[:, 0] = np.clip(valid_uvs[:, 0], 0, color_image.shape[1] - 1)
        valid_uvs[:, 1] = np.clip(valid_uvs[:, 1], 0, color_image.shape[0] - 1)
        # start from an all-white image and fill in the selected pixels
        mapped_color = np.ones((depth_image.shape[0], depth_image.shape[1], 3),
                               dtype=np.uint8) * 255
        mapped_color[mask >= threshold] = \
            color_image[valid_uvs[:, 1], valid_uvs[:, 0]]

        # threshold == 1 switches the return value to the pixel coordinates
        if threshold == 1:
            return valid_uvs
        return mapped_color

    def get_kinect_skeleton_3d(self, device_id, frame_id):
        """Get the 3D skeleton key points from a certain kinect.

        Args:
            device_id (int):
                ID of a kinect, starts from 0.

        Returns:
            list:
                A list with 3D keypoints
        """
        kinect_dict = self.smc['Kinect'][str(device_id)]
        return json.loads(kinect_dict['Skeleton_k4abt'][str(frame_id)][()])

    def get_depth_floor(self, device_id: int) -> dict:
        """Get the floor plane defined by a normal vector and a center point
        from a certain kinect.

        Args:
            device_id (int):
                ID of a kinect, starts from 0.

        Raises:
            KeyError:
                Key 'floor' not in ID of a kinect.

        Returns:
            dict:
                A dict with 'center', 'normal' and 'pnum'.
        """
        device_dict = self.calibration_dict[str(device_id * 2 + 1)]
        if 'floor' in device_dict:
            return device_dict['floor']
        else:
            raise KeyError(f'Kinect {device_id} has no floor data.')

    def get_keypoints2d(self, device, device_id, frame_id=None, vertical=True):
        """Get keypoints2d projected from keypoints3d.

        Args:
            device (str):
                Device name, should be Kinect or iPhone.
            device_id (int):
                ID of a device, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            vertical (bool, optional):
                Only applicable to iPhone as device
                iPhone assumes horizontal orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                keypoints2d (N, J, 3) and its mask (J, )
        """
        assert device in {
            'Kinect', 'iPhone'
        }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
        assert device_id >= 0

        kps2d_dict = self.smc['Keypoints2D'][device][str(device_id)]
        keypoints2d = kps2d_dict['keypoints2d'][...]
        keypoints2d_mask = kps2d_dict['keypoints2d_mask'][...]

        if frame_id is None:
            frame_list = range(self.get_keypoints_num_frames())
        elif isinstance(frame_id, list):
            frame_list = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_keypoints_num_frames(),\
                'Index out of range...'
            frame_list = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        keypoints2d = keypoints2d[frame_list, ...]

        if device == 'iPhone' and vertical:
            # rotate keypoints 2D clockwise by 90 degrees
            W, H = self.get_iphone_color_resolution(vertical=False)
            xs, ys, conf = \
                keypoints2d[..., 0], keypoints2d[..., 1], keypoints2d[..., 2]
            xs, ys = H - ys, xs  # horizontal -> vertical
            keypoints2d[..., 0], keypoints2d[..., 1] = xs.copy(), ys.copy()
            keypoints2d[conf == 0.0] = 0.0

        return keypoints2d, keypoints2d_mask

    def get_kinect_keypoints2d(self, device_id, frame_id=None):
        """Get Kinect 2D keypoints.

        Args:
            device_id (int):
                ID of Kinect, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                keypoints2d (N, J, 3) and its mask (J, )
        """
        assert self.num_kinects > device_id >= 0
        return self.get_keypoints2d('Kinect', device_id, frame_id)

    def get_iphone_keypoints2d(self,
                               device_id=0,
                               frame_id=None,
                               vertical=True):
        """Get iPhone 2D keypoints.

        Args:
            device_id (int):
                ID of iPhone, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            vertical (bool, optional):
                iPhone assumes horizontal orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                keypoints2d (N, J, 3) and its mask (J, )
        """
        assert device_id >= 0
        return self.get_keypoints2d('iPhone',
                                    device_id,
                                    frame_id,
                                    vertical=vertical)

    def get_color(self,
                  device,
                  device_id,
                  frame_id=None,
                  disable_tqdm=True,
                  vertical=True):
        """Get RGB image(s) from Kinect RGB or iPhone RGB camera.

        Args:
            device (str):
                Device name, should be Kinect or iPhone.
            device_id (int):
                Device ID, starts from 0.
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            disable_tqdm (bool, optional):
                Whether to disable the entire progressbar wrapper.
                Defaults to True.
            vertical (bool, optional):
                Only applicable to iPhone as device
                iPhone assumes horizontal orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            img (ndarray):
                An ndarray in shape [frame_number, height, width, channels].
        """

        assert device in {
            'Kinect', 'iPhone'
        }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'

        if device == 'Kinect':
            img = self.get_kinect_color(device_id, frame_id, disable_tqdm)
        else:
            img = self.get_iphone_color(device_id,
                                        frame_id,
                                        disable_tqdm,
                                        vertical=vertical)

        return img

    def get_keypoints_num_frames(self):
        return self.keypoints_num_frames

    def get_keypoints_convention(self):
        return self.keypoints_convention

    def get_keypoints_created_time(self):
        return self.keypoints_created_time

    def get_keypoints3d(self,
                        device=None,
                        device_id=None,
                        frame_id=None,
                        vertical=True):
        """Get keypoints3d (world coordinate) computed by mocap processing
        pipeline.

        Args:
            device (str):
                Device name, should be Kinect or iPhone.
                None: world coordinate
                Defaults to None.
            device_id (int):
                ID of a device, starts from 0.
                None: world coordinate
                Defaults to None
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            vertical (bool, optional):
                Only applicable to iPhone as device
                iPhone assumes horizontal orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                keypoints3d (N, J, 4) and its mask (J, )
        """
        assert (device is None and device_id is None) or \
            (device is not None and device_id is not None), \
            'device and device_id should be both None or both not None.'
        if device is not None:
            assert device in {
                'Kinect', 'iPhone'
            }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
        if device_id is not None:
            assert device_id >= 0

        if frame_id is None:
            frame_list = range(self.get_keypoints_num_frames())
        elif isinstance(frame_id, list):
            frame_list = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_keypoints_num_frames(),\
                'Index out of range...'
            frame_list = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        kps3d_dict = self.smc['Keypoints3D']

        # keypoints3d are in world coordinate system
        keypoints3d_world = kps3d_dict['keypoints3d'][...]
        keypoints3d_world = keypoints3d_world[frame_list, ...]
        keypoints3d_mask = kps3d_dict['keypoints3d_mask'][...]

        # return keypoints3d in world coordinate system
        if device is None:
            return keypoints3d_world, keypoints3d_mask

        # return keypoints3d in device coordinate system
        else:
            if device == 'Kinect':
                cam2world = self.get_kinect_color_extrinsics(
                    kinect_id=device_id, homogeneous=True)
            else:
                cam2world = self.get_iphone_extrinsics(iphone_id=device_id,
                                                       vertical=vertical)

            xyz, conf = keypoints3d_world[..., :3], keypoints3d_world[..., [3]]
            xyz_homogeneous = np.ones([*xyz.shape[:-1], 4])
            xyz_homogeneous[..., :3] = xyz
            world2cam = np.linalg.inv(cam2world)
            keypoints3d = np.einsum('ij,kmj->kmi', world2cam, xyz_homogeneous)
            keypoints3d = np.concatenate([keypoints3d[..., :3], conf], axis=-1)

            return keypoints3d, keypoints3d_mask

    def get_smpl_num_frames(self):
        return self.smpl_num_frames

    def get_smpl_created_time(self):
        return self.smpl_created_time

    def get_smpl(self,
                 device=None,
                 device_id=None,
                 frame_id=None,
                 vertical=True):
        """Get SMPL (world coordinate) computed by mocap processing pipeline.

        Args:
            device (str):
                Device name, should be Kinect or iPhone.
                None: world coordinate
                Defaults to None.
            device_id (int):
                ID of a device, starts from 0.
                None: world coordinate
                Defaults to None
            frame_id (int, list or None, optional):
                int: frame id of one selected frame
                list: a list of frame id
                None: all frames will be returned
                Defaults to None.
            vertical (bool, optional):
                Only applicable to iPhone as device
                iPhone assumes horizontal orientation
                if True, convert data to vertical orientation
                Defaults to True.

        Returns:
            dict:
                'global_orient': np.ndarray of shape (N, 3)
                'body_pose': np.ndarray of shape (N, 69)
                'transl': np.ndarray of shape (N, 3)
                'betas': np.ndarray of shape (N, 10)
        """
        smpl_dict = self.smc['SMPL']
        global_orient = smpl_dict['global_orient'][...]
        body_pose = smpl_dict['body_pose'][...]
        transl = smpl_dict['transl'][...]
        betas = smpl_dict['betas'][...]

        if frame_id is None:
            frame_list = range(self.get_smpl_num_frames())
        elif isinstance(frame_id, list):
            frame_list = frame_id
        elif isinstance(frame_id, int):
            assert frame_id < self.get_keypoints_num_frames(),\
                'Index out of range...'
            frame_list = [frame_id]
        else:
            raise TypeError('frame_id should be int, list or None.')

        body_pose = body_pose[frame_list, ...]
        global_orient = global_orient[frame_list, ...]
        transl = transl[frame_list, ...]

        # return SMPL parameters in world coordinate system
        if device is None:
            smpl_dict = dict(global_orient=global_orient,
                             body_pose=body_pose,
                             transl=transl,
                             betas=betas)

            return smpl_dict

        # return SMPL parameters in device coordinate system
        else:

            if self.body_model is None:
                self.body_model = \
                    build_body_model(self.default_body_model_config)
            torch_device = self.body_model.global_orient.device

            assert device in {
                'Kinect', 'iPhone'
            }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
            assert device_id >= 0

            if device == 'Kinect':
                T_cam2world = self.get_kinect_color_extrinsics(
                    kinect_id=device_id, homogeneous=True)
            else:
                T_cam2world = self.get_iphone_extrinsics(iphone_id=device_id,
                                                         vertical=vertical)

            T_world2cam = np.linalg.inv(T_cam2world)

            output = self.body_model(
                global_orient=torch.tensor(global_orient, device=torch_device),
                body_pose=torch.tensor(body_pose, device=torch_device),
                transl=torch.tensor(transl, device=torch_device),
                betas=torch.tensor(betas, device=torch_device))
            joints = output['joints'].detach().cpu().numpy()
            pelvis = joints[:, 0, :]

            new_global_orient, new_transl = batch_transform_to_camera_frame(
                global_orient=global_orient,
                transl=transl,
                pelvis=pelvis,
                extrinsic=T_world2cam)

            smpl_dict = dict(global_orient=new_global_orient,
                             body_pose=body_pose,
                             transl=new_transl,
                             betas=betas)

            return smpl_dict