import os
from tqdm import tqdm
import numpy as np
import pickle as pkl
import utils.rotation_conversions as geometry
import torch

from .dataset import Dataset
# from torch.utils.data import Dataset

# Joint indices into VIBE's 3D joints used for evaluation; this 18-joint subset
# appears to match the skeleton used by Action2Motion.
action2motion_joints = [8, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 21, 24, 38]


def get_z(cam_s, cam_pos, joints, img_size, flength):
    """
    Solve for the depth offset at which a perspective camera approximates
    the orthographic (weak-perspective) projection of the model.
    """
    # Project the joints with the weak-perspective camera into pixel coordinates.
    joints_orth_target = (cam_s * (joints[:, :2] + cam_pos) + 1) * 0.5 * img_size
    # Diagonal extent of the joints in the x-y plane, before and after projection.
    height3d = np.linalg.norm(np.max(joints[:, :2], axis=0) - np.min(joints[:, :2], axis=0))
    height2d = np.linalg.norm(np.max(joints_orth_target, axis=0) - np.min(joints_orth_target, axis=0))
    # Similar triangles: depth = focal length * (world extent / pixel extent).
    tz = flength * (height3d / height2d)
    return float(tz)
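
# A minimal usage sketch (hypothetical values): given the weak-perspective scale
# sx and translation (tx, ty) of an [sx, sy, tx, ty] camera, plus one frame of
# 3D joints with shape (J, 3), get_z returns the depth at which a perspective
# camera of focal length `flength` reproduces the orthographic projection.
#
#     joints = np.zeros((24, 3))
#     joints[:, :2] = np.random.randn(24, 2) * 0.3  # spread in the x-y plane
#     tz = get_z(cam_s=1.1, cam_pos=np.array([0.02, -0.05]),
#                joints=joints, img_size=540, flength=500.0)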


def get_trans_from_vibe(vibe, index, use_z=True):
    alltrans = []
    for t in range(vibe["joints3d"][index].shape[0]):
        # The crop camera has already been converted to the original image frame
        # by `convert_crop_cam_to_orig_img` from VIBE's demo utils, so no further
        # conversion is needed here. Its format is [sx, sy, tx, ty].
        cam_orig = vibe["orig_cam"][index][t]
        x = cam_orig[2]
        y = cam_orig[3]
        if use_z:
            z = get_z(cam_s=cam_orig[0],  # TODO: There are two scales instead of 1.
                      cam_pos=cam_orig[2:4],
                      joints=vibe['joints3d'][index][t],
                      img_size=540,
                      flength=500)
            # z = 500 / (0.5 * 480 * cam_orig[0])
        else:
            z = 0
        trans = [x, y, z]
        alltrans.append(trans)
    alltrans = np.array(alltrans)
    return alltrans - alltrans[0]
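
# A minimal usage sketch (assuming the VIBE cache layout loaded below, i.e. a
# dict with per-video "joints3d" and "orig_cam" entries):
#
#     trans = get_trans_from_vibe(vibe_data, index=0, use_z=True)
#     # trans has shape (T, 3) and is zeroed at the first frame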


class UESTC(Dataset):
    dataname = "uestc"

    def __init__(self, datapath="dataset/uestc", method_name="vibe", view="all", **kwargs):

        self.datapath = datapath
        self.method_name = method_name
        self.view = view
        super().__init__(**kwargs)

        # Load pre-computed #frames data
        with open(os.path.join(datapath, 'info', 'num_frames_min.txt'), 'r') as f:
            num_frames_video = np.asarray([int(s) for s in f.read().splitlines()])

        # Out of 118 subjects -> 51 training, 67 in test
        all_subjects = np.arange(1, 119)
        self._tr_subjects = [
            1, 2, 6, 12, 13, 16, 21, 24, 28, 29, 30, 31, 33, 35, 39, 41, 42, 45, 47, 50,
            52, 54, 55, 57, 59, 61, 63, 64, 67, 69, 70, 71, 73, 77, 81, 84, 86, 87, 88,
            90, 91, 93, 96, 99, 102, 103, 104, 107, 108, 112, 113]
        self._test_subjects = [s for s in all_subjects if s not in self._tr_subjects]

        # Load names of 25600 videos
        with open(os.path.join(datapath, 'info', 'names.txt'), 'r') as f:
            videos = f.read().splitlines()

        self._videos = videos

        if self.method_name == "vibe":
            vibe_data_path = os.path.join(datapath, "vibe_cache_refined.pkl")
            with open(vibe_data_path, "rb") as f:
                vibe_data = pkl.load(f)

            self._pose = vibe_data["pose"]
            num_frames_method = [p.shape[0] for p in self._pose]
            globpath = os.path.join(datapath, "globtrans_usez.pkl")

            if os.path.exists(globpath):
                with open(globpath, "rb") as f:
                    self._globtrans = pkl.load(f)
            else:
                self._globtrans = []
                for index in tqdm(range(len(self._pose))):
                    self._globtrans.append(get_trans_from_vibe(vibe_data, index, use_z=True))
                # Cache under globpath (the path checked above) so it is found on the next run.
                with open(globpath, "wb") as f:
                    pkl.dump(self._globtrans, f)
            self._joints = vibe_data["joints3d"]
            self._jointsIx = action2motion_joints
        else:
            raise ValueError("This method name is not recognized.")

        num_frames_video = np.minimum(num_frames_video, num_frames_method).astype(int)
        self._num_frames_in_video = list(num_frames_video)

        self._actions = [self.parse_action(video) for video in videos]

        total_num_actions = 40
        self.num_actions = total_num_actions
        keep_actions = np.arange(0, total_num_actions)

        self._action_to_label = {x: i for i, x in enumerate(keep_actions)}
        self._label_to_action = {i: x for i, x in enumerate(keep_actions)}
        self.num_classes = len(keep_actions)

        self._train = []
        self._test = []

        self.info_actions = []

        def get_rotation(view):
            # Each view index is a 45-degree step around the vertical (Y) axis;
            # rotating by -view * pi/4 maps the sequence back to the front view.
            theta = -view * np.pi / 4
            axis = torch.tensor([0, 1, 0], dtype=torch.float)
            axisangle = theta * axis
            matrix = geometry.axis_angle_to_matrix(axisangle)
            return matrix

        # View 0 yields the identity rotation.
        rotations = {key: get_rotation(key) for key in [0, 1, 2, 3, 4, 5, 6, 7]}
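
        # A minimal sketch of how these rotations are applied in the loop below
        # (hypothetical shapes): rotations[view] is (3, 3) and broadcasts over a
        # stack of per-frame rotation matrices of shape (T, 3, 3).
        #
        #     root = torch.zeros(10, 3)                   # hypothetical axis-angle poses
        #     mats = geometry.axis_angle_to_matrix(root)  # (10, 3, 3)
        #     front = geometry.matrix_to_axis_angle(rotations[3] @ mats)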

        for index, video in enumerate(tqdm(videos, desc='Preparing UESTC data..')):
            act, view, subject, side = self._get_action_view_subject_side(video)
            self.info_actions.append({"action": act,
                                      "view": view,
                                      "subject": subject,
                                      "side": side})
            if self.view == "frontview":
                if side != 1:
                    continue
            # Rotate side-2 sequences to the front view.
            if side != 1:
                # Skip view 8 for side 2: no precomputed rotation exists for it.
                if view == 8:
                    continue
                rotation = rotations[view]
                global_matrix = geometry.axis_angle_to_matrix(torch.from_numpy(self._pose[index][:, :3]))
                # rotate the global pose
                self._pose[index][:, :3] = geometry.matrix_to_axis_angle(rotation @ global_matrix).numpy()
                # rotate the joints
                self._joints[index] = self._joints[index] @ rotation.T.numpy()
                self._globtrans[index] = (self._globtrans[index] @ rotation.T.numpy())

            # add the global translation to the joints
            self._joints[index] = self._joints[index] + self._globtrans[index][:, None]

            if subject in self._tr_subjects:
                self._train.append(index)
            elif subject in self._test_subjects:
                self._test.append(index)
            else:
                raise ValueError("This subject doesn't belong to any set.")

            # if index > 200:
            #     break

        # Select only sequences which have a minimum number of frames
        if self.num_frames > 0:
            # e.g. with num_frames=60, train sequences with fewer than 45
            # extracted frames are dropped below
            threshold = self.num_frames * 3 / 4
        else:
            threshold = 0

        method_extracted_ix = np.where(num_frames_video >= threshold)[0].tolist()
        self._train = list(set(self._train) & set(method_extracted_ix))
        # keep the test set without modification
        self._test = list(set(self._test))

        action_classes_file = os.path.join(datapath, "info/action_classes.txt")
        with open(action_classes_file, 'r') as f:
            self._action_classes = np.array(f.read().splitlines())


    def _load_joints3D(self, ind, frame_ix):
        if len(self._joints[ind]) == 0:
            raise ValueError(
                f"Cannot load index {ind} in _load_joints3D function.")
        # Joints are stored as (T, J, 3); keep only the evaluation subset
        # (action2motion_joints) when _jointsIx is set.
        if self._jointsIx is not None:
            joints3D = self._joints[ind][frame_ix][:, self._jointsIx]
        else:
            joints3D = self._joints[ind][frame_ix]

        return joints3D

    def _load_rotvec(self, ind, frame_ix):
        # 72-dim SMPL axis-angle pose, reshaped to one 3D rotation vector for
        # each of the 24 SMPL joints: (F, 24, 3).
        pose = self._pose[ind][frame_ix, :].reshape(-1, 24, 3)
        return pose

    def _get_action_view_subject_side(self, videopath):
        # TODO: Can be moved to tools.py
        spl = videopath.split('_')
        action = int(spl[0][1:])
        view = int(spl[1][1:])
        subject = int(spl[2][1:])
        side = int(spl[3][1:])
        return action, view, subject, side
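
    # Example (hypothetical name): "a12_d3_p045_c1_color.avi" parses to
    # action=12, view=3, subject=45, side=1; _get_videopath below is the inverse.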

    def _get_videopath(self, action, view, subject, side):
        # Unused function
        return 'a{:d}_d{:d}_p{:03d}_c{:d}_color.avi'.format(
            action, view, subject, side)

    def parse_action(self, path, return_int=True):
        # Override parent method
        info, _, _, _ = self._get_action_view_subject_side(path)
        if return_int:
            return int(info)
        else:
            return info


if __name__ == "__main__":
    dataset = UESTC()
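    # A minimal inspection sketch (assumes the files referenced above exist
    # under dataset/uestc):
    print(f"{len(dataset._train)} train / {len(dataset._test)} test sequences, "
          f"{dataset.num_classes} action classes")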