import os
from tqdm import tqdm
import numpy as np
import pickle as pkl
import utils.rotation_conversions as geometry
import torch

from .dataset import Dataset
# from torch.utils.data import Dataset

# Joint indices into VIBE's 3D joints used for evaluation; this 18-joint subset
# appears to match the skeleton used by Action2Motion.
action2motion_joints = [8, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 21, 24, 38]


def get_z(cam_s, cam_pos, joints, img_size, flength):
    """
    Solve for the depth offset at which a perspective camera approximates
    the orthographic (weak-perspective) projection of the model.
    """
    # Project the joints with the weak-perspective camera into pixel coordinates.
    joints_orth_target = (cam_s * (joints[:, :2] + cam_pos) + 1) * 0.5 * img_size
    # Diagonal extent of the joints in the x-y plane, before and after projection.
    height3d = np.linalg.norm(np.max(joints[:, :2], axis=0) - np.min(joints[:, :2], axis=0))
    height2d = np.linalg.norm(np.max(joints_orth_target, axis=0) - np.min(joints_orth_target, axis=0))
    # Similar triangles: depth = focal length * (world extent / pixel extent).
    tz = flength * (height3d / height2d)
    return float(tz)
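
# A minimal usage sketch (hypothetical values): given the weak-perspective scale
# sx and translation (tx, ty) of an [sx, sy, tx, ty] camera, plus one frame of
# 3D joints with shape (J, 3), get_z returns the depth at which a perspective
# camera of focal length `flength` reproduces the orthographic projection.
#
#     joints = np.zeros((24, 3))
#     joints[:, :2] = np.random.randn(24, 2) * 0.3  # spread in the x-y plane
#     tz = get_z(cam_s=1.1, cam_pos=np.array([0.02, -0.05]),
#                joints=joints, img_size=540, flength=500.0)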


def get_trans_from_vibe(vibe, index, use_z=True):
    alltrans = []
    for t in range(vibe["joints3d"][index].shape[0]):
        # The crop camera has already been converted to the original image frame
        # by `convert_crop_cam_to_orig_img` from VIBE's demo utils, so no further
        # conversion is needed here. Its format is [sx, sy, tx, ty].
        cam_orig = vibe["orig_cam"][index][t]
        x = cam_orig[2]
        y = cam_orig[3]
        if use_z:
            z = get_z(cam_s=cam_orig[0],  # TODO: There are two scales instead of 1.
                      cam_pos=cam_orig[2:4],
                      joints=vibe['joints3d'][index][t],
                      img_size=540,
                      flength=500)
            # z = 500 / (0.5 * 480 * cam_orig[0])
        else:
            z = 0
        trans = [x, y, z]
        alltrans.append(trans)
    alltrans = np.array(alltrans)
    return alltrans - alltrans[0]
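
# A minimal usage sketch (assuming the VIBE cache layout loaded below, i.e. a
# dict with per-video "joints3d" and "orig_cam" entries):
#
#     trans = get_trans_from_vibe(vibe_data, index=0, use_z=True)
#     # trans has shape (T, 3) and is zeroed at the first frame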


class UESTC(Dataset):
    dataname = "uestc"

    def __init__(self, datapath="dataset/uestc", method_name="vibe", view="all", **kwargs):

        self.datapath = datapath
        self.method_name = method_name
        self.view = view
        super().__init__(**kwargs)

        # Load pre-computed #frames data
        with open(os.path.join(datapath, 'info', 'num_frames_min.txt'), 'r') as f:
            num_frames_video = np.asarray([int(s) for s in f.read().splitlines()])

        # Out of 118 subjects -> 51 training, 67 in test
        all_subjects = np.arange(1, 119)
        self._tr_subjects = [
            1, 2, 6, 12, 13, 16, 21, 24, 28, 29, 30, 31, 33, 35, 39, 41, 42, 45, 47, 50,
            52, 54, 55, 57, 59, 61, 63, 64, 67, 69, 70, 71, 73, 77, 81, 84, 86, 87, 88,
            90, 91, 93, 96, 99, 102, 103, 104, 107, 108, 112, 113]
        self._test_subjects = [s for s in all_subjects if s not in self._tr_subjects]

        # Load names of 25600 videos
        with open(os.path.join(datapath, 'info', 'names.txt'), 'r') as f:
            videos = f.read().splitlines()

        self._videos = videos

        if self.method_name == "vibe":
            vibe_data_path = os.path.join(datapath, "vibe_cache_refined.pkl")
            with open(vibe_data_path, "rb") as f:
                vibe_data = pkl.load(f)

            self._pose = vibe_data["pose"]
            num_frames_method = [p.shape[0] for p in self._pose]
            globpath = os.path.join(datapath, "globtrans_usez.pkl")

            if os.path.exists(globpath):
                with open(globpath, "rb") as f:
                    self._globtrans = pkl.load(f)
            else:
                self._globtrans = []
                for index in tqdm(range(len(self._pose))):
                    self._globtrans.append(get_trans_from_vibe(vibe_data, index, use_z=True))
                # Cache under globpath (the path checked above) so it is found on the next run.
                with open(globpath, "wb") as f:
                    pkl.dump(self._globtrans, f)
            self._joints = vibe_data["joints3d"]
            self._jointsIx = action2motion_joints
        else:
            raise ValueError("This method name is not recognized.")

        num_frames_video = np.minimum(num_frames_video, num_frames_method).astype(int)
        self._num_frames_in_video = list(num_frames_video)

        self._actions = [self.parse_action(video) for video in videos]

        total_num_actions = 40
        self.num_actions = total_num_actions
        keep_actions = np.arange(0, total_num_actions)

        self._action_to_label = {x: i for i, x in enumerate(keep_actions)}
        self._label_to_action = {i: x for i, x in enumerate(keep_actions)}
        self.num_classes = len(keep_actions)

        self._train = []
        self._test = []

        self.info_actions = []

        def get_rotation(view):
            # Each view index is a 45-degree step around the vertical (Y) axis;
            # rotating by -view * pi/4 maps the sequence back to the front view.
            theta = -view * np.pi / 4
            axis = torch.tensor([0, 1, 0], dtype=torch.float)
            axisangle = theta * axis
            matrix = geometry.axis_angle_to_matrix(axisangle)
            return matrix

        # View 0 yields the identity rotation.
        rotations = {key: get_rotation(key) for key in [0, 1, 2, 3, 4, 5, 6, 7]}
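
        # A minimal sketch of how these rotations are applied in the loop below
        # (hypothetical shapes): rotations[view] is (3, 3) and broadcasts over a
        # stack of per-frame rotation matrices of shape (T, 3, 3).
        #
        #     root = torch.zeros(10, 3)                   # hypothetical axis-angle poses
        #     mats = geometry.axis_angle_to_matrix(root)  # (10, 3, 3)
        #     front = geometry.matrix_to_axis_angle(rotations[3] @ mats)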

        for index, video in enumerate(tqdm(videos, desc='Preparing UESTC data..')):
            act, view, subject, side = self._get_action_view_subject_side(video)
            self.info_actions.append({"action": act,
                                      "view": view,
                                      "subject": subject,
                                      "side": side})
            if self.view == "frontview":
                if side != 1:
                    continue
            # Rotate side-2 sequences to the front view.
            if side != 1:
                # Skip view 8 for side 2: no precomputed rotation exists for it.
                if view == 8:
                    continue
                rotation = rotations[view]
                global_matrix = geometry.axis_angle_to_matrix(torch.from_numpy(self._pose[index][:, :3]))
                # rotate the global pose
                self._pose[index][:, :3] = geometry.matrix_to_axis_angle(rotation @ global_matrix).numpy()
                # rotate the joints
                self._joints[index] = self._joints[index] @ rotation.T.numpy()
                self._globtrans[index] = (self._globtrans[index] @ rotation.T.numpy())

            # add the global translation to the joints
            self._joints[index] = self._joints[index] + self._globtrans[index][:, None]

            if subject in self._tr_subjects:
                self._train.append(index)
            elif subject in self._test_subjects:
                self._test.append(index)
            else:
                raise ValueError("This subject doesn't belong to any set.")

            # if index > 200:
            #     break

        # Select only sequences which have a minimum number of frames
        if self.num_frames > 0:
            # e.g. with num_frames=60, train sequences with fewer than 45
            # extracted frames are dropped below
            threshold = self.num_frames * 3 / 4
        else:
            threshold = 0

        method_extracted_ix = np.where(num_frames_video >= threshold)[0].tolist()
        self._train = list(set(self._train) & set(method_extracted_ix))
        # keep the test set without modification
        self._test = list(set(self._test))

        action_classes_file = os.path.join(datapath, "info/action_classes.txt")
        with open(action_classes_file, 'r') as f:
            self._action_classes = np.array(f.read().splitlines())


    def _load_joints3D(self, ind, frame_ix):
        if len(self._joints[ind]) == 0:
            raise ValueError(
                f"Cannot load index {ind} in _load_joints3D function.")
        # Joints are stored as (T, J, 3); keep only the evaluation subset
        # (action2motion_joints) when _jointsIx is set.
        if self._jointsIx is not None:
            joints3D = self._joints[ind][frame_ix][:, self._jointsIx]
        else:
            joints3D = self._joints[ind][frame_ix]

        return joints3D

    def _load_rotvec(self, ind, frame_ix):
        # 72-dim SMPL axis-angle pose, reshaped to one 3D rotation vector for
        # each of the 24 SMPL joints: (F, 24, 3).
        pose = self._pose[ind][frame_ix, :].reshape(-1, 24, 3)
        return pose

    def _get_action_view_subject_side(self, videopath):
        # TODO: Can be moved to tools.py
        spl = videopath.split('_')
        action = int(spl[0][1:])
        view = int(spl[1][1:])
        subject = int(spl[2][1:])
        side = int(spl[3][1:])
        return action, view, subject, side
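
    # Example (hypothetical name): "a12_d3_p045_c1_color.avi" parses to
    # action=12, view=3, subject=45, side=1; _get_videopath below is the inverse.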

    def _get_videopath(self, action, view, subject, side):
        # Unused function
        return 'a{:d}_d{:d}_p{:03d}_c{:d}_color.avi'.format(
            action, view, subject, side)

    def parse_action(self, path, return_int=True):
        # Override parent method
        info, _, _, _ = self._get_action_view_subject_side(path)
        if return_int:
            return int(info)
        else:
            return info


if __name__ == "__main__":
    dataset = UESTC()
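    # A minimal inspection sketch (assumes the files referenced above exist
    # under dataset/uestc):
    print(f"{len(dataset._train)} train / {len(dataset._test)} test sequences, "
          f"{dataset.num_classes} action classes")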