rlogh committed
Commit 73dfb75 · verified · 1 Parent(s): c45cbe3

Upload 17 files
common/README.MD ADDED
@@ -0,0 +1 @@
common/__pycache__/camera.cpython-311.pyc ADDED
Binary file (5.11 kB)

common/__pycache__/model_poseformer.cpython-311.pyc ADDED
Binary file (17.8 kB)

common/__pycache__/quaternion.cpython-311.pyc ADDED
Binary file (1.69 kB)

common/__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.78 kB)

common/arguments.py ADDED
@@ -0,0 +1,96 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Modified by Qitao Zhao (qitaozhao@mail.sdu.edu.cn)

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Training script')

    # General arguments
    parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME', help='target dataset')  # h36m or humaneva
    parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME', help='2D detections to use')
    parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST',
                        help='training subjects separated by comma')
    parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST', help='test subjects separated by comma')
    parser.add_argument('-sun', '--subjects-unlabeled', default='', type=str, metavar='LIST',
                        help='unlabeled subjects separated by comma for self-supervision')
    parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST',
                        help='actions to train/test on, separated by comma, or * for all')
    parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
                        help='checkpoint directory')
    parser.add_argument('--checkpoint-frequency', default=40, type=int, metavar='N',
                        help='create a checkpoint every N epochs')
    parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME',
                        help='checkpoint to resume (file name)')
    parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME', help='checkpoint to evaluate (file name)')
    parser.add_argument('--render', action='store_true', help='visualize a particular video')
    parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)')
    parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images')
    parser.add_argument('-g', '--gpu', type=list, help='set gpu number')
    parser.add_argument('--local_rank', type=int, default=0, help='node rank for distributed training')
    parser.add_argument('--center-pose', type=int, default=0, help='choose fine-tuning task as 3d pose estimation')

    # Model arguments
    parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training')
    parser.add_argument('-e', '--epochs', default=200, type=int, metavar='N', help='number of training epochs')
    parser.add_argument('-b', '--batch-size', default=1024, type=int, metavar='N', help='batch size in terms of predicted frames')
    parser.add_argument('-drop', '--dropout', default=0., type=float, metavar='P', help='dropout probability')
    parser.add_argument('-lr', '--learning-rate', default=0.0001, type=float, metavar='LR', help='initial learning rate')
    parser.add_argument('-lrd', '--lr-decay', default=0.99, type=float, metavar='LR', help='learning rate decay per epoch')
    parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false',
                        help='disable train-time flipping')
    parser.add_argument('-frame', '--number-of-frames', default='81', type=int, metavar='N',
                        help='how many frames used as input')
    parser.add_argument('-frame-kept', '--number-of-kept-frames', default='27', type=int, metavar='N',
                        help='how many frames are kept')
    parser.add_argument('-coeff-kept', '--number-of-kept-coeffs', type=int, metavar='N', help='how many coefficients are kept')
    parser.add_argument('--depth', default=4, type=int, metavar='N', help='number of transformer blocks')
    parser.add_argument('--embed-dim-ratio', default=32, type=int, metavar='N', help='dimension of embedding ratio')
    parser.add_argument('-std', type=float, default=0.0, help='the standard deviation for gaussian noise')

    # Experimental
    parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction')
    parser.add_argument('--downsample', default=1, type=int, metavar='FACTOR', help='downsample frame rate by factor (semi-supervised)')
    parser.add_argument('--warmup', default=1, type=int, metavar='N', help='warm-up epochs for semi-supervision')
    parser.add_argument('--no-eval', action='store_true', help='disable epoch evaluation while training (small speed-up)')
    parser.add_argument('--dense', action='store_true', help='use dense convolutions instead of dilated convolutions')
    parser.add_argument('--disable-optimizations', action='store_true', help='disable optimized model for single-frame predictions')
    parser.add_argument('--linear-projection', action='store_true', help='use only linear coefficients for semi-supervised projection')
    parser.add_argument('--no-bone-length', action='store_false', dest='bone_length_term',
                        help='disable bone length term in semi-supervised settings')
    parser.add_argument('--no-proj', action='store_true', help='disable projection for semi-supervised setting')

    # Visualization
    parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render')
    parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render')
    parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render')
    parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video')
    parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video')
    parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)')
    parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates')
    parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos')
    parser.add_argument('--viz-no-ground-truth', action='store_true', help='do not show ground-truth poses')
    parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames')
    parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N')
    parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size')

    parser.set_defaults(bone_length_term=True)
    parser.set_defaults(data_augmentation=True)
    parser.set_defaults(test_time_augmentation=True)
    # parser.set_defaults(test_time_augmentation=False)

    args = parser.parse_args()
    # Check invalid configuration
    if args.resume and args.evaluate:
        print('Invalid flags: --resume and --evaluate cannot be set at the same time')
        exit()

    if args.export_training_curves and args.no_eval:
        print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time')
        exit()

    return args
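
A minimal usage sketch (not part of the upload; the flag values are illustrative): parse_args() reads sys.argv directly, so a training script would typically be launched with these flags, e.g.

    import sys
    from common.arguments import parse_args

    # Hypothetical invocation; -frame / -frame-kept / -coeff-kept map to
    # number_of_frames / number_of_kept_frames / number_of_kept_coeffs.
    sys.argv = ['run.py', '-k', 'cpn_ft_h36m_dbb', '-frame', '81', '-frame-kept', '27', '-coeff-kept', '27']
    args = parse_args()
    print(args.number_of_frames, args.number_of_kept_frames, args.number_of_kept_coeffs)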
common/camera.py ADDED
@@ -0,0 +1,90 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np
import torch

from common.utils import wrap
from common.quaternion import qrot, qinverse

def normalize_screen_coordinates(X, w, h):
    assert X.shape[-1] == 2

    # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
    return X/w*2 - [1, h/w]


def image_coordinates(X, w, h):
    assert X.shape[-1] == 2

    # Reverse camera frame normalization
    return (X + [1, h/w])*w/2


def world_to_camera(X, R, t):
    Rt = wrap(qinverse, R)  # Invert rotation
    return wrap(qrot, np.tile(Rt, (*X.shape[:-1], 1)), X - t)  # Rotate and translate


def camera_to_world(X, R, t):
    return wrap(qrot, np.tile(R, (*X.shape[:-1], 1)), X) + t


def project_to_2d(X, camera_params):
    """
    Project 3D points to 2D using the Human3.6M camera projection function.
    This is a differentiable and batched reimplementation of the original MATLAB script.

    Arguments:
    X -- 3D points in *camera space* to transform (N, *, 3)
    camera_params -- intrinsic parameters (N, 2+2+3+2=9)
    """
    assert X.shape[-1] == 3
    assert len(camera_params.shape) == 2
    assert camera_params.shape[-1] == 9
    assert X.shape[0] == camera_params.shape[0]

    while len(camera_params.shape) < len(X.shape):
        camera_params = camera_params.unsqueeze(1)

    f = camera_params[..., :2]
    c = camera_params[..., 2:4]
    k = camera_params[..., 4:7]
    p = camera_params[..., 7:]

    XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
    r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True)

    radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True)
    tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True)

    XXX = XX*(radial + tan) + p*r2

    return f*XXX + c

def project_to_2d_linear(X, camera_params):
    """
    Project 3D points to 2D using only linear parameters (focal length and principal point).

    Arguments:
    X -- 3D points in *camera space* to transform (N, *, 3)
    camera_params -- intrinsic parameters (N, 2+2+3+2=9)
    """
    assert X.shape[-1] == 3
    assert len(camera_params.shape) == 2
    assert camera_params.shape[-1] == 9
    assert X.shape[0] == camera_params.shape[0]

    while len(camera_params.shape) < len(X.shape):
        camera_params = camera_params.unsqueeze(1)

    f = camera_params[..., :2]
    c = camera_params[..., 2:4]

    XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)

    return f*XX + c
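
A quick sanity-check sketch (not from the upload): normalize_screen_coordinates and image_coordinates are inverses of each other for a given resolution, e.g. the 1000x1002 Human3.6M cameras; the keypoint values below are arbitrary pixels.

    import numpy as np
    from common.camera import normalize_screen_coordinates, image_coordinates

    kps = np.array([[512.5, 515.4], [100.0, 900.0]], dtype='float32')   # pixel coordinates
    norm = normalize_screen_coordinates(kps, w=1000, h=1002)            # x mapped to roughly [-1, 1]
    back = image_coordinates(norm, w=1000, h=1002)                      # back to pixels
    assert np.allclose(kps, back, atol=1e-3)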
common/custom_dataset.py ADDED
@@ -0,0 +1,66 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np
import copy
from common.skeleton import Skeleton
from common.mocap_dataset import MocapDataset
from common.camera import normalize_screen_coordinates, image_coordinates
from common.h36m_dataset import h36m_skeleton


custom_camera_params = {
    'id': None,
    'res_w': None,  # Pulled from metadata
    'res_h': None,  # Pulled from metadata

    # Dummy camera parameters (taken from Human3.6M), only for visualization purposes
    'azimuth': 70,  # Only used for visualization
    'orientation': [0.1407056450843811, -0.1500701755285263, -0.755240797996521, 0.6223280429840088],
    'translation': [1841.1070556640625, 4955.28466796875, 1563.4454345703125],
}

class CustomDataset(MocapDataset):
    def __init__(self, detections_path, remove_static_joints=True):
        super().__init__(fps=None, skeleton=h36m_skeleton)

        # Load serialized dataset
        data = np.load(detections_path, allow_pickle=True)
        resolutions = data['metadata'].item()['video_metadata']

        self._cameras = {}
        self._data = {}
        for video_name, res in resolutions.items():
            cam = {}
            cam.update(custom_camera_params)
            cam['orientation'] = np.array(cam['orientation'], dtype='float32')
            cam['translation'] = np.array(cam['translation'], dtype='float32')
            cam['translation'] = cam['translation']/1000  # mm to meters

            cam['id'] = video_name
            cam['res_w'] = res['w']
            cam['res_h'] = res['h']

            self._cameras[video_name] = [cam]

            self._data[video_name] = {
                'custom': {
                    'cameras': cam
                }
            }

        if remove_static_joints:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8

    def supports_semi_supervised(self):
        return False
common/generators.py ADDED
@@ -0,0 +1,261 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from itertools import zip_longest
import numpy as np


# def getbone(seq, boneindex):
#     bs = np.shape(seq)[0]
#     ss = np.shape(seq)[1]
#     seq = np.reshape(seq,(bs*ss,-1,3))
#     bone = []
#     for index in boneindex:
#         bone.append(seq[:,index[0]] - seq[:,index[1]])
#     bone = np.stack(bone,1)
#     bone = np.power(np.power(bone,2).sum(2),0.5)
#     bone = np.reshape(bone, (bs,ss,np.shape(bone)[1]))
#     return bone

class ChunkedGenerator:
    """
    Batched data generator, used for training.
    The sequences are split into equal-length chunks and padded as necessary.

    Arguments:
    batch_size -- the batch size to use for training
    cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
    poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
    poses_2d -- list of input 2D keypoints, one element for each video
    chunk_length -- number of output frames to predict for each training example (usually 1)
    pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
    causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
    shuffle -- randomly shuffle the dataset before each epoch
    random_seed -- initial seed to use for the random generator
    augment -- augment the dataset by flipping poses horizontally
    kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
    joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
    """
    def __init__(self, batch_size, cameras, poses_3d, poses_2d,
                 chunk_length, pad=0, causal_shift=0,
                 shuffle=True, random_seed=1234,
                 augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
                 endless=False):
        assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
        assert cameras is None or len(cameras) == len(poses_2d)

        # Build lineage info
        pairs = []  # (seq_idx, start_frame, end_frame, flip) tuples
        for i in range(len(poses_2d)):
            assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0]
            n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length
            offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2
            bounds = np.arange(n_chunks+1)*chunk_length - offset
            augment_vector = np.full(len(bounds) - 1, False, dtype=bool)
            pairs += zip(np.repeat(i, len(bounds) - 1), bounds[:-1], bounds[1:], augment_vector)
            if augment:
                pairs += zip(np.repeat(i, len(bounds) - 1), bounds[:-1], bounds[1:], ~augment_vector)

        # Initialize buffers
        if cameras is not None:
            self.batch_cam = np.empty((batch_size, cameras[0].shape[-1]))
        if poses_3d is not None:
            self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1]))
            # self.batch_3d = np.empty((batch_size, chunk_length + 2*pad, poses_3d[0].shape[-2], poses_3d[0].shape[-1]))
        self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1]))

        self.num_batches = (len(pairs) + batch_size - 1) // batch_size
        self.batch_size = batch_size
        self.random = np.random.RandomState(random_seed)
        self.pairs = pairs
        self.shuffle = shuffle
        self.pad = pad
        self.causal_shift = causal_shift
        self.endless = endless
        self.state = None

        self.cameras = cameras
        self.poses_3d = poses_3d
        self.poses_2d = poses_2d

        self.augment = augment
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right

    def num_frames(self):
        return self.num_batches * self.batch_size

    def random_state(self):
        return self.random

    def set_random_state(self, random):
        self.random = random

    def augment_enabled(self):
        return self.augment

    def next_pairs(self):
        if self.state is None:
            if self.shuffle:
                pairs = self.random.permutation(self.pairs)
            else:
                pairs = self.pairs
            return 0, pairs
        else:
            return self.state

    def next_epoch(self):
        enabled = True
        while enabled:
            start_idx, pairs = self.next_pairs()
            for b_i in range(start_idx, self.num_batches):
                chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size]
                for i, (seq_i, start_3d, end_3d, flip) in enumerate(chunks):
                    start_2d = start_3d - self.pad - self.causal_shift
                    end_2d = end_3d + self.pad - self.causal_shift

                    # 2D poses
                    seq_2d = self.poses_2d[seq_i]
                    low_2d = max(start_2d, 0)
                    high_2d = min(end_2d, seq_2d.shape[0])
                    pad_left_2d = low_2d - start_2d
                    pad_right_2d = end_2d - high_2d
                    if pad_left_2d != 0 or pad_right_2d != 0:
                        self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), 'edge')
                    else:
                        self.batch_2d[i] = seq_2d[low_2d:high_2d]

                    if flip:
                        # Flip 2D keypoints
                        self.batch_2d[i, :, :, 0] *= -1
                        self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left]

                    # 3D poses
                    if self.poses_3d is not None:
                        seq_3d = self.poses_3d[seq_i]
                        low_3d = max(start_3d, 0)
                        high_3d = min(end_3d, seq_3d.shape[0])
                        pad_left_3d = low_3d - start_3d
                        pad_right_3d = end_3d - high_3d
                        if pad_left_3d != 0 or pad_right_3d != 0:
                            # if pad_left_2d != 0 or pad_right_2d != 0:
                            #     self.batch_3d[i] = np.pad(seq_3d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), 'edge')
                            self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), 'edge')
                        else:
                            # self.batch_3d[i] = seq_3d[low_2d:high_2d]
                            self.batch_3d[i] = seq_3d[low_3d:high_3d]

                        if flip:
                            # Flip 3D joints
                            self.batch_3d[i, :, :, 0] *= -1
                            self.batch_3d[i, :, self.joints_left + self.joints_right] = \
                                self.batch_3d[i, :, self.joints_right + self.joints_left]

                    # Cameras
                    if self.cameras is not None:
                        self.batch_cam[i] = self.cameras[seq_i]
                        if flip:
                            # Flip horizontal distortion coefficients
                            self.batch_cam[i, 2] *= -1
                            self.batch_cam[i, 7] *= -1

                if self.endless:
                    self.state = (b_i + 1, pairs)
                if self.poses_3d is None and self.cameras is None:
                    yield None, None, self.batch_2d[:len(chunks)]
                elif self.poses_3d is not None and self.cameras is None:
                    yield None, self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)]
                    # yield None, self.batch_bins_3d[:len(chunks)], self.batch_2d[:len(chunks)]
                elif self.poses_3d is None:
                    yield self.batch_cam[:len(chunks)], None, self.batch_2d[:len(chunks)]
                else:
                    yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)]
                    # yield self.batch_cam[:len(chunks)], self.batch_bins_3d[:len(chunks)], self.batch_2d[:len(chunks)]

            if self.endless:
                self.state = None
            else:
                enabled = False


class UnchunkedGenerator:
    """
    Non-batched data generator, used for testing.
    Sequences are returned one at a time (i.e. batch size = 1), without chunking.

    If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2),
    the second of which is a mirrored version of the first.

    Arguments:
    cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
    poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
    poses_2d -- list of input 2D keypoints, one element for each video
    pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
    causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
    augment -- augment the dataset by flipping poses horizontally
    kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
    joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
    """

    def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0,
                 augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None):
        assert poses_3d is None or len(poses_3d) == len(poses_2d)
        assert cameras is None or len(cameras) == len(poses_2d)

        self.augment = False
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right

        self.pad = pad
        self.causal_shift = causal_shift
        self.cameras = [] if cameras is None else cameras
        self.poses_3d = [] if poses_3d is None else poses_3d
        self.poses_2d = poses_2d

    def num_frames(self):
        count = 0
        for p in self.poses_2d:
            count += p.shape[0]
        return count

    def augment_enabled(self):
        return self.augment

    def set_augment(self, augment):
        self.augment = augment

    def next_epoch(self):
        for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d):
            batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0)
            batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0)
            # batch_3d = np.expand_dims(np.pad(seq_3d,
            #                                  ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0), (0, 0)),
            #                                  'edge'), axis=0)
            batch_2d = np.expand_dims(np.pad(seq_2d,
                                             ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0), (0, 0)),
                                             'edge'), axis=0)
            if self.augment:
                # Append flipped version
                if batch_cam is not None:
                    batch_cam = np.concatenate((batch_cam, batch_cam), axis=0)
                    batch_cam[1, 2] *= -1
                    batch_cam[1, 7] *= -1

                if batch_3d is not None:
                    batch_3d = np.concatenate((batch_3d, batch_3d), axis=0)
                    batch_3d[1, :, :, 0] *= -1
                    batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :, self.joints_right + self.joints_left]

                batch_2d = np.concatenate((batch_2d, batch_2d), axis=0)
                batch_2d[1, :, :, 0] *= -1
                batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left]

            yield batch_cam, batch_3d, batch_2d
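
A minimal sketch of the training generator on random data (shapes assumed: 17 joints, 2D inputs, 3D targets; pad=40 gives the 81-frame receptive field matching the default arguments):

    import numpy as np
    from common.generators import ChunkedGenerator

    poses_2d = [np.random.randn(100, 17, 2).astype('float32')]
    poses_3d = [np.random.randn(100, 17, 3).astype('float32')]
    gen = ChunkedGenerator(batch_size=8, cameras=None, poses_3d=poses_3d, poses_2d=poses_2d,
                           chunk_length=1, pad=40)
    for _, batch_3d, batch_2d in gen.next_epoch():
        print(batch_3d.shape, batch_2d.shape)   # (8, 1, 17, 3) and (8, 81, 17, 2)
        break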
common/h36m_dataset.py ADDED
@@ -0,0 +1,263 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np
import copy
from common.skeleton import Skeleton
from common.mocap_dataset import MocapDataset
from common.camera import normalize_screen_coordinates, image_coordinates

h36m_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 11, 12, 13, 14, 12,
                                  16, 17, 18, 19, 20, 19, 22, 12, 24, 25, 26, 27, 28, 27, 30],
                         joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23],
                         joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31])

h36m_cameras_intrinsic_params = [
    {
        'id': '54138969',
        'center': [512.54150390625, 515.4514770507812],
        'focal_length': [1145.0494384765625, 1143.7811279296875],
        'radial_distortion': [-0.20709891617298126, 0.24777518212795258, -0.0030751503072679043],
        'tangential_distortion': [-0.0009756988729350269, -0.00142447161488235],
        'res_w': 1000,
        'res_h': 1002,
        'azimuth': 70,  # Only used for visualization
    },
    {
        'id': '55011271',
        'center': [508.8486328125, 508.0649108886719],
        'focal_length': [1149.6756591796875, 1147.5916748046875],
        'radial_distortion': [-0.1942136287689209, 0.2404085397720337, 0.006819975562393665],
        'tangential_distortion': [-0.0016190266469493508, -0.0027408944442868233],
        'res_w': 1000,
        'res_h': 1000,
        'azimuth': -70,  # Only used for visualization
    },
    {
        'id': '58860488',
        'center': [519.8158569335938, 501.40264892578125],
        'focal_length': [1149.1407470703125, 1148.7989501953125],
        'radial_distortion': [-0.2083381861448288, 0.25548800826072693, -0.0024604974314570427],
        'tangential_distortion': [0.0014843869721516967, -0.0007599993259645998],
        'res_w': 1000,
        'res_h': 1000,
        'azimuth': 110,  # Only used for visualization
    },
    {
        'id': '60457274',
        'center': [514.9682006835938, 501.88201904296875],
        'focal_length': [1145.5113525390625, 1144.77392578125],
        'radial_distortion': [-0.198384091258049, 0.21832367777824402, -0.008947807364165783],
        'tangential_distortion': [-0.0005872055771760643, -0.0018133620033040643],
        'res_w': 1000,
        'res_h': 1002,
        'azimuth': -110,  # Only used for visualization
    },
]

h36m_cameras_extrinsic_params = {
    'S1': [
        {
            'orientation': [0.1407056450843811, -0.1500701755285263, -0.755240797996521, 0.6223280429840088],
            'translation': [1841.1070556640625, 4955.28466796875, 1563.4454345703125],
        },
        {
            'orientation': [0.6157187819480896, -0.764836311340332, -0.14833825826644897, 0.11794740706682205],
            'translation': [1761.278564453125, -5078.0068359375, 1606.2650146484375],
        },
        {
            'orientation': [0.14651472866535187, -0.14647851884365082, 0.7653023600578308, -0.6094175577163696],
            'translation': [-1846.7777099609375, 5215.04638671875, 1491.972412109375],
        },
        {
            'orientation': [0.5834008455276489, -0.7853162288665771, 0.14548823237419128, -0.14749594032764435],
            'translation': [-1794.7896728515625, -3722.698974609375, 1574.8927001953125],
        },
    ],
    'S2': [
        {},
        {},
        {},
        {},
    ],
    'S3': [
        {},
        {},
        {},
        {},
    ],
    'S4': [
        {},
        {},
        {},
        {},
    ],
    'S5': [
        {
            'orientation': [0.1467377245426178, -0.162370964884758, -0.7551892995834351, 0.6178938746452332],
            'translation': [2097.3916015625, 4880.94482421875, 1605.732421875],
        },
        {
            'orientation': [0.6159758567810059, -0.7626792192459106, -0.15728192031383514, 0.1189815029501915],
            'translation': [2031.7008056640625, -5167.93310546875, 1612.923095703125],
        },
        {
            'orientation': [0.14291371405124664, -0.12907841801643372, 0.7678384780883789, -0.6110143065452576],
            'translation': [-1620.5948486328125, 5171.65869140625, 1496.43701171875],
        },
        {
            'orientation': [0.5920479893684387, -0.7814217805862427, 0.1274748593568802, -0.15036417543888092],
            'translation': [-1637.1737060546875, -3867.3173828125, 1547.033203125],
        },
    ],
    'S6': [
        {
            'orientation': [0.1337897777557373, -0.15692396461963654, -0.7571090459823608, 0.6198879480361938],
            'translation': [1935.4517822265625, 4950.24560546875, 1618.0838623046875],
        },
        {
            'orientation': [0.6147197484970093, -0.7628812789916992, -0.16174767911434174, 0.11819244921207428],
            'translation': [1969.803955078125, -5128.73876953125, 1632.77880859375],
        },
        {
            'orientation': [0.1529948115348816, -0.13529130816459656, 0.7646096348762512, -0.6112781167030334],
            'translation': [-1769.596435546875, 5185.361328125, 1476.993408203125],
        },
        {
            'orientation': [0.5916101336479187, -0.7804774045944214, 0.12832270562648773, -0.1561593860387802],
            'translation': [-1721.668701171875, -3884.13134765625, 1540.4879150390625],
        },
    ],
    'S7': [
        {
            'orientation': [0.1435241848230362, -0.1631336808204651, -0.7548328638076782, 0.6188824772834778],
            'translation': [1974.512939453125, 4926.3544921875, 1597.8326416015625],
        },
        {
            'orientation': [0.6141672730445862, -0.7638262510299683, -0.1596645563840866, 0.1177929937839508],
            'translation': [1937.0584716796875, -5119.7900390625, 1631.5665283203125],
        },
        {
            'orientation': [0.14550060033798218, -0.12874816358089447, 0.7660516500473022, -0.6127139329910278],
            'translation': [-1741.8111572265625, 5208.24951171875, 1464.8245849609375],
        },
        {
            'orientation': [0.5912848114967346, -0.7821764349937439, 0.12445473670959473, -0.15196487307548523],
            'translation': [-1734.7105712890625, -3832.42138671875, 1548.5830078125],
        },
    ],
    'S8': [
        {
            'orientation': [0.14110587537288666, -0.15589867532253265, -0.7561917304992676, 0.619644045829773],
            'translation': [2150.65185546875, 4896.1611328125, 1611.9046630859375],
        },
        {
            'orientation': [0.6169601678848267, -0.7647668123245239, -0.14846350252628326, 0.11158157885074615],
            'translation': [2219.965576171875, -5148.453125, 1613.0440673828125],
        },
        {
            'orientation': [0.1471444070339203, -0.13377119600772858, 0.7670128345489502, -0.6100369691848755],
            'translation': [-1571.2215576171875, 5137.0185546875, 1498.1761474609375],
        },
        {
            'orientation': [0.5927824378013611, -0.7825870513916016, 0.12147816270589828, -0.14631995558738708],
            'translation': [-1476.913330078125, -3896.7412109375, 1547.97216796875],
        },
    ],
    'S9': [
        {
            'orientation': [0.15540587902069092, -0.15548215806484222, -0.7532095313072205, 0.6199594736099243],
            'translation': [2044.45849609375, 4935.1171875, 1481.2275390625],
        },
        {
            'orientation': [0.618784487247467, -0.7634735107421875, -0.14132238924503326, 0.11933968216180801],
            'translation': [1990.959716796875, -5123.810546875, 1568.8048095703125],
        },
        {
            'orientation': [0.13357827067375183, -0.1367100477218628, 0.7689454555511475, -0.6100738644599915],
            'translation': [-1670.9921875, 5211.98583984375, 1528.387939453125],
        },
        {
            'orientation': [0.5879399180412292, -0.7823407053947449, 0.1427614390850067, -0.14794869720935822],
            'translation': [-1696.04345703125, -3827.099853515625, 1591.4127197265625],
        },
    ],
    'S11': [
        {
            'orientation': [0.15232472121715546, -0.15442320704460144, -0.7547563314437866, 0.6191070079803467],
            'translation': [2098.440185546875, 4926.5546875, 1500.278564453125],
        },
        {
            'orientation': [0.6189449429512024, -0.7600917220115662, -0.15300633013248444, 0.1255258321762085],
            'translation': [2083.182373046875, -4912.1728515625, 1561.07861328125],
        },
        {
            'orientation': [0.14943228662014008, -0.15650227665901184, 0.7681233882904053, -0.6026304364204407],
            'translation': [-1609.8153076171875, 5177.3359375, 1537.896728515625],
        },
        {
            'orientation': [0.5894251465797424, -0.7818877100944519, 0.13991211354732513, -0.14715361595153809],
            'translation': [-1590.738037109375, -3854.1689453125, 1578.017578125],
        },
    ],
}

class Human36mDataset(MocapDataset):
    def __init__(self, path, remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ['id', 'res_w', 'res_h']:
                        cam[k] = np.array(v, dtype='float32')

                # Normalize camera frame
                cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
                cam['focal_length'] = cam['focal_length']/cam['res_w']*2
                if 'translation' in cam:
                    cam['translation'] = cam['translation']/1000  # mm to meters

                # Add intrinsic parameters vector
                cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                                   cam['center'],
                                                   cam['radial_distortion'],
                                                   cam['tangential_distortion'],
                                                   [1/cam['focal_length'][0], 0, -cam['center'][0]/cam['focal_length'][0],
                                                    0, 1/cam['focal_length'][1], -cam['center'][1]/cam['focal_length'][1],
                                                    0, 0, 1]))

                # proj_matrix = np.array([1/cam['focal_length'][0], 0, -cam['center'][0]/cam['focal_length'][0],
                #                         0, 1/cam['focal_length'][1], -cam['center'][1]/cam['focal_length'][1],
                #                         0, 0, 1])
                # cam['intrinsic'] = np.concatenate(camera_intrinsics, proj_matrix)

        # Load serialized dataset
        data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,
                    'cameras': self._cameras[subject],
                }

        if remove_static_joints:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8

    def supports_semi_supervised(self):
        return True
common/loss.py ADDED
@@ -0,0 +1,108 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt


def mpjpe(predicted, target):
    """
    Mean per-joint position error (i.e. mean Euclidean distance),
    often referred to as "Protocol #1" in many papers.
    """
    assert predicted.shape == target.shape
    return torch.mean(torch.norm(predicted - target, dim=len(target.shape)-1))

def mse(predicted, target, weights=None, gamma=0):
    loss = nn.MSELoss()
    return loss(predicted, target)

def cross_entropy(predicted, target, weights=None, gamma=0, return_weights=False):
    loss = nn.CrossEntropyLoss()
    return loss(predicted.permute(0, 4, 1, 2, 3), target)

def weighted_mpjpe(predicted, target, w):
    """
    Weighted mean per-joint position error (i.e. mean Euclidean distance)
    """
    assert predicted.shape == target.shape
    assert w.shape[0] == predicted.shape[0]
    return torch.mean(w * torch.norm(predicted - target, dim=len(target.shape)-1))

def p_mpjpe(predicted, target):
    """
    Pose error: MPJPE after rigid alignment (scale, rotation, and translation),
    often referred to as "Protocol #2" in many papers.
    """
    assert predicted.shape == target.shape

    muX = np.mean(target, axis=1, keepdims=True)
    muY = np.mean(predicted, axis=1, keepdims=True)

    X0 = target - muX
    Y0 = predicted - muY

    normX = np.sqrt(np.sum(X0**2, axis=(1, 2), keepdims=True))
    normY = np.sqrt(np.sum(Y0**2, axis=(1, 2), keepdims=True))

    X0 /= normX
    Y0 /= normY

    H = np.matmul(X0.transpose(0, 2, 1), Y0)
    U, s, Vt = np.linalg.svd(H)
    V = Vt.transpose(0, 2, 1)
    R = np.matmul(V, U.transpose(0, 2, 1))

    # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1
    sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1))
    V[:, :, -1] *= sign_detR
    s[:, -1] *= sign_detR.flatten()
    R = np.matmul(V, U.transpose(0, 2, 1))  # Rotation

    tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2)

    a = tr * normX / normY  # Scale
    t = muX - a*np.matmul(muY, R)  # Translation

    # Perform rigid transformation on the input
    predicted_aligned = a*np.matmul(predicted, R) + t

    # Return MPJPE
    return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape)-1))

def n_mpjpe(predicted, target):
    """
    Normalized MPJPE (scale only), adapted from:
    https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py
    """
    assert predicted.shape == target.shape

    norm_predicted = torch.mean(torch.sum(predicted**2, dim=3, keepdim=True), dim=2, keepdim=True)
    norm_target = torch.mean(torch.sum(target*predicted, dim=3, keepdim=True), dim=2, keepdim=True)
    scale = norm_target / norm_predicted
    return mpjpe(scale * predicted, target)  # [0]

def weighted_bonelen_loss(predict_3d_length, gt_3d_length):
    loss_length = 0.001 * torch.pow(predict_3d_length - gt_3d_length, 2).mean()
    return loss_length

def weighted_boneratio_loss(predict_3d_length, gt_3d_length):
    loss_length = 0.1 * torch.pow((predict_3d_length - gt_3d_length)/gt_3d_length, 2).mean()
    return loss_length

def mean_velocity_error(predicted, target):
    """
    Mean per-joint velocity error (i.e. mean Euclidean distance of the 1st derivative)
    """
    assert predicted.shape == target.shape

    velocity_predicted = np.diff(predicted, axis=0)
    velocity_target = np.diff(target, axis=0)

    return np.mean(np.linalg.norm(velocity_predicted - velocity_target, axis=len(target.shape)-1))
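
A small sketch (random values) of the two main evaluation protocols: mpjpe expects torch tensors, p_mpjpe expects NumPy arrays, both of shape (batch, joints, 3).

    import torch
    from common.loss import mpjpe, p_mpjpe

    pred, gt = torch.rand(4, 17, 3), torch.rand(4, 17, 3)
    print(mpjpe(pred, gt).item())               # Protocol #1: mean per-joint position error
    print(p_mpjpe(pred.numpy(), gt.numpy()))    # Protocol #2: error after rigid alignment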
common/mocap_dataset.py ADDED
@@ -0,0 +1,44 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np
from common.skeleton import Skeleton

class MocapDataset:
    def __init__(self, fps, skeleton):
        self._skeleton = skeleton
        self._fps = fps
        self._data = None  # Must be filled by subclass
        self._cameras = None  # Must be filled by subclass

    def remove_joints(self, joints_to_remove):
        kept_joints = self._skeleton.remove_joints(joints_to_remove)
        for subject in self._data.keys():
            for action in self._data[subject].keys():
                s = self._data[subject][action]
                if 'positions' in s:
                    s['positions'] = s['positions'][:, kept_joints]


    def __getitem__(self, key):
        return self._data[key]

    def subjects(self):
        return self._data.keys()

    def fps(self):
        return self._fps

    def skeleton(self):
        return self._skeleton

    def cameras(self):
        return self._cameras

    def supports_semi_supervised(self):
        # This method can be overridden
        return False
common/model_poseformer.py ADDED
@@ -0,0 +1,242 @@
## Our PoseFormer model was revised from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
# Written by Ce Zheng (cezheng@knights.ucf.edu)
# Modified by Qitao Zhao (qitaozhao@mail.sdu.edu.cn)

import math
import logging
from functools import partial
from einops import rearrange

import torch
import torch_dct as dct
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from timm.models.layers import DropPath


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class FreqMlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        b, f, _ = x.shape
        x = dct.dct(x.permute(0, 2, 1)).permute(0, 2, 1).contiguous()
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        x = dct.idct(x.permute(0, 2, 1)).permute(0, 2, 1).contiguous()
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class MixedBlock(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp1 = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        self.norm3 = norm_layer(dim)
        self.mlp2 = FreqMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x):
        b, f, c = x.shape
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x1 = x[:, :f//2] + self.drop_path(self.mlp1(self.norm2(x[:, :f//2])))
        x2 = x[:, f//2:] + self.drop_path(self.mlp2(self.norm3(x[:, f//2:])))
        return torch.cat((x1, x2), dim=1)


class PoseTransformerV2(nn.Module):
    def __init__(self, num_frame=9, num_joints=17, in_chans=2,
                 num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=None, args=None):
        """ ##########hybrid_backbone=None, representation_size=None,
        Args:
            num_frame (int, tuple): input frame number
            num_joints (int, tuple): joints number
            in_chans (int): number of input channels, 2D joints have 2 channels: (x,y)
            embed_dim_ratio (int): embedding dimension ratio
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer
        """
        super().__init__()

        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        embed_dim_ratio = args.embed_dim_ratio
        depth = args.depth
        embed_dim = embed_dim_ratio * num_joints   #### temporal embed_dim is num_joints * spatial embedding dim ratio
        out_dim = num_joints * 3                   #### output dimension is num_joints * 3
        self.num_frame_kept = args.number_of_kept_frames
        self.num_coeff_kept = args.number_of_kept_coeffs if args.number_of_kept_coeffs else self.num_frame_kept

        ### spatial patch embedding
        self.Joint_embedding = nn.Linear(in_chans, embed_dim_ratio)
        self.Freq_embedding = nn.Linear(in_chans*num_joints, embed_dim)

        self.Spatial_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio))
        self.Temporal_pos_embed = nn.Parameter(torch.zeros(1, self.num_frame_kept, embed_dim))
        self.Temporal_pos_embed_ = nn.Parameter(torch.zeros(1, self.num_coeff_kept, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        self.Spatial_blocks = nn.ModuleList([
            Block(
                dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
            for i in range(depth)])

        self.blocks = nn.ModuleList([
            MixedBlock(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
            for i in range(depth)])

        self.Spatial_norm = norm_layer(embed_dim_ratio)
        self.Temporal_norm = norm_layer(embed_dim)

        ####### An easy way to implement weighted mean
        self.weighted_mean = torch.nn.Conv1d(in_channels=self.num_coeff_kept, out_channels=1, kernel_size=1)
        self.weighted_mean_ = torch.nn.Conv1d(in_channels=self.num_frame_kept, out_channels=1, kernel_size=1)

        self.head = nn.Sequential(
            nn.LayerNorm(embed_dim*2),
            nn.Linear(embed_dim*2, out_dim),
        )

    def Spatial_forward_features(self, x):
        b, f, p, _ = x.shape  ##### b is batch size, f is number of frames, p is number of joints
        num_frame_kept = self.num_frame_kept

        index = torch.arange((f-1)//2-num_frame_kept//2, (f-1)//2+num_frame_kept//2+1)

        x = self.Joint_embedding(x[:, index].view(b*num_frame_kept, p, -1))
        x += self.Spatial_pos_embed
        x = self.pos_drop(x)

        for blk in self.Spatial_blocks:
            x = blk(x)

        x = self.Spatial_norm(x)
        x = rearrange(x, '(b f) p c -> b f (p c)', f=num_frame_kept)
        return x

    def forward_features(self, x, Spatial_feature):
        b, f, p, _ = x.shape
        num_coeff_kept = self.num_coeff_kept

        x = dct.dct(x.permute(0, 2, 3, 1))[:, :, :, :num_coeff_kept]
        x = x.permute(0, 3, 1, 2).contiguous().view(b, num_coeff_kept, -1)
        x = self.Freq_embedding(x)

        Spatial_feature += self.Temporal_pos_embed
        x += self.Temporal_pos_embed_
        x = torch.cat((x, Spatial_feature), dim=1)

        for blk in self.blocks:
            x = blk(x)

        x = self.Temporal_norm(x)
        return x

    def forward(self, x):
        b, f, p, _ = x.shape
        x_ = x.clone()

        Spatial_feature = self.Spatial_forward_features(x)
        x = self.forward_features(x_, Spatial_feature)
        x = torch.cat((self.weighted_mean(x[:, :self.num_coeff_kept]), self.weighted_mean_(x[:, self.num_coeff_kept:])), dim=-1)

        x = self.head(x).view(b, 1, p, -1)
        return x
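
A minimal instantiation sketch: the constructor reads embed_dim_ratio, depth, number_of_kept_frames and number_of_kept_coeffs from the args namespace produced by common/arguments.py; here a SimpleNamespace stands in for it, and the values are simply the defaults listed above.

    from types import SimpleNamespace
    import torch
    from common.model_poseformer import PoseTransformerV2

    args = SimpleNamespace(embed_dim_ratio=32, depth=4,
                           number_of_kept_frames=27, number_of_kept_coeffs=27)
    model = PoseTransformerV2(num_frame=81, num_joints=17, in_chans=2, args=args)
    x = torch.randn(2, 81, 17, 2)      # (batch, frames, joints, x/y)
    print(model(x).shape)              # torch.Size([2, 1, 17, 3]) -- one 3D pose per sequence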
common/quaternion.py ADDED
@@ -0,0 +1,35 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import torch

def qrot(q, v):
    """
    Rotate vector(s) v about the rotation described by quaternion(s) q.
    Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
    where * denotes any number of dimensions.
    Returns a tensor of shape (*, 3).
    """
    assert q.shape[-1] == 4
    assert v.shape[-1] == 3
    assert q.shape[:-1] == v.shape[:-1]

    qvec = q[..., 1:]
    uv = torch.cross(qvec, v, dim=len(q.shape)-1)
    uuv = torch.cross(qvec, uv, dim=len(q.shape)-1)
    return (v + 2 * (q[..., :1] * uv + uuv))


def qinverse(q, inplace=False):
    # We assume the quaternion to be normalized
    if inplace:
        q[..., 1:] *= -1
        return q
    else:
        w = q[..., :1]
        xyz = q[..., 1:]
        return torch.cat((w, -xyz), dim=len(q.shape)-1)
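
A short sketch: rotating by q and then by its inverse recovers the original vector (unit quaternion assumed, w-first convention).

    import torch
    from common.quaternion import qrot, qinverse

    q = torch.tensor([[0.7071, 0.0, 0.7071, 0.0]])   # ~90 degrees about the y axis
    v = torch.tensor([[1.0, 0.0, 0.0]])
    rotated = qrot(q, v)                              # ~[0, 0, -1]
    restored = qrot(qinverse(q), rotated)             # ~[1, 0, 0]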
common/skeleton.py ADDED
@@ -0,0 +1,88 @@
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np

class Skeleton:
    def __init__(self, parents, joints_left, joints_right):
        assert len(joints_left) == len(joints_right)

        self._parents = np.array(parents)
        self._joints_left = joints_left
        self._joints_right = joints_right
        self._compute_metadata()

    def num_joints(self):
        return len(self._parents)

    def parents(self):
        return self._parents

    def has_children(self):
        return self._has_children

    def children(self):
        return self._children

    def remove_joints(self, joints_to_remove):
        """
        Remove the joints specified in 'joints_to_remove'.
        """
        valid_joints = []
        for joint in range(len(self._parents)):
            if joint not in joints_to_remove:
                valid_joints.append(joint)

        for i in range(len(self._parents)):
            while self._parents[i] in joints_to_remove:
                self._parents[i] = self._parents[self._parents[i]]

        index_offsets = np.zeros(len(self._parents), dtype=int)
        new_parents = []
        for i, parent in enumerate(self._parents):
            if i not in joints_to_remove:
                new_parents.append(parent - index_offsets[parent])
            else:
                index_offsets[i:] += 1
        self._parents = np.array(new_parents)


        if self._joints_left is not None:
            new_joints_left = []
            for joint in self._joints_left:
                if joint in valid_joints:
                    new_joints_left.append(joint - index_offsets[joint])
            self._joints_left = new_joints_left
        if self._joints_right is not None:
            new_joints_right = []
            for joint in self._joints_right:
                if joint in valid_joints:
                    new_joints_right.append(joint - index_offsets[joint])
            self._joints_right = new_joints_right

        self._compute_metadata()

        return valid_joints

    def joints_left(self):
        return self._joints_left

    def joints_right(self):
        return self._joints_right

    def _compute_metadata(self):
        self._has_children = np.zeros(len(self._parents)).astype(bool)
        for i, parent in enumerate(self._parents):
            if parent != -1:
                self._has_children[parent] = True

        self._children = []
        for i, parent in enumerate(self._parents):
            self._children.append([])
        for i, parent in enumerate(self._parents):
            if parent != -1:
                self._children[parent].append(i)
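
A toy sketch of remove_joints on a hypothetical 5-joint chain: removing joint 2 re-parents its child to joint 1 and re-indexes everything after it.

    from common.skeleton import Skeleton

    sk = Skeleton(parents=[-1, 0, 1, 2, 3], joints_left=[1], joints_right=[3])
    kept = sk.remove_joints([2])
    print(kept)          # [0, 1, 3, 4]
    print(sk.parents())  # [-1  0  1  2]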
common/utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+
8
+ import torch
9
+ import numpy as np
10
+ import hashlib
11
+
12
+ def wrap(func, *args, unsqueeze=False):
13
+ """
14
+ Wrap a torch function so it can be called with NumPy arrays.
15
+ Input and return types are seamlessly converted.
16
+ """
17
+
18
+ # Convert input types where applicable
19
+ args = list(args)
20
+ for i, arg in enumerate(args):
21
+ if type(arg) == np.ndarray:
22
+ args[i] = torch.from_numpy(arg)
23
+ if unsqueeze:
24
+ args[i] = args[i].unsqueeze(0)
25
+
26
+ result = func(*args)
27
+
28
+ # Convert output types where applicable
29
+ if isinstance(result, tuple):
30
+ result = list(result)
31
+ for i, res in enumerate(result):
32
+ if type(res) == torch.Tensor:
33
+ if unsqueeze:
34
+ res = res.squeeze(0)
35
+ result[i] = res.numpy()
36
+ return tuple(result)
37
+ elif type(result) == torch.Tensor:
38
+ if unsqueeze:
39
+ result = result.squeeze(0)
40
+ return result.numpy()
41
+ else:
42
+ return result
43
+
44
+ def deterministic_random(min_value, max_value, data):
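+ # Hash 'data' with SHA-256 and scale the first four bytes into the requested range, so the same string always yields the same value.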
45
+ digest = hashlib.sha256(data.encode()).digest()
46
+ raw_value = int.from_bytes(digest[:4], byteorder='little', signed=False)
47
+ return int(raw_value / (2**32 - 1) * (max_value - min_value)) + min_value
48
+
49
+ def load_pretrained_weights(model, checkpoint):
50
+ """Load pretrianed weights to model
51
+ Incompatible layers (unmatched in name or size) will be ignored
52
+ Args:
53
+ - model (nn.Module): network model, which must not be nn.DataParallel
54
+ - checkpoint (dict): loaded checkpoint, either a dict containing a 'state_dict' entry or a raw state_dict
55
+ """
56
+ import collections
57
+ if 'state_dict' in checkpoint:
58
+ state_dict = checkpoint['state_dict']
59
+ else:
60
+ state_dict = checkpoint
61
+ model_dict = model.state_dict()
62
+ new_state_dict = collections.OrderedDict()
63
+ matched_layers, discarded_layers = [], []
64
+ for k, v in state_dict.items():
65
+ # If the pretrained state_dict was saved as nn.DataParallel,
66
+ # keys would contain "module.", which should be ignored.
67
+ if k.startswith('module.'):
68
+ k = k[7:]
69
+ if k in model_dict and model_dict[k].size() == v.size():
70
+ new_state_dict[k] = v
71
+ matched_layers.append(k)
72
+ else:
73
+ discarded_layers.append(k)
74
+ # new_state_dict.requires_grad = False
75
+ model_dict.update(new_state_dict)
76
+
77
+ model.load_state_dict(model_dict)
78
+ print('load_weight', len(matched_layers))
79
+ # model.state_dict(model_dict).requires_grad = False
80
+ return model
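
A short usage sketch (not part of the uploaded file; the tensors, key string, and checkpoint below are made up for illustration):

import numpy as np
import torch
from common.utils import wrap, deterministic_random, load_pretrained_weights

# wrap: call a torch function directly on NumPy arrays
a = np.random.randn(3, 4).astype('float32')
b = np.random.randn(4, 5).astype('float32')
c = wrap(torch.matmul, a, b)
print(type(c), c.shape)                       # <class 'numpy.ndarray'> (3, 5)

# deterministic_random: the same key string always yields the same offset
print(deterministic_random(0, 100, 'S1_Walking.54138969'))

# load_pretrained_weights: only layers matching in name and shape are copied
model = torch.nn.Linear(2, 2)
ckpt = {'state_dict': {'module.weight': torch.zeros(2, 2), 'bias': torch.ones(3)}}
model = load_pretrained_weights(model, ckpt)  # 'bias' is discarded (shape mismatch)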
common/visualization.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+
8
+ import matplotlib
9
+
10
+ matplotlib.use('Agg')
11
+
12
+
13
+ import matplotlib.pyplot as plt
14
+ from matplotlib.animation import FuncAnimation, writers
15
+ from mpl_toolkits.mplot3d import Axes3D
16
+ import numpy as np
17
+ import subprocess as sp
18
+ import cv2
19
+
20
+
21
+ def get_resolution(filename):
22
+ command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
23
+ '-show_entries', 'stream=width,height', '-of', 'csv=p=0', filename]
24
+ with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
25
+ for line in pipe.stdout:
26
+ w, h = line.decode().strip().split(',')
27
+ return int(w), int(h)
28
+
29
+
30
+ def get_fps(filename):
31
+ command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
32
+ '-show_entries', 'stream=r_frame_rate', '-of', 'csv=p=0', filename]
33
+ with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
34
+ for line in pipe.stdout:
35
+ a, b = line.decode().strip().split('/')
36
+ return int(a) / int(b)
37
+
38
+
39
+ def read_video(filename, skip=0, limit=-1):
40
+ # w, h = get_resolution(filename)
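+ # NOTE: the frame size is hardcoded below; restore the get_resolution() call above if the input video has a different resolution.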
41
+ w = 1000
42
+ h = 1002
43
+
44
+ command = ['ffmpeg',
45
+ '-i', filename,
46
+ '-f', 'image2pipe',
47
+ '-pix_fmt', 'rgb24',
48
+ '-vsync', '0',
49
+ '-vcodec', 'rawvideo', '-']
50
+
51
+ i = 0
52
+ with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
53
+ while True:
54
+ data = pipe.stdout.read(w * h * 3)
55
+ if not data:
56
+ break
57
+ i += 1
58
+ if i > limit and limit != -1:
59
+ continue
60
+ if i > skip:
61
+ yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3))
62
+
63
+
64
+ def downsample_tensor(X, factor):
65
+ length = X.shape[0] // factor * factor
66
+ return np.mean(X[:length].reshape(-1, factor, *X.shape[1:]), axis=1)
67
+
68
+
69
+ def render_animation(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, viewport,
70
+ limit=-1, downsample=1, size=6, input_video_path=None, input_video_skip=0):
71
+ """
72
+ TODO
73
+ Render an animation. The supported output modes are:
74
+ -- 'interactive': display an interactive figure
75
+ (also works on notebooks if associated with %matplotlib inline)
76
+ -- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...).
77
+ -- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg).
78
+ -- 'filename.gif': render and export the animation a gif file (requires imagemagick).
79
+ """
80
+ plt.ioff()
81
+ fig = plt.figure(figsize=(size * (1 + len(poses)), size))
82
+ ax_in = fig.add_subplot(1, 1 + len(poses), 1)
83
+ ax_in.get_xaxis().set_visible(False)
84
+ ax_in.get_yaxis().set_visible(False)
85
+ ax_in.set_axis_off()
86
+ ax_in.set_title('Input')
87
+
88
+ ax_3d = []
89
+ lines_3d = []
90
+ trajectories = []
91
+ radius = 1.7
92
+ for index, (title, data) in enumerate(poses.items()):
93
+ ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d')
94
+ ax.view_init(elev=15., azim=azim)
95
+ ax.set_xlim3d([-radius / 2, radius / 2])
96
+ ax.set_zlim3d([0, radius])
97
+ ax.set_ylim3d([-radius / 2, radius / 2])
98
+ try:
99
+ ax.set_aspect('equal')
100
+ except NotImplementedError:
101
+ ax.set_aspect('auto')
102
+ ax.set_xticklabels([])
103
+ ax.set_yticklabels([])
104
+ ax.set_zticklabels([])
105
+ ax.dist = 7.5
106
+ ax.set_title(title) # , pad=35
107
+ ax_3d.append(ax)
108
+ lines_3d.append([])
109
+ trajectories.append(data[:, 0, [0, 1]])
110
+ poses = list(poses.values())
111
+
112
+ # Decode video
113
+ if input_video_path is None:
114
+ # Black background
115
+ all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8')
116
+ else:
117
+ # Load video using ffmpeg
118
+ all_frames = []
119
+ for f in read_video(input_video_path, skip=input_video_skip, limit=limit):
120
+ all_frames.append(f)
121
+ effective_length = min(keypoints.shape[0], len(all_frames))
122
+ all_frames = all_frames[:effective_length]
123
+
124
+ keypoints = keypoints[input_video_skip:] # todo remove
125
+ for idx in range(len(poses)):
126
+ poses[idx] = poses[idx][input_video_skip:]
127
+
128
+ if fps is None:
129
+ fps = get_fps(input_video_path)
130
+
131
+ if downsample > 1:
132
+ keypoints = downsample_tensor(keypoints, downsample)
133
+ all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8')
134
+ for idx in range(len(poses)):
135
+ poses[idx] = downsample_tensor(poses[idx], downsample)
136
+ trajectories[idx] = downsample_tensor(trajectories[idx], downsample)
137
+ fps /= downsample
138
+
139
+ initialized = False
140
+ image = None
141
+ lines = []
142
+ points = None
143
+
144
+ if limit < 1:
145
+ limit = len(all_frames)
146
+ else:
147
+ limit = min(limit, len(all_frames))
148
+
149
+ parents = skeleton.parents()
150
+
151
+ def update_video(i):
152
+ nonlocal initialized, image, lines, points
153
+
154
+ for n, ax in enumerate(ax_3d):
155
+ ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]])
156
+ ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]])
157
+
158
+ # Update 2D poses
159
+ joints_right_2d = keypoints_metadata['keypoints_symmetry'][1]
160
+ colors_2d = np.full(keypoints.shape[1], 'black')
161
+ colors_2d[joints_right_2d] = 'red'
162
+ if not initialized:
163
+ image = ax_in.imshow(all_frames[i], aspect='equal')
164
+
165
+ for j, j_parent in enumerate(parents):
166
+ if j_parent == -1:
167
+ continue
168
+
169
+ if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco':
170
+ # Draw skeleton only if keypoints match (otherwise we don't have the parents definition)
171
+ lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]],
172
+ [keypoints[i, j, 1], keypoints[i, j_parent, 1]], color='pink'))
173
+
174
+ col = 'red' if j in skeleton.joints_right() else 'black'
175
+ for n, ax in enumerate(ax_3d):
176
+ pos = poses[n][i]
177
+ lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]],
178
+ [pos[j, 1], pos[j_parent, 1]],
179
+ [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col))
180
+
181
+ points = ax_in.scatter(*keypoints[i].T, 10, color=colors_2d, edgecolors='white', zorder=10)
182
+
183
+ initialized = True
184
+ else:
185
+ image.set_data(all_frames[i])
186
+
187
+ for j, j_parent in enumerate(parents):
188
+ if j_parent == -1:
189
+ continue
190
+
191
+ if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco':
192
+ lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]],
193
+ [keypoints[i, j, 1], keypoints[i, j_parent, 1]])
194
+
195
+ for n, ax in enumerate(ax_3d):
196
+ pos = poses[n][i]
197
+ lines_3d[n][j - 1][0].set_xdata(np.array([pos[j, 0], pos[j_parent, 0]]))
198
+ lines_3d[n][j - 1][0].set_ydata(np.array([pos[j, 1], pos[j_parent, 1]]))
199
+ lines_3d[n][j - 1][0].set_3d_properties(np.array([pos[j, 2], pos[j_parent, 2]]), zdir='z')
200
+
201
+ points.set_offsets(keypoints[i])
202
+
203
+ print('{}/{} '.format(i, limit), end='\r')
204
+
205
+ fig.tight_layout()
206
+
207
+ anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False)
208
+ if output.endswith('.mp4'):
209
+ Writer = writers['ffmpeg']
210
+ writer = Writer(fps=fps, metadata={}, bitrate=bitrate)
211
+ anim.save(output, writer=writer)
212
+ elif output.endswith('.gif'):
213
+ anim.save(output, dpi=80, writer='imagemagick')
214
+ else:
215
+ raise ValueError('Unsupported output format (only .mp4 and .gif are supported)')
216
+ plt.close()
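
To illustrate the expected inputs, a minimal end-to-end sketch (not part of the uploaded file; the joint layout, metadata, and output name are hypothetical, and the .mp4 export assumes ffmpeg is installed):

import numpy as np
from common.skeleton import Skeleton
from common.visualization import render_animation

frames = 30
skeleton = Skeleton(parents=[-1, 0, 1, 0, 3], joints_left=[1, 2], joints_right=[3, 4])
keypoints = np.random.rand(frames, 5, 2) * 400             # fake 2D detections, in pixels
poses = {'Reconstruction': np.random.rand(frames, 5, 3)}   # fake 3D poses, in meters
metadata = {'layout_name': 'custom', 'keypoints_symmetry': ([1, 2], [3, 4])}

render_animation(keypoints, metadata, poses, skeleton, fps=25, bitrate=3000, azim=70.0,
                 output='demo.mp4', viewport=(400, 400), size=4)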