Fred808 committed on
Commit
8a45a74
·
verified ·
1 Parent(s): af80243

Upload 10 files

Browse files
utils/PYTORCH3D_LICENSE ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BSD License
2
+
3
+ For PyTorch3D software
4
+
5
+ Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without modification,
8
+ are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name Facebook nor the names of its contributors may be used to
18
+ endorse or promote products derived from this software without specific
19
+ prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
utils/config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Root directory containing the SMPL body-model assets.
SMPL_DATA_PATH = "./body_models/smpl"

# Kinematic tree (parent table) and neutral body model shipped with SMPL.
SMPL_KINTREE_PATH = os.path.join(SMPL_DATA_PATH, "kintree_table.pkl")
SMPL_MODEL_PATH = os.path.join(SMPL_DATA_PATH, "SMPL_NEUTRAL.pkl")
# Extra joint regressor used at training time.
JOINT_REGRESSOR_TRAIN_EXTRA = os.path.join(SMPL_DATA_PATH, 'J_regressor_extra.npy')

# Number of joint rotations per rotation convention — presumably excluding
# the global root orientation (TODO confirm against the body-model code).
ROT_CONVENTION_TO_ROT_NUMBER = {
    'legacy': 23,
    'no_hands': 21,
    'full_hands': 51,
    'mitten_hands': 33,
}

# Supported SMPL genders.
GENDERS = ['neutral', 'male', 'female']
# Number of SMPL shape (beta) coefficients.
NUM_BETAS = 10
utils/dist_util.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helpers for distributed training.
3
+ """
4
+
5
+ import socket
6
+
7
+ import torch as th
8
+ import torch.distributed as dist
9
+
10
# Change this to reflect your cluster layout.
# The GPU for a given rank is (rank % GPUS_PER_NODE).
GPUS_PER_NODE = 8

# Number of times distributed setup may be retried.
# NOTE(review): not referenced anywhere in this file — likely a leftover from
# the original MPI-based setup; confirm before removing.
SETUP_RETRY_COUNT = 3

# Module-level device selection: written by setup_dist(), read by dev().
used_device = 0
17
+
18
def setup_dist(device=0):
    """
    Setup a distributed process group.

    Currently this only records *device* in the module-level ``used_device``
    (read back by ``dev()``) and returns early if torch.distributed is
    already initialized — the original MPI-based group initialization is
    kept below, commented out, for reference.
    """
    global used_device
    used_device = device
    if dist.is_initialized():
        return
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(device) # f"{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}"

    # comm = MPI.COMM_WORLD
    # backend = "gloo" if not th.cuda.is_available() else "nccl"

    # if backend == "gloo":
    #     hostname = "localhost"
    # else:
    #     hostname = socket.gethostbyname(socket.getfqdn())
    # os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0)
    # os.environ["RANK"] = str(comm.rank)
    # os.environ["WORLD_SIZE"] = str(comm.size)

    # port = comm.bcast(_find_free_port(), root=used_device)
    # os.environ["MASTER_PORT"] = str(port)
    # dist.init_process_group(backend=backend, init_method="env://")
42
+
43
+
44
def dev():
    """
    Return the torch device to use: the CUDA device selected via
    ``setup_dist()`` when CUDA is available, otherwise the CPU.
    """
    cuda_selected = th.cuda.is_available() and used_device >= 0
    return th.device(f"cuda:{used_device}") if cuda_selected else th.device("cpu")
52
+
53
+
54
def load_state_dict(path, **kwargs):
    """
    Load a PyTorch checkpoint from *path*.

    Thin wrapper around ``th.load``; the name is kept from the original
    MPI version that avoided redundant fetches across ranks.
    """
    return th.load(path, **kwargs)
59
+
60
+
61
def sync_params(params):
    """
    Broadcast every tensor in *params* from rank 0 to all other ranks,
    synchronizing model parameters across the process group.
    """
    with th.no_grad():
        for tensor in params:
            dist.broadcast(tensor, 0)
68
+
69
+
70
+ def _find_free_port():
71
+ try:
72
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
73
+ s.bind(("", 0))
74
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
75
+ return s.getsockname()[1]
76
+ finally:
77
+ s.close()
utils/fixseed.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import random
4
+
5
+
6
def fixseed(seed):
    """Seed all RNGs (python random, numpy, torch) with *seed* and disable
    the nondeterministic cuDNN autotuner, for reproducible runs."""
    torch.backends.cudnn.benchmark = False
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
11
+
12
+
13
+ # SEED = 10
14
+ # EVALSEED = 0
15
+ # # Provoc warning: not fully functionnal yet
16
+ # # torch.set_deterministic(True)
17
+ # torch.backends.cudnn.benchmark = False
18
+ # fixseed(SEED)
utils/loss_util.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusion.nn import mean_flat, sum_flat
2
+ import torch
3
+ import numpy as np
4
+
5
def angle_l2(angle1, angle2):
    """Element-wise squared angular difference, with the difference wrapped
    into [-pi/2, pi/2) so periodic angles compare on the shortest arc."""
    half_pi = torch.pi / 2
    wrapped = (angle1 - angle2 + half_pi) % torch.pi - half_pi
    return wrapped ** 2
9
+
10
def diff_l2(a, b):
    """Element-wise squared difference between *a* and *b*."""
    diff = a - b
    return diff * diff
12
+
13
def masked_l2(a, b, mask, loss_fn=diff_l2, epsilon=1e-8, entries_norm=True):
    """
    Mean of ``loss_fn(a, b)`` over the unmasked elements.

    Assumes ``a.shape == b.shape == (bs, J, Jdim, seqlen)`` and
    ``mask.shape == (bs, 1, 1, seqlen)``; returns one value per batch item.
    """
    per_elem = loss_fn(a, b)
    # Sum of the loss over unmasked elements only.
    masked_sum = sum_flat(per_elem * mask.float())
    entries_per_frame = a.shape[1]
    if len(a.shape) > 3:
        entries_per_frame *= a.shape[2]
    denom = sum_flat(mask)
    if entries_norm:
        # When the mask is per frame (it does not already count the entries
        # in each frame) this normalization is needed; otherwise pass False.
        denom = denom * entries_per_frame
    # epsilon avoids division by zero when everything is masked out.
    return masked_sum / (denom + epsilon)
32
+
33
+
34
def masked_goal_l2(pred_goal, ref_goal, cond, all_goal_joint_names):
    """
    Goal-conditioning loss: masked L2 on the supervised joint locations plus
    a wrapped-angle L2 on the heading channel of the trajectory entry.

    pred_goal/ref_goal: assumed [bs, n_goal_joints + 1, loc_dim], where the
    last row is the 'traj' entry and its first channel is the heading angle
    — TODO confirm against the caller.
    cond['target_joint_names']: per-sample list of joint names to supervise.
    cond['is_heading']: per-sample mask enabling the heading term.
    """
    # Map each sample's joint names to row indices (with 'traj' appended).
    all_goal_joint_names_w_traj = np.append(all_goal_joint_names, 'traj')
    target_joint_idx = [[np.where(all_goal_joint_names_w_traj == j)[0][0] for j in sample_joints] for sample_joints in cond['target_joint_names']]
    # Per-sample boolean mask over all rows except the heading-only last row.
    loc_mask = torch.zeros_like(pred_goal[:,:-1], dtype=torch.bool)
    for sample_idx in range(loc_mask.shape[0]):
        loc_mask[sample_idx, target_joint_idx[sample_idx]] = True
    loc_mask[:, -1, 1] = False # vertical joint of 'traj' is always masked out
    loc_loss = masked_l2(pred_goal[:,:-1], ref_goal[:,:-1], loc_mask, entries_norm=False)

    # Heading supervised only where cond['is_heading'] is set, with the
    # angle-wrapped L2 so e.g. -pi and pi compare as equal.
    heading_loss = masked_l2(pred_goal[:,-1:, :1], ref_goal[:,-1:, :1], cond['is_heading'].unsqueeze(1).unsqueeze(1), loss_fn=angle_l2, entries_norm=False)

    loss = loc_loss + heading_loss
    return loss
utils/misc.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class WeightedSum(nn.Module):
    """Learnable weighted sum over the rows of a 2-D input."""

    def __init__(self, num_rows):
        super().__init__()
        # One learnable weight per input row.
        self.weights = nn.Parameter(torch.randn(num_rows))

    def forward(self, x):
        # Normalize so the weights sum to one (softmax is an alternative).
        normalized = self.weights / self.weights.sum()
        return torch.matmul(normalized, x)
17
+
18
+
19
def wrapped_getattr(self, name, default=None, wrapped_member_name='model'):
    '''Attribute lookup for wrappers of model classes (e.g.
    ClassifierFreeSampleModel): resolve *name* on the wrapper itself first,
    then fall back to the wrapped member, returning *default* if absent.'''
    if not isinstance(self, torch.nn.Module):
        # Easy case: self is a plain object — go straight to the wrapped member.
        wrapped = getattr(self, wrapped_member_name)
        return getattr(wrapped, name, default)
    # nn.Module keeps attributes in _parameters/_buffers/_modules, so we must
    # go through nn.Module.__getattr__ explicitly; a plain getattr on self in
    # a wrapper that overrides __getattr__ could recurse forever.
    try:
        return torch.nn.Module.__getattr__(self, name)
    except AttributeError:
        wrapped = torch.nn.Module.__getattr__(self, wrapped_member_name)
        return getattr(wrapped, name, default)
36
+
37
+
38
def to_numpy(tensor):
    """Convert a torch tensor to a numpy array; numpy arrays pass through.

    Raises ValueError for any other input type.
    """
    if torch.is_tensor(tensor):
        return tensor.cpu().numpy()
    if type(tensor).__module__ == 'numpy':
        return tensor
    raise ValueError("Cannot convert {} to numpy array".format(
        type(tensor)))
45
+
46
+
47
def to_torch(ndarray):
    """Convert a numpy array to a torch tensor; torch tensors pass through.

    Raises ValueError for any other input type.
    """
    if type(ndarray).__module__ == 'numpy':
        return torch.from_numpy(ndarray)
    if torch.is_tensor(ndarray):
        return ndarray
    raise ValueError("Cannot convert {} to torch tensor".format(
        type(ndarray)))
54
+
55
+
56
def cleanexit():
    """Terminate the process with status 0, forcing a hard exit if something
    (e.g. a framework) traps the SystemExit raised by sys.exit."""
    import os
    import sys
    try:
        sys.exit(0)
    except SystemExit:
        # os._exit skips cleanup handlers and exits immediately.
        os._exit(0)
63
+
64
def load_model_wo_clip(model, state_dict):
    """Load *state_dict* into *model*, tolerating (only) missing CLIP keys.

    The frozen CLIP encoder is not saved in checkpoints, so 'clip_model.*'
    entries are the only keys allowed to be missing; any unexpected key is
    an error.
    """
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    assert not unexpected
    assert all(k.startswith('clip_model.') for k in missing)
68
+
69
def freeze_joints(x, joints_to_freeze):
    # Freezes selected joint *rotations* as they appear in the first frame
    # x [bs, [root+n_joints], joint_dim(6), seqlen]
    out = x.detach().clone()
    first_frame = out[:, joints_to_freeze, :, :1]
    # Broadcasting copies the first frame across the whole sequence.
    out[:, joints_to_freeze, :, :] = first_frame
    return out
utils/model_util.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from model.mdm import MDM
3
+ from diffusion import gaussian_diffusion as gd
4
+ from diffusion.respace import SpacedDiffusion, space_timesteps
5
+ from utils.parser_util import get_cond_mode
6
+ from data_loaders.humanml_utils import HML_EE_JOINT_NAMES
7
+
8
def load_model_wo_clip(model, state_dict):
    """Load a checkpoint into *model*, ignoring CLIP weights and the fixed
    positional-encoding buffers.

    The ``sequence_pos_encoder.pe`` buffers are deterministic (not learned)
    and their length can differ between checkpoint and model, so they are
    dropped before loading to avoid size-mismatch errors.  Missing keys must
    all be CLIP or pos-encoding entries; unexpected keys are an error.
    """
    # pop() instead of del: checkpoints from other model variants may not
    # contain these buffers at all, and del would raise KeyError for them.
    state_dict.pop('sequence_pos_encoder.pe', None)
    state_dict.pop('embed_timestep.sequence_pos_encoder.pe', None)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    assert len(unexpected_keys) == 0
    assert all([k.startswith('clip_model.') or 'sequence_pos_encoder' in k for k in missing_keys])
16
+
17
+
18
def create_model_and_diffusion(args, data):
    """Build the MDM network and its Gaussian diffusion wrapper from parsed
    CLI *args* and the dataset loader *data* (used to infer model dims)."""
    model = MDM(**get_model_args(args, data))
    diffusion = create_gaussian_diffusion(args)
    return model, diffusion
22
+
23
+
24
def get_model_args(args, data):
    """
    Assemble the keyword arguments for the MDM constructor from the parsed
    CLI *args* and the dataset loader *data*.

    Dataset-specific input dimensions are selected here; missing attributes
    on *args* (older checkpoints) fall back to defaults.
    """
    # Fixed defaults.
    clip_version = 'ViT-B/32'
    action_emb = 'tensor'
    cond_mode = get_cond_mode(args)
    num_actions = getattr(data.dataset, 'num_actions', 1)

    # SMPL defaults, overridden per dataset below.
    data_rep, njoints, nfeats = 'rot6d', 25, 6
    all_goal_joint_names = []
    if args.dataset == 'humanml':
        data_rep, njoints, nfeats = 'hml_vec', 263, 1
        all_goal_joint_names = ['pelvis'] + HML_EE_JOINT_NAMES
    elif args.dataset == 'kit':
        data_rep, njoints, nfeats = 'hml_vec', 251, 1

    # Compatibility with old models that predate prefix completion.
    if not hasattr(args, 'pred_len'):
        args.pred_len = 0
        args.context_len = 0

    # Attributes that may be absent on args saved by older code.
    emb_policy = args.__dict__.get('emb_policy', 'add')
    multi_target_cond = args.__dict__.get('multi_target_cond', False)
    multi_encoder_type = args.__dict__.get('multi_encoder_type', 'multi')
    target_enc_layers = args.__dict__.get('target_enc_layers', 1)

    return {'modeltype': '', 'njoints': njoints, 'nfeats': nfeats, 'num_actions': num_actions,
            'translation': True, 'pose_rep': 'rot6d', 'glob': True, 'glob_rot': True,
            'latent_dim': args.latent_dim, 'ff_size': 1024, 'num_layers': args.layers, 'num_heads': 4,
            'dropout': 0.1, 'activation': "gelu", 'data_rep': data_rep, 'cond_mode': cond_mode,
            'cond_mask_prob': args.cond_mask_prob, 'action_emb': action_emb, 'arch': args.arch,
            'emb_trans_dec': args.emb_trans_dec, 'clip_version': clip_version, 'dataset': args.dataset,
            'text_encoder_type': args.text_encoder_type,
            'pos_embed_max_len': args.pos_embed_max_len, 'mask_frames': args.mask_frames,
            'pred_len': args.pred_len, 'context_len': args.context_len, 'emb_policy': emb_policy,
            'all_goal_joint_names': all_goal_joint_names, 'multi_target_cond': multi_target_cond,
            'multi_encoder_type': multi_encoder_type, 'target_enc_layers': target_enc_layers,
            }
72
+
73
+
74
+
75
def create_gaussian_diffusion(args):
    """
    Build a SpacedDiffusion process from CLI *args*.

    Fixed choices: the model predicts x0 (not epsilon), sigmas are fixed
    (not learned), MSE loss, and no timestep respacing during training.
    """
    steps = args.diffusion_steps
    predict_xstart = True  # we always predict x_start (a.k.a. x0), that's our deal!
    learn_sigma = False
    rescale_timesteps = False
    scale_beta = 1.  # no scaling
    timestep_respacing = ''  # can be used for ddim sampling, we don't use it.

    betas = gd.get_named_beta_schedule(args.noise_schedule, steps, scale_beta)

    if not timestep_respacing:
        timestep_respacing = [steps]

    # Older arg sets may predate the target-location loss.
    lambda_target_loc = getattr(args, 'lambda_target_loc', 0.)

    mean_type = gd.ModelMeanType.START_X if predict_xstart else gd.ModelMeanType.EPSILON
    if learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    elif args.sigma_small:
        var_type = gd.ModelVarType.FIXED_SMALL
    else:
        var_type = gd.ModelVarType.FIXED_LARGE

    return SpacedDiffusion(
        use_timesteps=space_timesteps(steps, timestep_respacing),
        betas=betas,
        model_mean_type=mean_type,
        model_var_type=var_type,
        loss_type=gd.LossType.MSE,
        rescale_timesteps=rescale_timesteps,
        lambda_vel=args.lambda_vel,
        lambda_rcxyz=args.lambda_rcxyz,
        lambda_fc=args.lambda_fc,
        lambda_target_loc=lambda_target_loc,
    )
117
+
118
def load_saved_model(model, model_path, use_avg: bool=False):  # use_avg_model
    """
    Load a checkpoint file into *model* and return it.

    Prefers the EMA-averaged weights ('model_avg') when *use_avg* is set and
    the checkpoint contains them; otherwise falls back to 'model', or to the
    raw checkpoint dict for legacy files.
    """
    checkpoint = torch.load(model_path, map_location='cpu')
    if use_avg and 'model_avg' in checkpoint:
        print('loading avg model')
        state_dict = checkpoint['model_avg']
    elif 'model' in checkpoint:
        print('loading model without avg')
        state_dict = checkpoint['model']
    else:
        print('checkpoint has no avg model, loading as usual.')
        state_dict = checkpoint
    load_model_wo_clip(model, state_dict)
    return model
utils/parser_util.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+ import argparse
3
+ import os
4
+ import json
5
+
6
+
7
def parse_and_load_from_model(parser):
    """
    Parse CLI args and restore model-owned arguments from the checkpoint.

    Dataset/model/diffusion options are registered so they parse, but any
    value the user passes for them is overwritten by the checkpoint's
    args.json — do not try to set them from the command line.
    """
    add_data_options(parser)
    add_model_options(parser)
    add_diffusion_options(parser)
    args = parser.parse_args()

    # Every arg registered under these groups is owned by the checkpoint.
    args_to_overwrite = [
        name
        for group_name in ['dataset', 'model', 'diffusion']
        for name in get_args_per_group_name(parser, args, group_name)
    ]

    if args.model_path != '':  # if not using external results file
        args = load_args_from_model(args, args_to_overwrite)

    # With no condition masking, classifier-free guidance degenerates to 1.
    if args.cond_mask_prob == 0:
        args.guidance_param = 1

    return apply_rules(args)
26
+
27
def load_args_from_model(args, args_to_overwrite):
    """
    Overwrite the *args* entries listed in *args_to_overwrite* with the
    values stored in the checkpoint's args.json (next to the model file).
    """
    model_path = get_model_path_from_args()
    args_path = os.path.join(os.path.dirname(model_path), 'args.json')
    assert os.path.exists(args_path), 'Arguments json file was not found!'
    with open(args_path, 'r') as fr:
        model_args = json.load(fr)

    for a in args_to_overwrite:
        if a in model_args.keys():
            setattr(args, a, model_args[a])

        # Backward compatibility: old checkpoints lack 'unconstrained';
        # derive it from 'cond_mode' instead.
        # NOTE(review): this elif fires for *every* arg missing from the
        # json (re-setting the same value each time), not only for
        # 'unconstrained' — looks intentional but fragile; confirm.
        elif 'cond_mode' in model_args: # backward compitability
            unconstrained = (model_args['cond_mode'] == 'no_cond')
            setattr(args, 'unconstrained', unconstrained)

        else:
            print('Warning: was not able to load [{}], using default value [{}] instead.'.format(a, args.__dict__[a]))
    return args
45
+
46
def apply_rules(args):
    """Post-parse argument fix-ups shared by training and sampling."""
    # Prefix completion: an unspecified pred_len defaults to context_len.
    if args.pred_len == 0:
        args.pred_len = args.context_len
    # A positive target-location loss implies target conditioning.
    if args.lambda_target_loc > 0.:
        args.multi_target_cond = True
    return args
55
+
56
+
57
def get_args_per_group_name(parser, args, group_name):
    """Return the dest names of all options registered under the argparse
    group titled *group_name*.

    Raises ValueError if the parser has no such group.
    """
    for group in parser._action_groups:
        if group.title == group_name:
            # dest names in registration order.
            return [a.dest for a in group._group_actions]
    # Bug fix: the original *returned* a ValueError instance here instead of
    # raising it, so callers silently received an exception object.
    raise ValueError('group_name was not found.')
63
+
64
def get_model_path_from_args():
    """Peek at sys.argv for --model_path without touching the main parser.

    Returns the value of --model_path (None if the flag is absent).
    Raises ValueError if argparse fails to parse the command line.
    """
    dummy_parser = ArgumentParser()
    dummy_parser.add_argument('--model_path')
    try:
        dummy_args, _ = dummy_parser.parse_known_args()
    # Bug fix: the original used a bare `except:`, which also swallowed
    # KeyboardInterrupt and hid unrelated errors.  argparse signals parse
    # failures by raising SystemExit — translate only that.
    except SystemExit:
        raise ValueError('model_path argument must be specified.')
    return dummy_args.model_path
72
+
73
+
74
def add_base_options(parser):
    """Register run-environment arguments shared by all entry points."""
    group = parser.add_argument_group('base')
    # NOTE(review): type=bool parses any non-empty string (e.g. "--cuda False")
    # as True; use action='store_true' or a str2bool helper if CLI override
    # of these flags is ever needed.
    group.add_argument("--cuda", default=True, type=bool, help="Use cuda device, otherwise use CPU.")
    group.add_argument("--device", default=0, type=int, help="Device id to use.")
    group.add_argument("--seed", default=10, type=int, help="For fixing random seed.")
    group.add_argument("--batch_size", default=64, type=int, help="Batch size during training.")
    group.add_argument("--train_platform_type", default='NoPlatform', choices=['NoPlatform', 'ClearmlPlatform', 'TensorboardPlatform', 'WandBPlatform'], type=str,
                       help="Choose platform to log results. NoPlatform means no logging.")
    group.add_argument("--external_mode", default=False, type=bool, help="For backward cometability, do not change or delete.")
83
+
84
+
85
def add_diffusion_options(parser):
    """Register the diffusion-process hyperparameter arguments."""
    group = parser.add_argument_group('diffusion')
    group.add_argument("--noise_schedule", default='cosine', choices=['linear', 'cosine'], type=str,
                       help="Noise schedule type")
    group.add_argument("--diffusion_steps", default=1000, type=int,
                       help="Number of diffusion steps (denoted T in the paper)")
    # NOTE(review): type=bool parses any non-empty string as True; fine for
    # the default but "--sigma_small False" would still yield True.
    group.add_argument("--sigma_small", default=True, type=bool, help="Use smaller sigma values.")
92
+
93
+
94
def add_model_options(parser):
    """Register the MDM architecture and loss-weight arguments.

    Everything in this group is restored from the checkpoint's args.json
    when sampling (see parse_and_load_from_model).
    """
    group = parser.add_argument_group('model')
    group.add_argument("--arch", default='trans_enc',
                       choices=['trans_enc', 'trans_dec', 'gru'], type=str,
                       help="Architecture types as reported in the paper.")
    group.add_argument("--text_encoder_type", default='clip',
                       choices=['clip', 'bert'], type=str, help="Text encoder type.")
    group.add_argument("--emb_trans_dec", action='store_true',
                       help="For trans_dec architecture only, if true, will inject condition as a class token"
                            " (in addition to cross-attention).")
    group.add_argument("--layers", default=8, type=int,
                       help="Number of layers.")
    group.add_argument("--latent_dim", default=512, type=int,
                       help="Transformer/GRU width.")
    group.add_argument("--cond_mask_prob", default=.1, type=float,
                       help="The probability of masking the condition during training."
                            " For classifier-free guidance learning.")
    group.add_argument("--mask_frames", action='store_true', help="If true, will fix Rotem's bug and mask invalid frames.")
    # Auxiliary loss weights (0.0 disables the term).
    group.add_argument("--lambda_rcxyz", default=0.0, type=float, help="Joint positions loss.")
    group.add_argument("--lambda_vel", default=0.0, type=float, help="Joint velocity loss.")
    group.add_argument("--lambda_fc", default=0.0, type=float, help="Foot contact loss.")
    group.add_argument("--lambda_target_loc", default=0.0, type=float, help="For HumanML only, when . L2 with target location.")
    group.add_argument("--unconstrained", action='store_true',
                       help="Model is trained unconditionally. That is, it is constrained by neither text nor action. "
                            "Currently tested on HumanAct12 only.")
    group.add_argument("--pos_embed_max_len", default=5000, type=int,
                       help="Pose embedding max length.")
    group.add_argument("--use_ema", action='store_true',
                       help="If True, will use EMA model averaging.")

    # Multi-target (goal-joint) conditioning options.
    group.add_argument("--multi_target_cond", action='store_true', help="If true, enable multi-target conditioning (aka Sigal's model).")
    group.add_argument("--multi_encoder_type", default='single', choices=['single', 'multi', 'split'], type=str, help="Specifies the encoder type to be used for the multi joint condition.")
    group.add_argument("--target_enc_layers", default=1, type=int, help="Num target encoder layers")

    # Prefix completion model
    group.add_argument("--context_len", default=0, type=int, help="If larger than 0, will do prefix completion.")
    group.add_argument("--pred_len", default=0, type=int, help="If context_len larger than 0, will do prefix completion. If pred_len will not be specified - will use the same length as context_len")
133
+
134
+
135
+
136
+
137
def add_data_options(parser):
    """Register dataset-selection arguments under the 'dataset' group."""
    data_group = parser.add_argument_group('dataset')
    data_group.add_argument(
        "--dataset", default='humanml',
        choices=['humanml', 'kit', 'humanact12', 'uestc'], type=str,
        help="Dataset name (choose from list).")
    data_group.add_argument(
        "--data_dir", default="", type=str,
        help="If empty, will use defaults according to the specified dataset.")
143
+
144
+
145
def add_training_options(parser):
    """Register optimizer, schedule, evaluation and generation-during-training
    arguments used only by the training entry point."""
    group = parser.add_argument_group('training')
    group.add_argument("--save_dir", required=True, type=str,
                       help="Path to save checkpoints and results.")
    group.add_argument("--overwrite", action='store_true',
                       help="If True, will enable to use an already existing save_dir.")
    group.add_argument("--lr", default=1e-4, type=float, help="Learning rate.")
    group.add_argument("--weight_decay", default=0.0, type=float, help="Optimizer weight decay.")
    group.add_argument("--lr_anneal_steps", default=0, type=int, help="Number of learning rate anneal steps.")
    group.add_argument("--eval_batch_size", default=32, type=int,
                       help="Batch size during evaluation loop. Do not change this unless you know what you are doing. "
                            "T2m precision calculation is based on fixed batch size 32.")
    group.add_argument("--eval_split", default='test', choices=['val', 'test'], type=str,
                       help="Which split to evaluate on during training.")
    group.add_argument("--eval_during_training", action='store_true',
                       help="If True, will run evaluation during training.")
    group.add_argument("--eval_rep_times", default=3, type=int,
                       help="Number of repetitions for evaluation loop during training.")
    group.add_argument("--eval_num_samples", default=1_000, type=int,
                       help="If -1, will use all samples in the specified split.")
    group.add_argument("--log_interval", default=1_000, type=int,
                       help="Log losses each N steps")
    group.add_argument("--save_interval", default=50_000, type=int,
                       help="Save checkpoints and run evaluation each N steps")
    group.add_argument("--num_steps", default=600_000, type=int,
                       help="Training will stop after the specified number of steps.")
    group.add_argument("--num_frames", default=60, type=int,
                       help="Limit for the maximal number of frames. In HumanML3D and KIT this field is ignored.")
    group.add_argument("--resume_checkpoint", default="", type=str,
                       help="If not empty, will start from the specified checkpoint (path to model###.pt file).")

    # Sampling motions while training (for visual monitoring).
    group.add_argument("--gen_during_training", action='store_true',
                       help="If True, will generate motions during training, on each save interval.")
    group.add_argument("--gen_num_samples", default=3, type=int,
                       help="Number of samples to sample while generating")
    group.add_argument("--gen_num_repetitions", default=2, type=int,
                       help="Number of repetitions, per sample (text prompt/action)")
    group.add_argument("--gen_guidance_param", default=2.5, type=float,
                       help="For classifier-free sampling - specifies the s parameter, as defined in the paper.")

    # EMA / optimizer extras.
    group.add_argument("--avg_model_beta", default=0.9999, type=float, help="Average model beta (for EMA).")
    group.add_argument("--adam_beta2", default=0.999, type=float, help="Adam beta2.")

    group.add_argument("--target_joint_names", default='DIMP_FINAL', type=str, help="Force single joint configuration by specifing the joints (coma separated). If None - will use the random mode for all end effectors.")
    group.add_argument("--autoregressive", action='store_true', help="If true, and we use a prefix model will generate motions in an autoregressive loop.")
    group.add_argument("--autoregressive_include_prefix", action='store_true', help="If true, include the init prefix in the output, otherwise, will drop it.")
    group.add_argument("--autoregressive_init", default='data', type=str, choices=['data', 'isaac'],
                       help="Sets the source of the init frames, either from the dataset or isaac init poses.")
193
+
194
+
195
def add_sampling_options(parser):
    """Register checkpoint/output/guidance arguments shared by all sampling
    entry points (generate, edit, evaluate)."""
    sampling_group = parser.add_argument_group('sampling')
    sampling_group.add_argument(
        "--model_path", required=True, type=str,
        help="Path to model####.pt file to be sampled.")
    sampling_group.add_argument(
        "--output_dir", default='', type=str,
        help="Path to results dir (auto created by the script). "
             "If empty, will create dir in parallel to checkpoint.")
    sampling_group.add_argument(
        "--num_samples", default=6, type=int,
        help="Maximal number of prompts to sample, "
             "if loading dataset from file, this field will be ignored.")
    sampling_group.add_argument(
        "--num_repetitions", default=3, type=int,
        help="Number of repetitions, per sample (text prompt/action)")
    sampling_group.add_argument(
        "--guidance_param", default=2.5, type=float,
        help="For classifier-free sampling - specifies the s parameter, as defined in the paper.")

    sampling_group.add_argument(
        "--autoregressive", action='store_true',
        help="If true, and we use a prefix model will generate motions in an autoregressive loop.")
    sampling_group.add_argument(
        "--autoregressive_include_prefix", action='store_true',
        help="If true, include the init prefix in the output, otherwise, will drop it.")
    sampling_group.add_argument(
        "--autoregressive_init", default='data', type=str, choices=['data', 'isaac'],
        help="Sets the source of the init frames, either from the dataset or isaac init poses.")
214
+
215
def add_generate_options(parser):
    """Register the prompt-source arguments for free-form generation.

    Text options apply to text-conditioned models, action options to
    action-conditioned ones; generate_args() enforces the split.
    """
    group = parser.add_argument_group('generate')
    group.add_argument("--motion_length", default=6.0, type=float,
                       help="The length of the sampled motion [in seconds]. "
                            "Maximum is 9.8 for HumanML3D (text-to-motion), and 2.0 for HumanAct12 (action-to-motion)")
    group.add_argument("--input_text", default='', type=str,
                       help="Path to a text file lists text prompts to be synthesized. If empty, will take text prompts from dataset.")
    group.add_argument("--dynamic_text_path", default='', type=str,
                       help="For the autoregressive mode only! Path to a text file lists text prompts to be synthesized. If empty, will take text prompts from dataset.")
    group.add_argument("--action_file", default='', type=str,
                       help="Path to a text file that lists names of actions to be synthesized. Names must be a subset of dataset/uestc/info/action_classes.txt if sampling from uestc, "
                            "or a subset of [warm_up,walk,run,jump,drink,lift_dumbbell,sit,eat,turn steering wheel,phone,boxing,throw] if sampling from humanact12. "
                            "If no file is specified, will take action names from dataset.")
    group.add_argument("--text_prompt", default='', type=str,
                       help="A text prompt to be generated. If empty, will take text prompts from dataset.")
    group.add_argument("--action_name", default='', type=str,
                       help="An action name to be generated. If empty, will take text prompts from dataset.")
    group.add_argument("--target_joint_names", default='DIMP_FINAL', type=str, help="Force single joint configuration by specifing the joints (coma separated). If None - will use the random mode for all end effectors.")
233
+
234
+
235
def add_edit_options(parser):
    """Register arguments controlling motion editing (in-betweening or
    upper-body replacement)."""
    edit_group = parser.add_argument_group('edit')
    edit_group.add_argument(
        "--edit_mode", default='in_between', choices=['in_between', 'upper_body'], type=str,
        help="Defines which parts of the input motion will be edited.\n"
             "(1) in_between - suffix and prefix motion taken from input motion, "
             "middle motion is generated.\n"
             "(2) upper_body - lower body joints taken from input motion, "
             "upper body is generated.")
    edit_group.add_argument(
        "--text_condition", default='', type=str,
        help="Editing will be conditioned on this text prompt. "
             "If empty, will perform unconditioned editing.")
    edit_group.add_argument(
        "--prefix_end", default=0.25, type=float,
        help="For in_between editing - Defines the end of input prefix (ratio from all frames).")
    edit_group.add_argument(
        "--suffix_start", default=0.75, type=float,
        help="For in_between editing - Defines the start of input suffix (ratio from all frames).")
250
+
251
+
252
def add_evaluation_options(parser):
    """Register arguments for the evaluation entry point."""
    group = parser.add_argument_group('eval')
    group.add_argument("--model_path", required=True, type=str,
                       help="Path to model####.pt file to be sampled.")
    group.add_argument("--eval_mode", default='wo_mm', choices=['wo_mm', 'mm_short', 'debug', 'full'], type=str,
                       help="wo_mm (t2m only) - 20 repetitions without multi-modality metric; "
                            "mm_short (t2m only) - 5 repetitions with multi-modality metric; "
                            "debug - short run, less accurate results."
                            "full (a2m only) - 20 repetitions.")
    group.add_argument("--autoregressive", action='store_true', help="If true, and we use a prefix model will generate motions in an autoregressive loop.")
    group.add_argument("--autoregressive_include_prefix", action='store_true', help="If true, include the init prefix in the output, otherwise, will drop it.")
    group.add_argument("--autoregressive_init", default='data', type=str, choices=['data', 'isaac'],
                       help="Sets the source of the init frames, either from the dataset or isaac init poses.")
    group.add_argument("--guidance_param", default=2.5, type=float,
                       help="For classifier-free sampling - specifies the s parameter, as defined in the paper.")
267
+
268
+
269
def get_cond_mode(args):
    """Infer the conditioning signal from parsed args: none when
    unconstrained, text for the T2M datasets, action otherwise."""
    if args.unconstrained:
        return 'no_cond'
    if args.dataset in ['kit', 'humanml']:
        return 'text'
    return 'action'
277
+
278
+
279
def train_args():
    """Parse the full training CLI (base/data/model/diffusion/training
    groups) and apply the cross-argument rules."""
    parser = ArgumentParser()
    add_base_options(parser)
    add_data_options(parser)
    add_model_options(parser)
    add_diffusion_options(parser)
    add_training_options(parser)
    return apply_rules(parser.parse_args())
287
+
288
+
289
def generate_args():
    """
    Parse CLI args for generation: user-specified base/sampling/generate
    options, with model/data/diffusion options restored from the checkpoint.
    Validates that text prompts are only used with text-conditioned models
    and action prompts only with action-conditioned ones.
    """
    parser = ArgumentParser()
    # args specified by the user: (all other will be loaded from the model)
    add_base_options(parser)
    add_sampling_options(parser)
    add_generate_options(parser)
    args = parse_and_load_from_model(parser)
    cond_mode = get_cond_mode(args)

    if (args.input_text or args.text_prompt) and cond_mode != 'text':
        raise Exception('Arguments input_text and text_prompt should not be used for an action condition. Please use action_file or action_name.')
    elif (args.action_file or args.action_name) and cond_mode != 'action':
        raise Exception('Arguments action_file and action_name should not be used for a text condition. Please use input_text or text_prompt.')

    return args
304
+
305
+
306
def edit_args():
    """Parse CLI args for motion editing; model-owned options are restored
    from the checkpoint's args.json."""
    parser = ArgumentParser()
    # args specified by the user: (all other will be loaded from the model)
    add_base_options(parser)
    add_sampling_options(parser)
    add_edit_options(parser)
    return parse_and_load_from_model(parser)
313
+
314
+
315
def evaluation_parser():
    """Parse CLI args for evaluation; model-owned options are restored from
    the checkpoint's args.json."""
    parser = ArgumentParser()
    # args specified by the user: (all other will be loaded from the model)
    add_base_options(parser)
    add_evaluation_options(parser)
    return parse_and_load_from_model(parser)
utils/rotation_conversions.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This code is based on https://github.com/Mathux/ACTOR.git
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
3
+ # Check PYTORCH3D_LICENCE before use
4
+
5
+ import functools
6
+ from typing import Optional
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+
12
+ """
13
+ The transformation matrices returned from the functions in this file assume
14
+ the points on which the transformation will be applied are column vectors.
15
+ i.e. the R matrix is structured as
16
+
17
+ R = [
18
+ [Rxx, Rxy, Rxz],
19
+ [Ryx, Ryy, Ryz],
20
+ [Rzx, Rzy, Rzz],
21
+ ] # (3, 3)
22
+
23
+ This matrix can be applied to column vectors by post multiplication
24
+ by the points e.g.
25
+
26
+ points = [[0], [1], [2]] # (3 x 1) xyz coordinates of a point
27
+ transformed_points = R * points
28
+
29
+ To apply the same matrix to points which are row vectors, the R matrix
30
+ can be transposed and pre multiplied by the points:
31
+
32
+ e.g.
33
+ points = [[0, 1, 2]] # (1 x 3) xyz coordinates of a point
34
+ transformed_points = points * R.transpose(1, 0)
35
+ """
36
+
37
+
38
def quaternion_to_matrix(quaternions):
    """
    Convert rotations given as quaternions to rotation matrices.

    Args:
        quaternions: quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    w, x, y, z = torch.unbind(quaternions, -1)
    # scale 2 / |q|^2 makes the formula valid for non-unit quaternions too
    scale = 2.0 / (quaternions * quaternions).sum(-1)

    entries = (
        1 - scale * (y * y + z * z),
        scale * (x * y - z * w),
        scale * (x * z + y * w),
        scale * (x * y + z * w),
        1 - scale * (x * x + z * z),
        scale * (y * z - x * w),
        scale * (x * z - y * w),
        scale * (y * z + x * w),
        1 - scale * (x * x + y * y),
    )
    return torch.stack(entries, -1).reshape(quaternions.shape[:-1] + (3, 3))
67
+
68
+
69
+ def _copysign(a, b):
70
+ """
71
+ Return a tensor where each element has the absolute value taken from the,
72
+ corresponding element of a, with sign taken from the corresponding
73
+ element of b. This is like the standard copysign floating-point operation,
74
+ but is not careful about negative 0 and NaN.
75
+
76
+ Args:
77
+ a: source tensor.
78
+ b: tensor whose signs will be used, of the same shape as a.
79
+
80
+ Returns:
81
+ Tensor of the same shape as a with the signs of b.
82
+ """
83
+ signs_differ = (a < 0) != (b < 0)
84
+ return torch.where(signs_differ, -a, a)
85
+
86
+
87
+ def _sqrt_positive_part(x):
88
+ """
89
+ Returns torch.sqrt(torch.max(0, x))
90
+ but with a zero subgradient where x is 0.
91
+ """
92
+ ret = torch.zeros_like(x)
93
+ positive_mask = x > 0
94
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
95
+ return ret
96
+
97
+
98
def matrix_to_quaternion(matrix):
    """
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part first, as tensor of shape (..., 4).

    Raises:
        ValueError: if the trailing two dimensions are not (3, 3).
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        # BUGFIX: message previously printed a stray 'f' before the shape
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
    m00 = matrix[..., 0, 0]
    m11 = matrix[..., 1, 1]
    m22 = matrix[..., 2, 2]
    # component magnitudes from the diagonal (trace combinations)
    o0 = 0.5 * _sqrt_positive_part(1 + m00 + m11 + m22)
    x = 0.5 * _sqrt_positive_part(1 + m00 - m11 - m22)
    y = 0.5 * _sqrt_positive_part(1 - m00 + m11 - m22)
    z = 0.5 * _sqrt_positive_part(1 - m00 - m11 + m22)
    # signs recovered from the skew-symmetric part of the matrix
    o1 = _copysign(x, matrix[..., 2, 1] - matrix[..., 1, 2])
    o2 = _copysign(y, matrix[..., 0, 2] - matrix[..., 2, 0])
    o3 = _copysign(z, matrix[..., 1, 0] - matrix[..., 0, 1])
    return torch.stack((o0, o1, o2, o3), -1)
121
+
122
+
123
+ def _axis_angle_rotation(axis: str, angle):
124
+ """
125
+ Return the rotation matrices for one of the rotations about an axis
126
+ of which Euler angles describe, for each value of the angle given.
127
+
128
+ Args:
129
+ axis: Axis label "X" or "Y or "Z".
130
+ angle: any shape tensor of Euler angles in radians
131
+
132
+ Returns:
133
+ Rotation matrices as tensor of shape (..., 3, 3).
134
+ """
135
+
136
+ cos = torch.cos(angle)
137
+ sin = torch.sin(angle)
138
+ one = torch.ones_like(angle)
139
+ zero = torch.zeros_like(angle)
140
+
141
+ if axis == "X":
142
+ R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos)
143
+ if axis == "Y":
144
+ R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos)
145
+ if axis == "Z":
146
+ R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one)
147
+
148
+ return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3))
149
+
150
+
151
def euler_angles_to_matrix(euler_angles, convention: str):
    """
    Convert rotations given as Euler angles in radians to rotation matrices.

    Args:
        euler_angles: Euler angles in radians as tensor of shape (..., 3).
        convention: Convention string of three uppercase letters from
            {"X", "Y", and "Z"}.

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    if euler_angles.dim() == 0 or euler_angles.shape[-1] != 3:
        raise ValueError("Invalid input euler angles.")
    if len(convention) != 3:
        raise ValueError("Convention must have 3 letters.")
    if convention[1] in (convention[0], convention[2]):
        raise ValueError(f"Invalid convention {convention}.")
    for letter in convention:
        if letter not in ("X", "Y", "Z"):
            raise ValueError(f"Invalid letter {letter} in convention string.")
    # compose the three single-axis rotations in convention order
    result = None
    for axis, angle in zip(convention, torch.unbind(euler_angles, -1)):
        rot = _axis_angle_rotation(axis, angle)
        result = rot if result is None else torch.matmul(result, rot)
    return result
174
+
175
+
176
def _angle_from_tan(
    axis: str, other_axis: str, data, horizontal: bool, tait_bryan: bool
):
    """
    Extract the first or third Euler angle from the two members of
    the matrix which are positive constant times its sine and cosine.

    Args:
        axis: Axis label "X" or "Y or "Z" for the angle we are finding.
        other_axis: Axis label "X" or "Y or "Z" for the middle axis in the
            convention.
        data: Rotation matrices as tensor of shape (..., 3, 3).
        horizontal: Whether we are looking for the angle for the third axis,
            which means the relevant entries are in the same row of the
            rotation matrix. If not, they are in the same column.
        tait_bryan: Whether the first and third axes in the convention differ.

    Returns:
        Euler Angles in radians for each matrix in dataset as a tensor
        of shape (...).
    """

    # indices of the two matrix entries holding +/- sin and cos for this axis
    i1, i2 = {"X": (2, 1), "Y": (0, 2), "Z": (1, 0)}[axis]
    if horizontal:
        # row case: the roles of the two entries are swapped
        i2, i1 = i1, i2
    # "even" = (axis, other_axis) is a cyclic pair (XY, YZ, ZX); its parity
    # decides which of the two sign conventions below applies
    even = (axis + other_axis) in ["XY", "YZ", "ZX"]
    if horizontal == even:
        return torch.atan2(data[..., i1], data[..., i2])
    if tait_bryan:
        return torch.atan2(-data[..., i2], data[..., i1])
    return torch.atan2(data[..., i2], -data[..., i1])
207
+
208
+
209
+ def _index_from_letter(letter: str):
210
+ if letter == "X":
211
+ return 0
212
+ if letter == "Y":
213
+ return 1
214
+ if letter == "Z":
215
+ return 2
216
+
217
+
218
def matrix_to_euler_angles(matrix, convention: str):
    """
    Convert rotations given as rotation matrices to Euler angles in radians.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).
        convention: Convention string of three uppercase letters.

    Returns:
        Euler angles in radians as tensor of shape (..., 3).

    Raises:
        ValueError: if the convention string or matrix shape is invalid.
    """
    if len(convention) != 3:
        raise ValueError("Convention must have 3 letters.")
    if convention[1] in (convention[0], convention[2]):
        raise ValueError(f"Invalid convention {convention}.")
    for letter in convention:
        if letter not in ("X", "Y", "Z"):
            raise ValueError(f"Invalid letter {letter} in convention string.")
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        # BUGFIX: message previously printed a stray 'f' before the shape
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
    i0 = _index_from_letter(convention[0])
    i2 = _index_from_letter(convention[2])
    # Tait-Bryan conventions (distinct first/third axes) read the middle
    # angle from a sine entry; proper Euler conventions from a cosine entry.
    tait_bryan = i0 != i2
    if tait_bryan:
        central_angle = torch.asin(
            matrix[..., i0, i2] * (-1.0 if i0 - i2 in [-1, 2] else 1.0)
        )
    else:
        central_angle = torch.acos(matrix[..., i0, i0])

    o = (
        _angle_from_tan(
            convention[0], convention[1], matrix[..., i2], False, tait_bryan
        ),
        central_angle,
        _angle_from_tan(
            convention[2], convention[1], matrix[..., i0, :], True, tait_bryan
        ),
    )
    return torch.stack(o, -1)
258
+
259
+
260
def random_quaternions(
    n: int, dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
):
    """
    Generate random quaternions representing rotations,
    i.e. versors with nonnegative real part.

    Args:
        n: Number of quaternions in a batch to return.
        dtype: Type to return.
        device: Desired device of returned tensor. Default:
            uses the current device for the default tensor type.
        requires_grad: Whether the resulting tensor should have the gradient
            flag set.

    Returns:
        Quaternions as tensor of shape (N, 4).
    """
    q = torch.randn((n, 4), dtype=dtype, device=device, requires_grad=requires_grad)
    norms = torch.sqrt((q * q).sum(1))
    # normalize, flipping sign so the real part is nonnegative
    return q / _copysign(norms, q[:, 0])[:, None]
282
+
283
+
284
def random_rotations(
    n: int, dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
):
    """
    Generate random rotations as 3x3 rotation matrices.

    Args:
        n: Number of rotation matrices in a batch to return.
        dtype: Type to return.
        device: Device of returned tensor. Default: if None,
            uses the current device for the default tensor type.
        requires_grad: Whether the resulting tensor should have the gradient
            flag set.

    Returns:
        Rotation matrices as tensor of shape (n, 3, 3).
    """
    # draw uniform random unit quaternions and convert them to matrices
    return quaternion_to_matrix(
        random_quaternions(n, dtype=dtype, device=device, requires_grad=requires_grad)
    )
305
+
306
+
307
def random_rotation(
    dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
):
    """
    Generate a single random 3x3 rotation matrix.

    Args:
        dtype: Type to return
        device: Device of returned tensor. Default: if None,
            uses the current device for the default tensor type
        requires_grad: Whether the resulting tensor should have the gradient
            flag set

    Returns:
        Rotation matrix as tensor of shape (3, 3).
    """
    batch = random_rotations(1, dtype, device, requires_grad)
    return batch[0]
324
+
325
+
326
def standardize_quaternion(quaternions):
    """
    Convert a unit quaternion to a standard form: one in which the real
    part is non negative (q and -q encode the same rotation).

    Args:
        quaternions: Quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    negative_real = quaternions[..., 0:1] < 0
    return torch.where(negative_real, -quaternions, quaternions)
339
+
340
+
341
def quaternion_raw_multiply(a, b):
    """
    Multiply two quaternions (Hamilton product, real part first).
    Usual torch rules for broadcasting apply.

    Args:
        a: Quaternions as tensor of shape (..., 4), real part first.
        b: Quaternions as tensor of shape (..., 4), real part first.

    Returns:
        The product of a and b, a tensor of quaternions shape (..., 4).
    """
    w1, x1, y1, z1 = torch.unbind(a, -1)
    w2, x2, y2, z2 = torch.unbind(b, -1)
    return torch.stack(
        (
            w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
            w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
            w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
            w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
        ),
        -1,
    )
360
+
361
+
362
def quaternion_multiply(a, b):
    """
    Multiply two quaternions representing rotations, returning the quaternion
    representing their composition, i.e. the versor with nonnegative real part.
    Usual torch rules for broadcasting apply.

    Args:
        a: Quaternions as tensor of shape (..., 4), real part first.
        b: Quaternions as tensor of shape (..., 4), real part first.

    Returns:
        The product of a and b, a tensor of quaternions of shape (..., 4).
    """
    product = quaternion_raw_multiply(a, b)
    # canonicalize the sign so the real part is nonnegative
    return standardize_quaternion(product)
377
+
378
+
379
def quaternion_invert(quaternion):
    """
    Given a quaternion representing rotation, get the quaternion representing
    its inverse.

    Args:
        quaternion: Quaternions as tensor of shape (..., 4), with real part
            first, which must be versors (unit quaternions).

    Returns:
        The inverse, a tensor of quaternions of shape (..., 4).
    """
    # for unit quaternions the conjugate (negate the vector part) is the inverse
    conj_signs = quaternion.new_tensor([1, -1, -1, -1])
    return quaternion * conj_signs
393
+
394
+
395
def quaternion_apply(quaternion, point):
    """
    Apply the rotation given by a quaternion to a 3D point.
    Usual torch rules for broadcasting apply.

    Args:
        quaternion: Tensor of quaternions, real part first, of shape (..., 4).
        point: Tensor of 3D points of shape (..., 3).

    Returns:
        Tensor of rotated points of shape (..., 3).

    Raises:
        ValueError: if the last dimension of point is not 3.
    """
    if point.size(-1) != 3:
        # BUGFIX: message previously printed a stray 'f' before the shape
        raise ValueError(f"Points are not in 3D, {point.shape}.")
    # embed the point as a pure quaternion (zero real part), then conjugate:
    # q * p * q^-1
    real_parts = point.new_zeros(point.shape[:-1] + (1,))
    point_as_quaternion = torch.cat((real_parts, point), -1)
    out = quaternion_raw_multiply(
        quaternion_raw_multiply(quaternion, point_as_quaternion),
        quaternion_invert(quaternion),
    )
    return out[..., 1:]
416
+
417
+
418
def axis_angle_to_matrix(axis_angle):
    """
    Convert rotations given as axis/angle to rotation matrices.

    Args:
        axis_angle: Rotations given as a vector in axis angle form,
            as a tensor of shape (..., 3), where the magnitude is
            the angle turned anticlockwise in radians around the
            vector's direction.

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    # route through the quaternion representation
    quaternions = axis_angle_to_quaternion(axis_angle)
    return quaternion_to_matrix(quaternions)
432
+
433
+
434
def matrix_to_axis_angle(matrix):
    """
    Convert rotations given as rotation matrices to axis/angle.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        Rotations given as a vector in axis angle form, as a tensor
        of shape (..., 3), where the magnitude is the angle
        turned anticlockwise in radians around the vector's
        direction.
    """
    # route through the quaternion representation
    quaternions = matrix_to_quaternion(matrix)
    return quaternion_to_axis_angle(quaternions)
448
+
449
+
450
def axis_angle_to_quaternion(axis_angle):
    """
    Convert rotations given as axis/angle to quaternions.

    Args:
        axis_angle: Rotations given as a vector in axis angle form,
            as a tensor of shape (..., 3), where the magnitude is
            the angle turned anticlockwise in radians around the
            vector's direction.

    Returns:
        quaternions with real part first, as tensor of shape (..., 4).
    """
    angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
    half_angles = angles * 0.5
    eps = 1e-6
    small = angles.abs() < eps
    not_small = ~small
    ratio = torch.empty_like(angles)
    ratio[not_small] = torch.sin(half_angles[not_small]) / angles[not_small]
    # Taylor expansion near zero: sin(x/2)/x ~ 1/2 - x^2/48
    ratio[small] = 0.5 - (angles[small] * angles[small]) / 48
    return torch.cat([torch.cos(half_angles), axis_angle * ratio], dim=-1)
480
+
481
+
482
def quaternion_to_axis_angle(quaternions):
    """
    Convert rotations given as quaternions to axis/angle.

    Args:
        quaternions: quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Rotations given as a vector in axis angle form, as a tensor
        of shape (..., 3), where the magnitude is the angle
        turned anticlockwise in radians around the vector's
        direction.
    """
    norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
    half_angles = torch.atan2(norms, quaternions[..., :1])
    angles = half_angles * 2
    eps = 1e-6
    small = angles.abs() < eps
    not_small = ~small
    ratio = torch.empty_like(angles)
    ratio[not_small] = torch.sin(half_angles[not_small]) / angles[not_small]
    # Taylor expansion near zero: sin(x/2)/x ~ 1/2 - x^2/48
    ratio[small] = 0.5 - (angles[small] * angles[small]) / 48
    return quaternions[..., 1:] / ratio
511
+
512
+
513
def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
    """
    Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
    using Gram--Schmidt orthogonalisation per Section B of [1].

    Args:
        d6: 6D rotation representation, of size (*, 6)

    Returns:
        batch of rotation matrices of size (*, 3, 3)

    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
    On the Continuity of Rotation Representations in Neural Networks.
    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
    Retrieved from http://arxiv.org/abs/1812.07035
    """
    a1 = d6[..., :3]
    a2 = d6[..., 3:]
    b1 = F.normalize(a1, dim=-1)
    # Gram-Schmidt: remove the b1 component from a2, then normalize
    proj = (b1 * a2).sum(-1, keepdim=True)
    b2 = F.normalize(a2 - proj * b1, dim=-1)
    # third row completes the right-handed orthonormal basis
    b3 = torch.cross(b1, b2, dim=-1)
    return torch.stack((b1, b2, b3), dim=-2)
535
+
536
+
537
def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
    """
    Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
    by dropping the last row. Note that 6D representation is not unique.

    Args:
        matrix: batch of rotation matrices of size (*, 3, 3)

    Returns:
        6D rotation representation, of size (*, 6)

    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
    On the Continuity of Rotation Representations in Neural Networks.
    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
    Retrieved from http://arxiv.org/abs/1812.07035
    """
    batch_dims = matrix.size()[:-2]
    # the first two rows (6 numbers) fully determine the rotation
    return matrix[..., :2, :].clone().reshape(batch_dims + (6,))
utils/sampler_util.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ from copy import deepcopy
5
+ from utils.misc import wrapped_getattr
6
+ import joblib
7
+
8
+ # A wrapper model for Classifier-free guidance **SAMPLING** only
9
+ # https://arxiv.org/abs/2207.12598
10
class ClassifierFreeSampleModel(nn.Module):
    """Sampling-time wrapper implementing classifier-free guidance
    (https://arxiv.org/abs/2207.12598).

    Runs the wrapped model once conditioned and once unconditioned, and
    returns uncond + scale * (cond - uncond).
    """

    def __init__(self, model):
        super().__init__()
        self.model = model  # model is the actual model to run

        # guidance requires the model to have been trained with condition dropout
        assert self.model.cond_mask_prob > 0, 'Cannot run a guided diffusion on a model that has not been trained with no conditions'

        # pointers to inner model
        self.rot2xyz = self.model.rot2xyz
        self.translation = self.model.translation
        self.njoints = self.model.njoints
        self.nfeats = self.model.nfeats
        self.data_rep = self.model.data_rep
        self.cond_mode = self.model.cond_mode
        self.encode_text = self.model.encode_text

    def forward(self, x, timesteps, y=None):
        """Blend conditional and unconditional predictions.

        y['scale'] holds the per-sample guidance strength; it is reshaped
        to broadcast over the model's 4-D output.
        """
        cond_mode = self.model.cond_mode
        assert cond_mode in ['text', 'action']
        y_uncond = deepcopy(y)
        y_uncond['uncond'] = True  # signals the inner model to drop the condition
        out = self.model(x, timesteps, y)
        out_uncond = self.model(x, timesteps, y_uncond)
        return out_uncond + (y['scale'].view(-1, 1, 1, 1) * (out - out_uncond))

    def __getattr__(self, name, default=None):
        # this method is reached only if name is not in self.__dict__.
        # BUGFIX: forward the caller-supplied default instead of hard-coding None
        return wrapped_getattr(self, name, default=default)
39
+
40
+
41
class AutoRegressiveSampler():
    """Generates a long motion sequence by repeatedly sampling short
    segments and feeding the tail of each segment back as the prefix of
    the next (DiP-style autoregressive sampling)."""

    def __init__(self, args, sample_fn, required_frames=196):
        # sample_fn: the underlying diffusion sampling function (e.g. p_sample_loop)
        self.sample_fn = sample_fn
        self.args = args  # must provide pred_len, context_len, autoregressive_include_prefix
        self.required_frames = required_frames  # total frames to return

    def sample(self, model, shape, **kargs):
        """Run autoregressive sampling until required_frames are produced.

        shape is the full-batch sample shape whose last dim is the frame
        axis; kargs must contain model_kwargs['y'] with 'prefix' and 'text'.
        """
        bs = shape[0]
        # ceil(required_frames / pred_len) segments are needed
        n_iterations = (self.required_frames // self.args.pred_len) + int(self.required_frames % self.args.pred_len > 0)
        samples_buf = []
        cur_prefix = deepcopy(kargs['model_kwargs']['y']['prefix'])  # init with data
        dynamic_text_mode = type(kargs['model_kwargs']['y']['text'][0]) == list  # Text changes on the fly - prompt per prediction is provided as a list (instead of a single prompt)
        if self.args.autoregressive_include_prefix:
            samples_buf.append(cur_prefix)
        # each iteration predicts pred_len frames (only the last dim changes)
        autoregressive_shape = list(deepcopy(shape))
        autoregressive_shape[-1] = self.args.pred_len

        # Autoregressive sampling
        for i in range(n_iterations):

            # Build the current kargs (deep copy so iterations stay independent)
            cur_kargs = deepcopy(kargs)
            cur_kargs['model_kwargs']['y']['prefix'] = cur_prefix
            if dynamic_text_mode:
                # pick the i-th prompt for every batch element
                cur_kargs['model_kwargs']['y']['text'] = [s[i] for s in kargs['model_kwargs']['y']['text']]
                if model.text_encoder_type == 'bert':
                    # slice the precomputed per-iteration BERT embeddings
                    # NOTE(review): assumes text_embed is (embeds[:, :, i], mask[:, i]) — confirm layout against caller
                    cur_kargs['model_kwargs']['y']['text_embed'] = (cur_kargs['model_kwargs']['y']['text_embed'][0][:, :, i], cur_kargs['model_kwargs']['y']['text_embed'][1][:, i])
                else:
                    raise NotImplementedError('DiP model only supports BERT text encoder at the moment. If you implement this, please send a PR!')

            # Sample the next prediction
            sample = self.sample_fn(model, autoregressive_shape, **cur_kargs)

            # Buffer the sample
            samples_buf.append(sample.clone()[..., -self.args.pred_len:])

            # Update the prefix (last context_len frames condition the next segment)
            cur_prefix = sample.clone()[..., -self.args.context_len:]

        # concatenate along the frame axis and trim any overshoot
        full_batch = torch.cat(samples_buf, dim=-1)[..., :self.required_frames]  # 200 -> 196
        return full_batch