Upload demo_motionstreamer.py with huggingface_hub
Browse files- demo_motionstreamer.py +58 -88
demo_motionstreamer.py
CHANGED
|
@@ -12,76 +12,34 @@ from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle,
|
|
| 12 |
from sentence_transformers import SentenceTransformer
|
| 13 |
warnings.filterwarnings('ignore')
|
| 14 |
|
|
|
|
| 15 |
def save_motion_as_bvh(motion_data, output_path, fps=30):
|
| 16 |
-
"""
|
| 17 |
-
Saves a denormalized 272-dim motion numpy array to a BVH file.
|
| 18 |
-
"""
|
| 19 |
print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
|
| 20 |
try:
|
| 21 |
-
if isinstance(motion_data, torch.Tensor):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
elif motion_data.ndim != 2:
|
| 26 |
-
raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")
|
| 27 |
-
|
| 28 |
-
njoint = 22
|
| 29 |
-
nfrm, _ = motion_data.shape
|
| 30 |
-
|
| 31 |
rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)).numpy()
|
| 32 |
-
|
| 33 |
global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
|
| 34 |
global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
|
| 35 |
-
global_heading_rot = np.zeros_like(global_heading_diff_rot)
|
| 36 |
-
global_heading_rot[
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
velocities_root_xy = motion_data[:, :2]
|
| 41 |
-
height = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)[:, 0, 1]
|
| 42 |
-
|
| 43 |
-
inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
|
| 44 |
-
rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
|
| 45 |
-
|
| 46 |
-
velocities_root_xyz = np.zeros((nfrm, 3))
|
| 47 |
-
velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
|
| 48 |
-
velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
|
| 49 |
velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
|
| 50 |
-
root_translation = np.cumsum(velocities_root_xyz, axis=0)
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
poses_24_joints = np.
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
parents = model.parents.detach().cpu().numpy()
|
| 59 |
-
rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24,:]
|
| 60 |
-
offsets = rest_pose - rest_pose[parents]
|
| 61 |
-
offsets[0] = np.array([0,0,0])
|
| 62 |
-
|
| 63 |
-
rotations_quat = axis_angle_to_quaternion(torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))).numpy()
|
| 64 |
-
rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
|
| 65 |
-
positions = np.zeros_like(rotations_quat[..., :3])
|
| 66 |
-
positions[:, 0] = root_translation
|
| 67 |
-
|
| 68 |
-
joint_names = [
|
| 69 |
-
"Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
|
| 70 |
-
"Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
|
| 71 |
-
"Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
|
| 72 |
-
"Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"
|
| 73 |
-
]
|
| 74 |
-
|
| 75 |
-
bvh.save(output_path, {
|
| 76 |
-
"rotations": rotations_euler, "positions": positions, "offsets": offsets,
|
| 77 |
-
"parents": parents, "names": joint_names, "order": "zyx", "frametime": 1.0 / fps,
|
| 78 |
-
})
|
| 79 |
print(f"✅ BVH file saved successfully to {output_path}")
|
| 80 |
-
|
| 81 |
except Exception as e:
|
| 82 |
-
print(f"❌ BVH Conversion Failed. Error: {e}")
|
| 83 |
-
import traceback
|
| 84 |
-
traceback.print_exc()
|
| 85 |
|
| 86 |
|
| 87 |
if __name__ == '__main__':
|
|
@@ -102,56 +60,68 @@ if __name__ == '__main__':
|
|
| 102 |
latent_dim=16, clip_range=[-30, 20]
|
| 103 |
)
|
| 104 |
tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
|
| 105 |
-
|
| 106 |
-
tae_net.load_state_dict(tae_ckpt['net'], strict=True)
|
| 107 |
tae_net.eval()
|
| 108 |
tae_net.to(comp_device)
|
| 109 |
|
| 110 |
config = LLaMAHFConfig.from_name('Normal_size')
|
| 111 |
config.block_size = 78
|
| 112 |
trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
trans_encoder.eval()
|
| 117 |
trans_encoder.to(comp_device)
|
| 118 |
|
|
|
|
| 119 |
print("Loading mean/std from BABEL dataset...")
|
| 120 |
mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
|
| 121 |
std = np.load('babel_272/t2m_babel_mean_std/Std.npy')
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
# --- KEY PARAMETER TO TUNE ---
|
| 128 |
-
# The author often uses a higher CFG scale for more dynamic motions.
|
| 129 |
-
# Let's try 7.0, which is a common value in their code.
|
| 130 |
-
cfg_scale = 10.0
|
| 131 |
-
|
| 132 |
print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
|
| 133 |
with torch.no_grad():
|
| 134 |
-
#
|
| 135 |
-
_, motion_latents = trans_encoder.
|
| 136 |
-
B_text=args.text,
|
| 137 |
-
A_motion=motion_history,
|
| 138 |
-
tokenizer='t5-xxl',
|
| 139 |
clip_model=t5_model,
|
| 140 |
device=comp_device,
|
| 141 |
-
cfg=cfg_scale,
|
| 142 |
-
length=240
|
|
|
|
| 143 |
)
|
| 144 |
-
|
| 145 |
print("Decoding latents to full motion...")
|
| 146 |
motion_seqs = tae_net.forward_decoder(motion_latents)
|
| 147 |
|
| 148 |
motion = motion_seqs.detach().cpu().numpy()
|
| 149 |
motion_denormalized = motion * std + mean
|
| 150 |
-
|
| 151 |
-
# --- Save Output ---
|
| 152 |
output_dir = 'demo_output_streamer'
|
| 153 |
-
if not os.path.exists(output_dir):
|
| 154 |
-
|
| 155 |
-
|
| 156 |
output_bvh_path = os.path.join(output_dir, f'{args.text.replace(" ", "_")}_cfg{cfg_scale}.bvh')
|
| 157 |
save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)
|
|
|
|
| 12 |
from sentence_transformers import SentenceTransformer
|
| 13 |
warnings.filterwarnings('ignore')
|
| 14 |
|
| 15 |
+
# --- save_motion_as_bvh function is unchanged ---
|
| 16 |
def save_motion_as_bvh(motion_data, output_path, fps=30):
    """Convert a denormalized motion feature array to a BVH file.

    This is a best-effort helper: any failure is reported on stdout (plus a
    traceback) instead of being raised, and the function always returns None.

    Args:
        motion_data: ``torch.Tensor`` or NumPy array shaped ``(frames, features)``
            or ``(1, frames, features)``. With ``njoint = 22`` the columns read
            here are: ``0:2`` root velocity on the ground plane, ``2:8`` the
            per-frame heading-delta rotation (6D), ``8:8+3*njoint`` per-joint
            3-vectors (only joint 0's second component is used, as root
            height) and ``8+6*njoint:8+12*njoint`` per-joint 6D rotations.
        output_path: Destination path of the ``.bvh`` file.
        fps: Playback rate; the frame time written to the file is ``1.0 / fps``.
    """
    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
    try:
        if isinstance(motion_data, torch.Tensor):
            motion_data = motion_data.detach().cpu().numpy()
        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
            # Drop a singleton batch dimension.
            motion_data = motion_data.squeeze(0)
        elif motion_data.ndim != 2:
            raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")

        njoint = 22
        nfrm, nfeat = motion_data.shape

        # Fail early with a clear message instead of an opaque reshape or
        # broadcast error further down.
        if nfrm == 0:
            raise ValueError("Input motion data contains no frames")
        min_features = 8 + 12 * njoint
        if nfeat < min_features:
            raise ValueError(
                f"Input motion data must have at least {min_features} features per frame, "
                f"but got shape {motion_data.shape}"
            )

        # Per-joint local rotations: 6D representation -> 3x3 matrices.
        rotations_6d = torch.from_numpy(motion_data[:, 8 + 6 * njoint : 8 + 12 * njoint]).reshape(nfrm, -1, 6)
        rotations_matrix = rotation_6d_to_matrix(rotations_6d).numpy()

        # Accumulate the per-frame heading deltas into an absolute heading.
        global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
        global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
        global_heading_rot = np.zeros_like(global_heading_diff_rot)
        global_heading_rot[0] = global_heading_diff_rot[0]
        for i in range(1, nfrm):
            global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i - 1])

        velocities_root_xy = motion_data[:, :2]
        height = motion_data[:, 8 : 8 + 3 * njoint].reshape(nfrm, -1, 3)[:, 0, 1]

        # Apply the inverse (transpose) heading to the root joint rotation.
        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
        rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])

        # Planar velocity goes into components 0 and 2; component 1 is later
        # overwritten with the stored height.
        velocities_root_xyz = np.zeros((nfrm, 3))
        velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
        velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
        # Frame i's velocity is rotated by the inverse heading of frame i-1.
        velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
        root_translation = np.cumsum(velocities_root_xyz, axis=0)
        root_translation[:, 1] = height

        # 22 predicted joints fill the first 66 axis-angle values; the last
        # two of the 24 skeleton joints keep zero rotation.
        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy().reshape(nfrm, -1)
        poses_24_joints = np.zeros((nfrm, 72))
        poses_24_joints[:, :66] = axis_angle

        # Skeleton hierarchy and rest-pose offsets come from the SMPL model.
        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
        parents = model.parents.detach().cpu().numpy()
        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
        offsets = rest_pose - rest_pose[parents]
        offsets[0] = np.array([0, 0, 0])

        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))).numpy()
        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))

        # Only the root joint carries a per-frame position.
        positions = np.zeros_like(rotations_quat[..., :3])
        positions[:, 0] = root_translation

        joint_names = [
            "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
            "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
            "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
            "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand",
        ]

        bvh.save(output_path, {
            "rotations": rotations_euler,
            "positions": positions,
            "offsets": offsets,
            "parents": parents,
            "names": joint_names,
            "order": "zyx",
            "frametime": 1.0 / fps,
        })
        print(f"✅ BVH file saved successfully to {output_path}")
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the demo alive.
        print(f"❌ BVH Conversion Failed. Error: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
if __name__ == '__main__':
|
|
|
|
| 60 |
latent_dim=16, clip_range=[-30, 20]
|
| 61 |
)
|
| 62 |
tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
|
| 63 |
+
tae_net.load_state_dict(tae_ckpt['net'], strict=True)
|
|
|
|
| 64 |
tae_net.eval()
|
| 65 |
tae_net.to(comp_device)
|
| 66 |
|
| 67 |
config = LLaMAHFConfig.from_name('Normal_size')
|
| 68 |
config.block_size = 78
|
| 69 |
trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
|
| 70 |
+
|
| 71 |
+
# --- THIS IS THE FIX ---
|
| 72 |
+
print("Loading your trained MotionStreamer checkpoint from 'motionstreamer_model/latest.pth'...")
|
| 73 |
+
# Make sure this path is correct relative to where you run the script
|
| 74 |
+
checkpoint_path = 'motionstreamer_model/latest.pth'
|
| 75 |
+
trans_ckpt = torch.load(checkpoint_path, map_location='cpu')
|
| 76 |
+
|
| 77 |
+
# Create a new state dict without the 'module.' prefix
|
| 78 |
+
unwrapped_state_dict = {}
|
| 79 |
+
for key, value in trans_ckpt['trans'].items():
|
| 80 |
+
if key.startswith('module.'):
|
| 81 |
+
# Strip the 'module.' prefix
|
| 82 |
+
unwrapped_state_dict[key[len('module.'):]] = value
|
| 83 |
+
else:
|
| 84 |
+
# Keep keys that don't have the prefix (just in case)
|
| 85 |
+
unwrapped_state_dict[key] = value
|
| 86 |
+
|
| 87 |
+
# Load the unwrapped state dict
|
| 88 |
+
trans_encoder.load_state_dict(unwrapped_state_dict, strict=True)
|
| 89 |
+
print("Successfully loaded unwrapped checkpoint.")
|
| 90 |
+
# --- END FIX ---
|
| 91 |
+
|
| 92 |
trans_encoder.eval()
|
| 93 |
trans_encoder.to(comp_device)
|
| 94 |
|
| 95 |
+
# --- Rest of the script is unchanged ---
|
| 96 |
print("Loading mean/std from BABEL dataset...")
|
| 97 |
mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
|
| 98 |
std = np.load('babel_272/t2m_babel_mean_std/Std.npy')
|
| 99 |
|
| 100 |
+
motion_history = torch.empty(0, 16).to(comp_device)
|
| 101 |
+
cfg_scale = 10.0
|
| 102 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
|
| 104 |
with torch.no_grad():
|
| 105 |
+
# Use the new two-forward sampling method to match training
|
| 106 |
+
_, motion_latents = trans_encoder.sample_for_eval_CFG_babel_inference_two_forward(
|
| 107 |
+
B_text=args.text,
|
| 108 |
+
A_motion=motion_history,
|
| 109 |
+
tokenizer='t5-xxl',
|
| 110 |
clip_model=t5_model,
|
| 111 |
device=comp_device,
|
| 112 |
+
cfg=cfg_scale,
|
| 113 |
+
length=240,
|
| 114 |
+
temperature=1.3
|
| 115 |
)
|
| 116 |
+
|
| 117 |
print("Decoding latents to full motion...")
|
| 118 |
motion_seqs = tae_net.forward_decoder(motion_latents)
|
| 119 |
|
| 120 |
motion = motion_seqs.detach().cpu().numpy()
|
| 121 |
motion_denormalized = motion * std + mean
|
| 122 |
+
|
|
|
|
| 123 |
output_dir = 'demo_output_streamer'
|
| 124 |
+
if not os.path.exists(output_dir): os.makedirs(output_dir)
|
| 125 |
+
|
|
|
|
| 126 |
output_bvh_path = os.path.join(output_dir, f'{args.text.replace(" ", "_")}_cfg{cfg_scale}.bvh')
|
| 127 |
save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)
|