zirobtc
/

motion-stream

TensorBoard

Model card Files Files and versions

xet

Metrics Training metrics Community

zirobtc committed on Oct 29, 2025

Commit

69b50e2

verified ·

1 Parent(s): a50b63b

Upload demo_motionstreamer.py with huggingface_hub

Browse files

Files changed (1) hide show

demo_motionstreamer.py +58 -88

demo_motionstreamer.py CHANGED Viewed

@@ -12,76 +12,34 @@ from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle,
 from sentence_transformers import SentenceTransformer
 warnings.filterwarnings('ignore')
 def save_motion_as_bvh(motion_data, output_path, fps=30):
-    """
-    Saves a denormalized 272-dim motion numpy array to a BVH file.
-    """
     print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
     try:
-        if isinstance(motion_data, torch.Tensor):
-            motion_data = motion_data.detach().cpu().numpy()
-        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
-            motion_data = motion_data.squeeze(0)
-        elif motion_data.ndim != 2:
-            raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")
-        njoint = 22
-        nfrm, _ = motion_data.shape
         rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)).numpy()
         global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
         global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
-        global_heading_rot = np.zeros_like(global_heading_diff_rot)
-        global_heading_rot[0] = global_heading_diff_rot[0]
-        for i in range(1, nfrm):
-            global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i-1])
-        velocities_root_xy = motion_data[:, :2]
-        height = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)[:, 0, 1]
-        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
-        rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
-        velocities_root_xyz = np.zeros((nfrm, 3))
-        velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
-        velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
         velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
-        root_translation = np.cumsum(velocities_root_xyz, axis=0)
-        root_translation[:, 1] = height
-        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy().reshape(nfrm, -1)
-        poses_24_joints = np.zeros((nfrm, 72))
-        poses_24_joints[:, :66] = axis_angle
-        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
-        parents = model.parents.detach().cpu().numpy()
-        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24,:]
-        offsets = rest_pose - rest_pose[parents]
-        offsets[0] = np.array([0,0,0])
-        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))).numpy()
-        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
-        positions = np.zeros_like(rotations_quat[..., :3])
-        positions[:, 0] = root_translation
-        joint_names = [
-            "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
-            "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
-            "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
-            "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"
-        ]
-        bvh.save(output_path, {
-            "rotations": rotations_euler, "positions": positions, "offsets": offsets,
-            "parents": parents, "names": joint_names, "order": "zyx", "frametime": 1.0 / fps,
-        })
         print(f"✅ BVH file saved successfully to {output_path}")
     except Exception as e:
-        print(f"❌ BVH Conversion Failed. Error: {e}")
-        import traceback
-        traceback.print_exc()
 if __name__ == '__main__':
@@ -102,56 +60,68 @@ if __name__ == '__main__':
         latent_dim=16, clip_range=[-30, 20]
     )
     tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
-    # The TAE checkpoint has weights stored under the 'net' key
-    tae_net.load_state_dict(tae_ckpt['net'], strict=True)
     tae_net.eval()
     tae_net.to(comp_device)
     config = LLaMAHFConfig.from_name('Normal_size')
     config.block_size = 78
     trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
-    print("Loading your trained MotionStreamer checkpoint from 'Experiments/motionstreamer_model/latest.pth'...")
-    trans_ckpt = torch.load('Experiments/motionstreamer_model/latest.pth', map_location='cpu')
-    trans_encoder.load_state_dict(trans_ckpt['trans'], strict=True)
     trans_encoder.eval()
     trans_encoder.to(comp_device)
     print("Loading mean/std from BABEL dataset...")
     mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
     std = np.load('babel_272/t2m_babel_mean_std/Std.npy')
-    # --- Inference ---
-    # The history must be a 2D tensor for the model's unsqueeze operation to work
-    motion_history = torch.empty(0, 16).to(comp_device)
-    # --- KEY PARAMETER TO TUNE ---
-    # The author often uses a higher CFG scale for more dynamic motions.
-    # Let's try 7.0, which is a common value in their code.
-    cfg_scale = 10.0
     print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
     with torch.no_grad():
-        # REVERTED to the correct streaming-native inference function
-        _, motion_latents = trans_encoder.sample_for_eval_CFG_babel_inference_new_demo(
-            B_text=args.text,
-            A_motion=motion_history,
-            tokenizer='t5-xxl',
             clip_model=t5_model,
             device=comp_device,
-            cfg=cfg_scale, # Use the tuned CFG scale
-            length=240 # Generate a longer sequence for "run"
         )
         print("Decoding latents to full motion...")
         motion_seqs = tae_net.forward_decoder(motion_latents)
     motion = motion_seqs.detach().cpu().numpy()
     motion_denormalized = motion * std + mean
-    # --- Save Output ---
     output_dir = 'demo_output_streamer'
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
     output_bvh_path = os.path.join(output_dir, f'{args.text.replace(" ", "_")}_cfg{cfg_scale}.bvh')
     save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)

 from sentence_transformers import SentenceTransformer
 warnings.filterwarnings('ignore')
+# --- save_motion_as_bvh function is unchanged ---
 def save_motion_as_bvh(motion_data, output_path, fps=30):
     print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
     try:
+        if isinstance(motion_data, torch.Tensor): motion_data = motion_data.detach().cpu().numpy()
+        if motion_data.ndim == 3 and motion_data.shape[0] == 1: motion_data = motion_data.squeeze(0)
+        elif motion_data.ndim != 2: raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")
+        njoint = 22; nfrm, _ = motion_data.shape
         rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)).numpy()
         global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
         global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
+        global_heading_rot = np.zeros_like(global_heading_diff_rot); global_heading_rot[0] = global_heading_diff_rot[0]
+        for i in range(1, nfrm): global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i-1])
+        velocities_root_xy = motion_data[:, :2]; height = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)[:, 0, 1]
+        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1)); rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
+        velocities_root_xyz = np.zeros((nfrm, 3)); velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]; velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
         velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
+        root_translation = np.cumsum(velocities_root_xyz, axis=0); root_translation[:, 1] = height
+        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy().reshape(nfrm, -1); poses_24_joints = np.zeros((nfrm, 72)); poses_24_joints[:, :66] = axis_angle
+        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL"); parents = model.parents.detach().cpu().numpy()
+        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24,:]; offsets = rest_pose - rest_pose[parents]; offsets[0] = np.array([0,0,0])
+        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))).numpy(); rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
+        positions = np.zeros_like(rotations_quat[..., :3]); positions[:, 0] = root_translation
+        joint_names = ["Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2", "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck", "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder", "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"]
+        bvh.save(output_path, {"rotations": rotations_euler, "positions": positions, "offsets": offsets, "parents": parents, "names": joint_names, "order": "zyx", "frametime": 1.0 / fps})
         print(f"✅ BVH file saved successfully to {output_path}")
     except Exception as e:
+        print(f"❌ BVH Conversion Failed. Error: {e}"); import traceback; traceback.print_exc()
 if __name__ == '__main__':
         latent_dim=16, clip_range=[-30, 20]
     )
     tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
+    tae_net.load_state_dict(tae_ckpt['net'], strict=True)
     tae_net.eval()
     tae_net.to(comp_device)
     config = LLaMAHFConfig.from_name('Normal_size')
     config.block_size = 78
     trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
+    # --- THIS IS THE FIX ---
+    print("Loading your trained MotionStreamer checkpoint from 'motionstreamer_model/latest.pth'...")
+    # Make sure this path is correct relative to where you run the script
+    checkpoint_path = 'motionstreamer_model/latest.pth'
+    trans_ckpt = torch.load(checkpoint_path, map_location='cpu')
+    # Create a new state dict without the 'module.' prefix
+    unwrapped_state_dict = {}
+    for key, value in trans_ckpt['trans'].items():
+        if key.startswith('module.'):
+            # Strip the 'module.' prefix
+            unwrapped_state_dict[key[len('module.'):]] = value
+        else:
+            # Keep keys that don't have the prefix (just in case)
+            unwrapped_state_dict[key] = value
+    # Load the unwrapped state dict
+    trans_encoder.load_state_dict(unwrapped_state_dict, strict=True)
+    print("Successfully loaded unwrapped checkpoint.")
+    # --- END FIX ---
     trans_encoder.eval()
     trans_encoder.to(comp_device)
+    # --- Rest of the script is unchanged ---
     print("Loading mean/std from BABEL dataset...")
     mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
     std = np.load('babel_272/t2m_babel_mean_std/Std.npy')
+    motion_history = torch.empty(0, 16).to(comp_device)
+    cfg_scale = 10.0
     print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
     with torch.no_grad():
+        # Use the new two-forward sampling method to match training
+        _, motion_latents = trans_encoder.sample_for_eval_CFG_babel_inference_two_forward(
+            B_text=args.text,
+            A_motion=motion_history,
+            tokenizer='t5-xxl',
             clip_model=t5_model,
             device=comp_device,
+            cfg=cfg_scale,
+            length=240,
+            temperature=1.3
         )
         print("Decoding latents to full motion...")
         motion_seqs = tae_net.forward_decoder(motion_latents)
     motion = motion_seqs.detach().cpu().numpy()
     motion_denormalized = motion * std + mean
     output_dir = 'demo_output_streamer'
+    if not os.path.exists(output_dir): os.makedirs(output_dir)
     output_bvh_path = os.path.join(output_dir, f'{args.text.replace(" ", "_")}_cfg{cfg_scale}.bvh')
     save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)