motion-stream / demo_mixed.py

Initial upload of MotionStreamer code, excluding large extracted data and output folders.

60b86d7 verified 3 months ago

6.56 kB

	import os
	import torch
	import numpy as np
	from models.llama_model import LLaMAHF, LLaMAHFConfig
	import models.tae as tae
	import options.option_transformer as option_trans
	from sentence_transformers import SentenceTransformer
	import warnings
	from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion
	from utils import bvh, quat
	import smplx

	warnings.filterwarnings('ignore')

	# This function converts the 272-dim representation to a BVH file for visualization.
	def save_motion_as_bvh(motion_data, output_path, fps=30):
	    """Convert a motion sequence in the 272-dim representation to a BVH file.

	    The conversion is best-effort: any exception raised along the way is
	    reported on stdout together with a traceback and is NOT re-raised, so a
	    failed export never aborts the calling script.

	    Args:
	        motion_data: ``torch.Tensor`` or ``numpy.ndarray`` shaped
	            ``(nfrm, D)`` or ``(1, nfrm, D)``. With ``njoint = 22`` the
	            columns read here are ``0:2`` (per-frame root displacement,
	            placed on the x and z axes), ``2:8`` (per-frame heading change
	            as a 6D rotation), ``8:8+3*njoint`` (joint positions; only the
	            y value of joint 0 is used, as the root height) and
	            ``8+6*njoint:8+12*njoint`` (per-joint 6D rotations).
	        output_path: Destination path handed to ``bvh.save``.
	        fps: Playback rate; written to the file as ``frametime = 1 / fps``.

	    Returns:
	        None.
	    """
	    print(f"--- Converting to BVH: {os.path.basename(output_path)} ---")
	    try:
	        if isinstance(motion_data, torch.Tensor):
	            motion_data = motion_data.detach().cpu().numpy()
	        if motion_data.ndim == 3:
	            # Drop the leading batch axis (must be of size 1).
	            motion_data = motion_data.squeeze(0)

	        njoint = 22
	        nfrm, _ = motion_data.shape

	        # Column offsets of the feature groups used below.
	        pos_start = 8
	        pos_end = 8 + 3 * njoint
	        rot_start = 8 + 6 * njoint
	        rot_end = 8 + 12 * njoint

	        # Per-joint rotations: 6D -> 3x3 matrices, shape (nfrm, njoint, 3, 3).
	        rotations_matrix = rotation_6d_to_matrix(
	            torch.from_numpy(motion_data[:, rot_start:rot_end]).reshape(nfrm, -1, 6)
	        ).numpy()
	        global_heading_diff_rot = rotation_6d_to_matrix(
	            torch.from_numpy(motion_data[:, 2:8])
	        ).numpy()

	        # Accumulate the per-frame heading deltas into an absolute heading
	        # per frame: R_i = dR_i @ R_{i-1}, with R_0 = dR_0.
	        global_heading_rot = np.zeros_like(global_heading_diff_rot)
	        global_heading_rot[0] = global_heading_diff_rot[0]
	        for i in range(1, nfrm):
	            global_heading_rot[i] = np.matmul(
	                global_heading_diff_rot[i], global_heading_rot[i - 1]
	            )

	        velocities_root_xy = motion_data[:, :2]
	        # Root height = y component of joint 0 in the positions group.
	        height = motion_data[:, pos_start:pos_end].reshape(nfrm, -1, 3)[:, 0, 1]

	        # Transposing the last two axes inverts each rotation matrix.
	        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
	        # Apply the inverse heading to the root joint's rotation.
	        rotations_matrix[:, 0, ...] = np.matmul(
	            inv_global_heading_rot, rotations_matrix[:, 0, ...]
	        )

	        # Place the two planar components on x and z, rotate frame i by the
	        # inverse heading of frame i-1, then integrate into a trajectory.
	        velocities_root_xyz = np.zeros((nfrm, 3))
	        velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
	        velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
	        velocities_root_xyz[1:, :] = np.matmul(
	            inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]
	        ).squeeze(-1)
	        root_translation = np.cumsum(velocities_root_xyz, axis=0)
	        root_translation[:, 1] = height

	        # 22 joints * 3 axis-angle values = 66; the remaining 6 of the 72
	        # values (the last two of the 24 named joints) stay zero.
	        axis_angle = matrix_to_axis_angle(
	            torch.from_numpy(rotations_matrix)
	        ).numpy().reshape(nfrm, -1)
	        poses_24_joints = np.zeros((nfrm, 72))
	        poses_24_joints[:, :66] = axis_angle

	        # Skeleton: parent indices and rest-pose joint offsets from the SMPL model.
	        model = smplx.create(
	            model_path="body_models/human_model_files",
	            model_type="smpl",
	            gender="NEUTRAL",
	        )
	        parents = model.parents.detach().cpu().numpy()
	        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
	        offsets = rest_pose - rest_pose[parents]
	        # The root has no parent, so its offset is explicitly zeroed.
	        offsets[0] = np.array([0, 0, 0])

	        rotations_quat = axis_angle_to_quaternion(
	            torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))
	        ).numpy()
	        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
	        # Only the root joint carries a per-frame position.
	        positions = np.zeros_like(rotations_quat[..., :3])
	        positions[:, 0] = root_translation

	        joint_names = [
	            "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee",
	            "Spine2", "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot",
	            "Neck", "Left_collar", "Right_collar", "Head", "Left_shoulder",
	            "Right_shoulder", "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist",
	            "Left_hand", "Right_hand",
	        ]

	        bvh.save(output_path, {
	            "rotations": rotations_euler,
	            "positions": positions,
	            "offsets": offsets,
	            "parents": parents,
	            "names": joint_names,
	            "order": "zyx",
	            "frametime": 1.0 / fps,
	        })
	        print(f"✅ BVH file saved successfully to {output_path}")

	    except Exception as e:
	        # Deliberate best-effort boundary: report the failure and carry on.
	        print(f"❌ BVH Conversion Failed. Error: {e}")
	        import traceback
	        traceback.print_exc()


	if __name__ == '__main__':
	    # Demo entry point: load the text encoder, the causal TAE and the
	    # MotionStreamer transformer, generate motion latents for one text
	    # prompt, decode them, and export the result as a BVH file.
	    comp_device = torch.device('cuda')
	    args = option_trans.get_args_parser()
	    torch.manual_seed(args.seed)

	    # --- Load Models ---
	    print("Loading models for MotionStreamer...")
	    t5_model = SentenceTransformer('sentencet5-xxl/')
	    t5_model.eval().to(comp_device)

	    print("Loading Causal TAE (t2m_babel) checkpoint...")
	    net = tae.Causal_HumanTAE(latent_dim=16)
	    ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
	    net.load_state_dict(ckpt['net'], strict=True)
	    net.eval().to(comp_device)

	    print("Loading YOUR trained MotionStreamer checkpoint...")
	    config = LLaMAHFConfig.from_name('Normal_size')
	    trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)

	    # --- FIX 1: Manually set the missing attribute ---
	    trans_encoder.use_out_proj = True

	    ckpt = torch.load('Experiments/motionstreamer_model/latest.pth', map_location='cpu')
	    # Handle DataParallel wrapper if present: a model saved while wrapped in
	    # DataParallel has every key prefixed with 'module.', which a strict
	    # load into the bare model would reject. Strip the prefix only when ALL
	    # keys carry it, so an unwrapped checkpoint is passed through untouched.
	    trans_state = ckpt['trans']
	    dp_prefix = 'module.'
	    if trans_state and all(key.startswith(dp_prefix) for key in trans_state):
	        trans_state = {key[len(dp_prefix):]: value for key, value in trans_state.items()}
	    trans_encoder.load_state_dict(trans_state, strict=True)
	    trans_encoder.eval().to(comp_device)

	    print("Loading mean/std from BABEL dataset...")
	    mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
	    std = np.load('babel_272/t2m_babel_mean_std/Std.npy')

	    # --- Inference ---
	    motion_history = torch.empty(0, 16).to(comp_device)  # Start with no history
	    cfg_scale = 7.0
	    text_prompt = "a person is running forward"
	    desired_frames = 240  # How many frames of motion to generate

	    print(f"Generating motion for '{text_prompt}' with CFG scale: {cfg_scale}")
	    with torch.no_grad():
	        # Use the correct inference function for the streaming model
	        _, motion_latents = trans_encoder.sample_for_eval_CFG_babel_inference_new_demo(
	            B_text=text_prompt,
	            A_motion=motion_history,
	            tokenizer='t5-xxl',
	            clip_model=t5_model,
	            device=comp_device,
	            cfg=cfg_scale,
	            length=desired_frames,
	        )

	        # Decoding is inference-only too; the result is detached right
	        # below, so no autograd graph is needed here.
	        print("Decoding latents to full motion...")
	        motion_seqs = net.forward_decoder(motion_latents)

	    # --- Denormalize, Correct Speed, and Save ---
	    motion_denormalized = motion_seqs.detach().cpu().numpy() * std + mean

	    # --- FIX 2: Subsample the frames to correct the speed ---
	    # Drops the batch axis and keeps every 4th frame.
	    motion_realtimespeed = motion_denormalized.squeeze(0)[::4, :]

	    output_dir = 'demo_output_streamer'
	    os.makedirs(output_dir, exist_ok=True)

	    # Build a file name from the prompt: spaces -> underscores, quotes removed.
	    safe_filename = text_prompt.replace(" ", "_").replace("'", "")
	    output_bvh_path = os.path.join(output_dir, f'{safe_filename}_final.bvh')
	    save_motion_as_bvh(motion_realtimespeed, output_bvh_path, fps=30)