|
|
import os |
|
|
import torch |
|
|
import numpy as np |
|
|
from models.llama_model import LLaMAHF, LLaMAHFConfig |
|
|
import models.tae as tae |
|
|
import options.option_transformer as option_trans |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import warnings |
|
|
from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion |
|
|
from utils import bvh, quat |
|
|
import smplx |
|
|
|
|
|
# Install a catch-all "ignore" filter at the front of the warnings filter list:
# warnings of every category are suppressed unless a later filter overrides it
# (presumably to keep noisy library warnings out of the console output).
warnings.simplefilter('ignore')
|
|
|
|
|
|
|
|
def _accumulate_heading(heading_deltas):
    """Compose per-frame heading deltas into cumulative heading rotations.

    Frame 0 keeps its own matrix; frame ``i`` is ``delta[i] @ heading[i - 1]``.
    """
    headings = np.zeros_like(heading_deltas)
    headings[0] = heading_deltas[0]
    for frame in range(1, len(heading_deltas)):
        headings[frame] = np.matmul(heading_deltas[frame], headings[frame - 1])
    return headings


def _integrate_root_translation(velocities_root_xy, inv_global_heading_rot, height):
    """Integrate planar root velocities into a per-frame (x, y, z) root translation."""
    nfrm = velocities_root_xy.shape[0]
    velocities_root_xyz = np.zeros((nfrm, 3))
    # The two stored velocity components are placed on the X and Z axes.
    velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
    velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
    # Frame i's velocity is rotated by the inverse heading of frame i - 1;
    # frame 0 is used exactly as stored.
    velocities_root_xyz[1:, :] = np.matmul(
        inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]
    ).squeeze(-1)
    root_translation = np.cumsum(velocities_root_xyz, axis=0)
    # Y is not integrated: it is overwritten with the supplied height.
    root_translation[:, 1] = height
    return root_translation


def save_motion_as_bvh(motion_data, output_path, fps=30):
    """Convert a motion-feature sequence to SMPL joint rotations and save it as BVH.

    Args:
        motion_data: ``torch.Tensor`` or ``numpy.ndarray`` shaped
            ``(frames, features)`` or ``(1, frames, features)`` with at least
            ``8 + 12 * 22 = 272`` features per frame. Columns read here:
            ``0:2`` root planar velocity, ``2:8`` 6D heading delta,
            ``8:74`` joint positions (only joint 0's Y is used, as the root
            height) and ``140:272`` 6D joint rotations for 22 joints.
            Columns ``74:140`` and anything past 272 are ignored.
        output_path: Destination path handed to ``bvh.save``.
        fps: Playback rate; written to the file as ``frametime = 1.0 / fps``.

    Returns:
        None. Any failure (including invalid input) is caught, reported on
        stdout together with a traceback, and not re-raised.
    """
    print(f"--- Converting to BVH: {os.path.basename(output_path)} ---")
    try:
        if isinstance(motion_data, torch.Tensor):
            motion_data = motion_data.detach().cpu().numpy()
        if motion_data.ndim == 3:
            # Drop the leading batch axis (must be of size 1).
            motion_data = motion_data.squeeze(0)

        njoint = 22
        feature_dim = 8 + 12 * njoint

        # Validate the layout up front so that a wrong input produces a clear
        # message instead of an obscure reshape/broadcast/index error below.
        if motion_data.ndim != 2:
            raise ValueError(
                f"motion_data must be 2-D (frames, features) after removing the "
                f"batch axis, got shape {motion_data.shape}"
            )
        nfrm, nfeat = motion_data.shape
        if nfrm == 0:
            raise ValueError("motion_data must contain at least one frame")
        if nfeat < feature_dim:
            raise ValueError(
                f"motion_data needs at least {feature_dim} features per frame, "
                f"got {nfeat}"
            )

        # Per-joint local rotations and the per-frame heading delta, both 6D -> 3x3.
        rotations_matrix = rotation_6d_to_matrix(
            torch.from_numpy(motion_data[:, 8 + 6 * njoint:feature_dim]).reshape(nfrm, -1, 6)
        ).numpy()
        global_heading_diff_rot = rotation_6d_to_matrix(
            torch.from_numpy(motion_data[:, 2:8])
        ).numpy()

        global_heading_rot = _accumulate_heading(global_heading_diff_rot)

        velocities_root_xy = motion_data[:, :2]
        height = motion_data[:, 8:8 + 3 * njoint].reshape(nfrm, -1, 3)[:, 0, 1]

        # Transposing a rotation matrix inverts it; apply the inverse
        # accumulated heading to the root joint's rotation.
        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
        rotations_matrix[:, 0, ...] = np.matmul(
            inv_global_heading_rot, rotations_matrix[:, 0, ...]
        )

        root_translation = _integrate_root_translation(
            velocities_root_xy, inv_global_heading_rot, height
        )

        # 22 joints * 3 axis-angle values fill the first 66 of 72 pose entries;
        # the last two joints of the 24-joint skeleton keep zero rotation.
        axis_angle = matrix_to_axis_angle(
            torch.from_numpy(rotations_matrix)
        ).numpy().reshape(nfrm, -1)
        poses_24_joints = np.zeros((nfrm, 72))
        poses_24_joints[:, :66] = axis_angle

        # Skeleton hierarchy and rest-pose bone offsets come from the SMPL model.
        model = smplx.create(
            model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL"
        )
        parents = model.parents.detach().cpu().numpy()
        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
        offsets = rest_pose - rest_pose[parents]
        # The root has no parent; its offset is zeroed and its placement is
        # carried by positions[:, 0] instead.
        offsets[0] = np.array([0, 0, 0])

        rotations_quat = axis_angle_to_quaternion(
            torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))
        ).numpy()
        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
        # Only the root joint gets a non-zero per-frame position.
        positions = np.zeros_like(rotations_quat[..., :3])
        positions[:, 0] = root_translation

        joint_names = [
            "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee",
            "Spine2", "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot",
            "Neck", "Left_collar", "Right_collar", "Head", "Left_shoulder",
            "Right_shoulder", "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist",
            "Left_hand", "Right_hand",
        ]

        bvh.save(output_path, {
            "rotations": rotations_euler, "positions": positions, "offsets": offsets,
            "parents": parents, "names": joint_names, "order": "zyx", "frametime": 1.0 / fps,
        })
        print(f"✅ BVH file saved successfully to {output_path}")

    except Exception as e:
        # Best-effort export: report the problem and keep the caller running.
        print(f"❌ BVH Conversion Failed. Error: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Prefer CUDA, but fall back to CPU instead of crashing on machines
    # without a usable GPU.
    if torch.cuda.is_available():
        comp_device = torch.device('cuda')
    else:
        print("WARNING: CUDA is not available; running on CPU (this will be slow).")
        comp_device = torch.device('cpu')

    args = option_trans.get_args_parser()
    torch.manual_seed(args.seed)

    # --- Text encoder -----------------------------------------------------
    print("Loading models for MotionStreamer...")
    t5_model = SentenceTransformer('sentencet5-xxl/')
    t5_model.eval().to(comp_device)

    # --- Causal TAE (decodes motion latents back to motion features) -------
    # NOTE(review): torch.load unpickles the checkpoint file; only load
    # checkpoints that come from a trusted source.
    print("Loading Causal TAE (t2m_babel) checkpoint...")
    net = tae.Causal_HumanTAE(latent_dim=16)
    ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
    net.load_state_dict(ckpt['net'], strict=True)
    net.eval().to(comp_device)

    # --- Autoregressive transformer ----------------------------------------
    print("Loading YOUR trained MotionStreamer checkpoint...")
    config = LLaMAHFConfig.from_name('Normal_size')
    trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)

    # Set before load_state_dict (strict=True).
    # NOTE(review): presumably this must match how the checkpoint was trained — confirm.
    trans_encoder.use_out_proj = True

    # NOTE(review): same unpickling caveat as above applies to this checkpoint.
    ckpt = torch.load('Experiments/motionstreamer_model/latest.pth', map_location='cpu')

    trans_encoder.load_state_dict(ckpt['trans'], strict=True)
    trans_encoder.eval().to(comp_device)

    # --- Normalisation statistics used to de-normalise the decoder output ---
    print("Loading mean/std from BABEL dataset...")
    mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
    std = np.load('babel_272/t2m_babel_mean_std/Std.npy')

    # --- Generation settings -----------------------------------------------
    # Empty motion history: zero rows of width 16 (same value as the TAE's
    # latent_dim above).
    motion_history = torch.empty(0, 16).to(comp_device)
    cfg_scale = 7.0
    text_prompt = "a person is running forward"
    desired_frames = 240

    print(f"Generating motion for '{text_prompt}' with CFG scale: {cfg_scale}")
    with torch.no_grad():
        _, motion_latents = trans_encoder.sample_for_eval_CFG_babel_inference_new_demo(
            B_text=text_prompt,
            A_motion=motion_history,
            tokenizer='t5-xxl',
            clip_model=t5_model,
            device=comp_device,
            cfg=cfg_scale,
            length=desired_frames
        )

        print("Decoding latents to full motion...")
        motion_seqs = net.forward_decoder(motion_latents)

    # Undo the dataset normalisation on the decoded features.
    motion_denormalized = motion_seqs.detach().cpu().numpy() * std + mean

    # Drop the batch axis and keep every 4th decoded frame.
    # NOTE(review): presumably chosen so playback speed matches the fps
    # passed to save_motion_as_bvh below — confirm.
    motion_realtimespeed = motion_denormalized.squeeze(0)[::4, :]

    output_dir = 'demo_output_streamer'
    os.makedirs(output_dir, exist_ok=True)

    # Build the output file name from the prompt (spaces -> "_", apostrophes removed).
    safe_filename = text_prompt.replace(" ", "_").replace("'", "")
    output_bvh_path = os.path.join(output_dir, f'{safe_filename}_final.bvh')
    save_motion_as_bvh(motion_realtimespeed, output_bvh_path, fps=30)