zirobtc committed
Commit be72f05 · verified · 1 Parent(s): 3c212d2

Upload demo_stream.py

Files changed (1)
  1. demo_stream.py +311 -0
demo_stream.py ADDED
@@ -0,0 +1,311 @@
import os
import torch
import numpy as np
import warnings

import smplx

from models.llama_model import LLaMAHF, LLaMAHFConfig
import models.tae as tae
import options.option_transformer as option_trans
from utils import bvh, quat
from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion

warnings.filterwarnings('ignore')


class MockTextEncoder:
    """Stand-in for the T5 text encoder: returns a one-hot embedding whose
    active dimension is chosen by hashing the prompt."""

    def __init__(self, dim: int = 768):
        self.dim = dim

    def to(self, device):
        return self

    def eval(self):
        return self

    def parameters(self):
        return []

    def encode(self, text):
        if isinstance(text, list):
            batch = len(text)
        else:
            batch = 1
            text = [text]
        embeddings = torch.zeros(batch, self.dim)
        for i, t in enumerate(text):
            val = hash(t) % self.dim
            embeddings[i, val] = 1.0
        return embeddings.numpy()


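# Illustrative behavior of the mock encoder (not executed by this script):
#   >>> MockTextEncoder().encode("a person walks forward").shape
#   (1, 768)
# Note that Python string hashes are salted per process, so the active
# dimension changes between runs unless PYTHONHASHSEED is fixed.

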
# --- save_motion_as_bvh function is unchanged ---
def save_motion_as_bvh(motion_data, output_path, fps=30):
    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
    try:
        if isinstance(motion_data, torch.Tensor):
            motion_data = motion_data.detach().cpu().numpy()
        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
            motion_data = motion_data.squeeze(0)
        elif motion_data.ndim != 2:
            raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")
        njoint = 22
        nfrm, _ = motion_data.shape
        # Per-joint 6D rotations sit at [8 + 6*njoint : 8 + 12*njoint].
        rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8 + 6 * njoint: 8 + 12 * njoint]).reshape(nfrm, -1, 6)).numpy()
        # Accumulate the per-frame global-heading deltas (channels 2:8, 6D) into
        # an absolute heading per frame.
        global_heading_diff_rot = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 2:8])).numpy()
        global_heading_rot = np.zeros_like(global_heading_diff_rot)
        global_heading_rot[0] = global_heading_diff_rot[0]
        for i in range(1, nfrm):
            global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i - 1])
        velocities_root_xy = motion_data[:, :2]
        height = motion_data[:, 8: 8 + 3 * njoint].reshape(nfrm, -1, 3)[:, 0, 1]
        # Undo the heading on the root rotation, rotate the root velocities back
        # into world space, then integrate them into a root trajectory.
        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
        rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
        velocities_root_xyz = np.zeros((nfrm, 3))
        velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
        velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
        velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
        root_translation = np.cumsum(velocities_root_xyz, axis=0)
        root_translation[:, 1] = height
        # Pad the 22 body joints to the 24-joint SMPL layout (hand joints stay zero).
        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy().reshape(nfrm, -1)
        poses_24_joints = np.zeros((nfrm, 72))
        poses_24_joints[:, :66] = axis_angle
        # Recover skeleton offsets from the SMPL rest pose.
        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
        parents = model.parents.detach().cpu().numpy()
        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
        offsets = rest_pose - rest_pose[parents]
        offsets[0] = np.array([0, 0, 0])
        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))).numpy()
        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
        positions = np.zeros_like(rotations_quat[..., :3])
        positions[:, 0] = root_translation
        joint_names = ["Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2", "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck", "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder", "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"]
        bvh.save(output_path, {"rotations": rotations_euler, "positions": positions, "offsets": offsets, "parents": parents, "names": joint_names, "order": "zyx", "frametime": 1.0 / fps})
        print(f"✅ BVH file saved successfully to {output_path}")
    except Exception as e:
        import traceback
        print(f"❌ BVH Conversion Failed. Error: {e}")
        traceback.print_exc()


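# Layout of the motion feature vector, as implied by the slicing above
# (reconstructed from this function's indexing, not from external docs):
#   [0:2]               root linear velocity on the ground plane (mapped to X/Z)
#   [2:8]               per-frame global-heading delta as a 6D rotation
#   [8 : 8+3*22]        joint positions (root height read from joint 0's Y)
#   [8+6*22 : 8+12*22]  per-joint 6D rotations

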
def _to_prompt_tensor(embedding: np.ndarray, device: torch.device) -> torch.Tensor:
    tensor = torch.from_numpy(embedding).float() if isinstance(embedding, np.ndarray) else embedding.float()
    if tensor.dim() == 1:
        tensor = tensor.unsqueeze(0)
    return tensor.to(device)


def _set_prompt(trans: LLaMAHF, prompt_feat: torch.Tensor) -> None:
    trans.clear_prompt()
    trans.set_prompt(prompt_feat)


def _states_for_prompt(trans: LLaMAHF, latents: torch.Tensor, prompt_feat: torch.Tensor) -> torch.Tensor:
    _set_prompt(trans, prompt_feat)
    outputs = trans(latents, feature=None)
    # Drop the final position: the state at position i is used to predict token i+1.
    return outputs[:, :-1, :]


def _predict_sequence(
    trans: LLaMAHF,
    cond_seq: torch.Tensor,
    uncond_seq: torch.Tensor,
    cfg_scale: float,
    temperature: float,
) -> torch.Tensor:
    batch, seq_len, _ = cond_seq.shape
    if seq_len == 0:
        # Nothing to condition on yet: feed a single zero state so the
        # diffusion head can still produce a first latent.
        cond_seq = torch.zeros(batch, 1, trans.config.n_embd, device=cond_seq.device)
        uncond_seq = torch.zeros_like(cond_seq)
        seq_len = 1

    # Stack conditional and unconditional states along the batch axis so the
    # diffusion head can sample both in one call.
    mix = torch.cat([cond_seq, uncond_seq], dim=0)  # [2B, L, D]
    flat = mix.reshape(mix.size(0) * seq_len, -1)
    trans.diff_loss.set_sequence_layout(mix.size(0), seq_len)
    sampled = trans.diff_loss.sample(flat, temperature=temperature, cfg=cfg_scale)

    if cfg_scale != 1.0:
        cond_flat, _ = sampled.chunk(2, dim=0)
    else:
        cond_flat = sampled[: batch * seq_len, :]

    target_dim = trans.diff_loss.in_channels
    return cond_flat.view(batch, seq_len, target_dim)


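# Classifier-free guidance note: conditional and unconditional states are
# stacked so diff_loss.sample() can combine the two passes; the usual
# formulation is
#     eps = eps_uncond + cfg_scale * (eps_cond - eps_uncond).
# The exact combination is implemented inside diff_loss.sample(), not here.

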
def _sample_next_token(
    trans: LLaMAHF,
    current_seq: torch.Tensor,
    latent_dim: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    temperature: float,
    cfg_scale: float,
    device: torch.device,
) -> torch.Tensor:
    # Append a zero placeholder for the token we are about to predict.
    history = current_seq.unsqueeze(0)
    placeholder = torch.zeros(1, 1, latent_dim, device=device)
    latents = torch.cat([history, placeholder], dim=1)

    # Run the backbone twice: once with the text prompt, once unconditioned.
    cond_seq = _states_for_prompt(trans, latents, cond_prompt)
    uncond_seq = _states_for_prompt(trans, latents, uncond_prompt)
    _set_prompt(trans, cond_prompt)

    pred_seq = _predict_sequence(
        trans=trans,
        cond_seq=cond_seq,
        uncond_seq=uncond_seq,
        cfg_scale=cfg_scale,
        temperature=temperature,
    )

    new_token = pred_seq[:, -1, :][0]
    return torch.cat([current_seq, new_token.unsqueeze(0)], dim=0)


def _refine_sequence(
    trans: LLaMAHF,
    sequence: torch.Tensor,
    frozen_prefix: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    temperature: float,
    cfg_scale: float,
    device: torch.device,
) -> torch.Tensor:
    # Re-sample every token after the frozen prefix, conditioning each
    # position on the (partially refined) tokens before it.
    total_len = sequence.shape[0]
    for idx in range(frozen_prefix, total_len):
        history = sequence[:idx]
        predicted = _sample_next_token(
            trans=trans,
            current_seq=history,
            latent_dim=sequence.size(1),
            cond_prompt=cond_prompt,
            uncond_prompt=uncond_prompt,
            temperature=temperature,
            cfg_scale=cfg_scale,
            device=device,
        )
        sequence[idx] = predicted[-1]
    return sequence


def generate_motion_latents(
    trans: LLaMAHF,
    initial_tokens: torch.Tensor,
    latent_dim: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    num_new_tokens: int,
    cfg_scale: float,
    temperature: float,
    device: torch.device,
) -> torch.Tensor:
    trans.eval()
    _set_prompt(trans, cond_prompt)

    # Pass 1: extend the history autoregressively, one latent at a time.
    seq = initial_tokens.clone()
    for _ in range(num_new_tokens):
        seq = _sample_next_token(
            trans=trans,
            current_seq=seq,
            latent_dim=latent_dim,
            cond_prompt=cond_prompt,
            uncond_prompt=uncond_prompt,
            temperature=temperature,
            cfg_scale=cfg_scale,
            device=device,
        )

    # Pass 2: refine the newly generated tokens, keeping the original history frozen.
    refined = _refine_sequence(
        trans=trans,
        sequence=seq.clone(),
        frozen_prefix=initial_tokens.shape[0],
        cond_prompt=cond_prompt,
        uncond_prompt=uncond_prompt,
        temperature=temperature,
        cfg_scale=cfg_scale,
        device=device,
    )
    return refined


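# Note: generation is two-pass — an autoregressive extension followed by a
# refinement sweep over the new tokens. Passing a non-empty initial_tokens
# would continue an existing latent stream; the demo below starts from an
# empty history.

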
if __name__ == '__main__':
    comp_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args = option_trans.get_args_parser()
    torch.manual_seed(args.seed)

    # --- Load Models ---
    print("Loading models for MotionStreamer...")
    t5_model = MockTextEncoder()
    t5_model.eval()
    for p in t5_model.parameters():
        p.requires_grad = False

    print("Loading Causal TAE (t2m_babel) checkpoint...")
    tae_net = tae.Causal_HumanTAE(
        hidden_size=1024, down_t=2, stride_t=2, depth=3, dilation_growth_rate=3,
        latent_dim=16, clip_range=[-30, 20]
    )
    tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
    tae_net.load_state_dict(tae_ckpt['net'], strict=True)
    tae_net.eval()
    tae_net.to(comp_device)

    config = LLaMAHFConfig.from_name('Normal_size')
    trans_encoder = LLaMAHF(
        config=config,
        num_diffusion_head_layers=args.num_diffusion_head_layers,
        input_token_dim=args.latent_dim,
        device=comp_device,
    )

    # --- Checkpoint loading (currently disabled) ---
    # The block below loads a trained MotionStreamer checkpoint and strips the
    # 'module.' prefix that DataParallel/DDP adds to state-dict keys. With it
    # commented out, the transformer runs with randomly initialized weights.
    #
    # checkpoint_path = 'motionstreamer_model/latest.pth'  # adjust to where you run the script
    # trans_ckpt = torch.load(checkpoint_path, map_location='cpu')
    #
    # unwrapped_state_dict = {}
    # for key, value in trans_ckpt['trans'].items():
    #     if key.startswith('module.'):
    #         unwrapped_state_dict[key[len('module.'):]] = value
    #     else:
    #         unwrapped_state_dict[key] = value
    #
    # trans_encoder.load_state_dict(unwrapped_state_dict, strict=True)
    # print("Successfully loaded unwrapped checkpoint.")

    trans_encoder.eval()
    trans_encoder.to(comp_device)

    # --- Rest of the script is unchanged ---
    print("Loading mean/std from BABEL dataset...")
    mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
    std = np.load('babel_272/t2m_babel_mean_std/Std.npy')

    latent_dim = args.latent_dim
    motion_history = torch.empty(0, latent_dim, device=comp_device)
    cfg_scale = 10.0
    temperature = 1.3
    unit_length = 4                     # frames per latent token (matches the TAE's temporal downsampling)
    target_tokens = 240 // unit_length  # 240 frames -> 60 latent tokens

    print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
    text_embedding = _to_prompt_tensor(t5_model.encode(args.text), comp_device)
    empty_embedding = _to_prompt_tensor(t5_model.encode(''), comp_device)
    num_new_tokens = max(0, target_tokens - motion_history.shape[0])

    with torch.no_grad():
        generated_seq = generate_motion_latents(
            trans=trans_encoder,
            initial_tokens=motion_history,
            latent_dim=latent_dim,
            cond_prompt=text_embedding,
            uncond_prompt=empty_embedding,
            num_new_tokens=num_new_tokens,
            cfg_scale=cfg_scale,
            temperature=temperature,
            device=comp_device,
        )
    motion_latents = generated_seq.unsqueeze(0)

    print("Decoding latents to full motion...")
    motion_seqs = tae_net.forward_decoder(motion_latents)

    motion = motion_seqs.detach().cpu().numpy()
    motion_denormalized = motion * std + mean

    output_dir = 'demo_output_streamer'
    os.makedirs(output_dir, exist_ok=True)

    output_bvh_path = os.path.join(output_dir, f'{args.text.replace(" ", "_")}_cfg{cfg_scale}.bvh')
    save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)
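# Example invocation (illustrative; only --text is read directly here, and the
# remaining flags are defined in options/option_transformer.py):
#   python demo_stream.py --text "a person walks forward and waves"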