Fix render
Files changed:
- README.md (+1 -0)
- app_new.py (+165 -170)
README.md CHANGED

@@ -8,6 +8,7 @@ sdk_version: "6.1.0"
 app_file: app_new.py
 pinned: false
 python_version: "3.10"
+short_description: Text-to-3D motion generation using ONNX INT8 models
 ---
 
 # MoMask: Text-to-Motion Generation
app_new.py CHANGED

@@ -13,10 +13,9 @@ import clip
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
-from matplotlib.animation import FuncAnimation
-import mpl_toolkits.mplot3d.axes3d as p3
-from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+from matplotlib.animation import FuncAnimation, FFMpegWriter
 from pathlib import Path
+
 # ============ Quaternion Operations ============
 def qinv(q):
     """Invert quaternion"""
@@ -30,11 +29,11 @@ def qrot(q, v):
     assert q.shape[-1] == 4
     assert v.shape[-1] == 3
     assert q.shape[:-1] == v.shape[:-1]
-
+
     original_shape = list(v.shape)
     q = q.contiguous().view(-1, 4)
     v = v.contiguous().view(-1, 3)
-
+
     qvec = q[:, 1:]
     uv = torch.cross(qvec, v, dim=1)
     uuv = torch.cross(qvec, uv, dim=1)
@@ -66,10 +65,9 @@ def get_session(name):
     path = ONNX_DIR / f"{name}.onnx"
     if not path.exists():
         raise FileNotFoundError(f"Model not found: {path}")
-    sessions[name] = ort.InferenceSession(str(path), providers=[
+    sessions[name] = ort.InferenceSession(str(path), providers=["CPUExecutionProvider"])
     return sessions[name]
 
-
 # ============ Motion Recovery ============
 def recover_root_rot_pos(data):
     """Recover root rotation and position from motion data"""
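Note on the hunk above: the rewritten line pins inference to CPU. A quick way to confirm the provider took effect, as a sketch assuming the surrounding app_new.py context (where get_session caches ort.InferenceSession objects in a module-level sessions dict):

    # Sketch: verify which ONNX Runtime provider is active.
    # get_session and the "clip_text" model name come from the diff above.
    sess = get_session("clip_text")
    print(sess.get_providers())  # expected: ['CPUExecutionProvider']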
@@ -104,75 +102,111 @@ def recover_from_ric(data, joints_num=22):
 # ============ Visualization ============
 def plot_3d_motion(save_path, joints, title, fps=20):
     """Create MP4 video of 3D skeleton motion"""
-    [~33 removed setup lines (figure, axes, limits, trajectory) not rendered in the diff view]
-    def plot_xzPlane(minx, maxx, miny, minz, maxz):
-        verts = [[minx, miny, minz], [minx, miny, maxz],
-                 [maxx, miny, maxz], [maxx, miny, minz]]
-        xz_plane = Poly3DCollection([verts])
-        xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
-        ax.add_collection3d(xz_plane)
-
-    def update(index):
-        for line in ax.lines[:]: line.remove()
-        for coll in ax.collections[:]: coll.remove()
-        ax.view_init(elev=120, azim=-90)
-        ax.dist = 7.5
-
-        plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0,
-                     MINS[2] - trajec[index, 1], MAXS[2] - trajec[index, 1])
-
-        if index > 1:
-            ax.plot3D(trajec[:index, 0] - trajec[index, 0],
-                      np.zeros_like(trajec[:index, 0]),
-                      trajec[:index, 1] - trajec[index, 1],
-                      linewidth=1.0, color='blue')
-
-        for i, (chain, color) in enumerate(zip(kinematic_tree, colors)):
-            linewidth = 4.0 if i < 5 else 2.0
-            ax.plot3D(data[index, chain, 0], data[index, chain, 1],
-                      data[index, chain, 2], linewidth=linewidth, color=color)
-
-        plt.axis('off')
-        ax.set_xticklabels([])
-        ax.set_yticklabels([])
-        ax.set_zticklabels([])
-
-    ani = FuncAnimation(fig, update, frames=frame_number, interval=1000/fps, repeat=False)
-    ani.save(save_path, fps=fps)
+    fig = plt.figure(figsize=(8, 8))
+    ax = fig.add_subplot(111, projection="3d")
+    COLORS = ["red", "blue", "black", "green", "purple"]
+
+    def init():
+        ax.set_xlim(-1.5, 1.5)
+        ax.set_ylim(-1.5, 1.5)
+        ax.set_zlim(0, 2)
+        ax.set_xlabel("X")
+        ax.set_ylabel("Z")
+        ax.set_zlabel("Y (up)")
+        ax.set_title(title)
+        return []
+
+    lines = []
+    for i, chain in enumerate(T2M_KINEMATIC_CHAIN):
+        line, = ax.plot([], [], [], color=COLORS[i], linewidth=2, marker="o", markersize=3)
+        lines.append(line)
+
+    def update(frame):
+        data = joints[frame]
+        for i, chain in enumerate(T2M_KINEMATIC_CHAIN):
+            x = [data[j, 0] for j in chain]
+            y = [data[j, 2] for j in chain]
+            z = [data[j, 1] for j in chain]
+            lines[i].set_data(x, y)
+            lines[i].set_3d_properties(z)
+        ax.view_init(elev=20, azim=45 + frame * 0.5)
+        return lines
+
+    ani = FuncAnimation(fig, update, frames=len(joints), init_func=init, blit=False, interval=1000//fps)
+    writer = FFMpegWriter(fps=fps, bitrate=2000)
+    ani.save(save_path, writer=writer)
     plt.close()
-
+# ============ BVH Export ============
+def joints_to_bvh(joints, output_path, fps=20):
+    """Convert joint positions to BVH format for Blender import."""
+    n_frames, n_joints, _ = joints.shape
+
+    joint_names = [
+        "Hips", "LeftUpLeg", "RightUpLeg", "Spine", "LeftLeg", "RightLeg",
+        "Spine1", "LeftFoot", "RightFoot", "Spine2", "LeftToe", "RightToe",
+        "Neck", "LeftShoulder", "RightShoulder", "Head", "LeftArm", "RightArm",
+        "LeftForeArm", "RightForeArm", "LeftHand", "RightHand"
+    ]
+
+    parents = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, 16, 17, 18, 19]
+
+    offsets = np.zeros((n_joints, 3))
+    ref_frame = joints[0]
+    for i in range(n_joints):
+        if parents[i] >= 0:
+            offsets[i] = ref_frame[i] - ref_frame[parents[i]]
+
+    scale = 100.0
+    offsets *= scale
+    joints_scaled = joints * scale
+
+    with open(output_path, "w") as f:
+        f.write("HIERARCHY" + chr(10))
+
+        def write_joint(idx, indent):
+            name = joint_names[idx]
+            off = offsets[idx]
+            prefix = " " * indent
+
+            children = [i for i, p in enumerate(parents) if p == idx]
+
+            if idx == 0:
+                f.write(f"ROOT {name}" + chr(10))
+            else:
+                f.write(f"{prefix}JOINT {name}" + chr(10))
+
+            f.write(f"{prefix}{{" + chr(10))
+            f.write(f"{prefix} OFFSET {off[0]:.6f} {off[1]:.6f} {off[2]:.6f}" + chr(10))
+
+            if idx == 0:
+                f.write(f"{prefix} CHANNELS 6 Xposition Yposition Zposition Xrotation Yrotation Zrotation" + chr(10))
+            else:
+                f.write(f"{prefix} CHANNELS 3 Xrotation Yrotation Zrotation" + chr(10))
+
+            if children:
+                for child in children:
+                    write_joint(child, indent + 1)
+            else:
+                f.write(f"{prefix} End Site" + chr(10))
+                f.write(f"{prefix} {{" + chr(10))
+                f.write(f"{prefix}  OFFSET 0.0 0.0 0.0" + chr(10))
+                f.write(f"{prefix} }}" + chr(10))
+
+            f.write(f"{prefix}}}" + chr(10))
+
+        write_joint(0, 0)
+
+        f.write("MOTION" + chr(10))
+        f.write(f"Frames: {n_frames}" + chr(10))
+        f.write(f"Frame Time: {1.0/fps:.6f}" + chr(10))
+
+        for frame in range(n_frames):
+            root_pos = joints_scaled[frame, 0]
+            values = [root_pos[0], root_pos[1], root_pos[2]]
+            values.extend([0.0] * 3 * n_joints)
+            f.write(" ".join(f"{v:.6f}" for v in values) + chr(10))
+
+    return output_path
 
 # ============ Sampling Utilities ============
 def cosine_schedule(t):
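Note on joints_to_bvh: the MOTION block writes the root translation followed by zeros for every rotation channel, so an importer (e.g. Blender) gets the skeleton hierarchy and rest-pose offsets but no limb animation. A minimal smoke test, as a sketch assuming app_new.py's imports and definitions are in scope:

    import numpy as np

    # 40 frames of a dummy 22-joint skeleton; joints stacked vertically,
    # root drifting forward so the one animated channel is visible.
    dummy = np.zeros((40, 22, 3), dtype=np.float32)
    dummy[:, :, 1] = np.linspace(0.0, 1.7, 22)[None, :]
    dummy[:, 0, 2] = np.linspace(0.0, 1.0, 40)
    joints_to_bvh(dummy, "smoke.bvh", fps=20)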
@@ -183,7 +217,7 @@ def top_k_filter(logits, k=0.9):
     """Apply top-k filtering"""
     k = int((1 - k) * logits.shape[-1])
     val, ind = torch.topk(logits, k, dim=-1)
-    probs = torch.full_like(logits, float(
+    probs = torch.full_like(logits, float("-inf"))
     probs.scatter_(-1, ind, val)
     return probs
 
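For reference, the two helpers above are easy to exercise in isolation (a sketch; run with app_new.py's definitions in scope, torch only). With the default k=0.9, int((1 - 0.9) * 512) = 51 logits per position survive the filter:

    import torch

    logits = torch.randn(1, 8, 512)         # (batch, token positions, codebook)
    filtered = top_k_filter(logits, 0.9)    # everything outside the top 51 -> -inf
    new_ids = gumbel_sample(filtered, 1.0)  # Gumbel-max sampling over the last dim
    print(new_ids.shape)                    # torch.Size([1, 8])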
@@ -191,187 +225,147 @@ def gumbel_sample(logits, temperature=1.0):
     """Gumbel softmax sampling"""
     gumbels = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
     return ((logits / max(temperature, 1e-10)) + gumbels).argmax(dim=-1)
 
 # ============ Main Generation Pipeline ============
-def generate_motion(text, motion_length=0, seed=None):
-    """Generate motion from text prompt
-
-    Args:
-        text: Text description of motion
-        motion_length: Length in seconds (0 = auto-estimate)
-        seed: Random seed for reproducibility
-
-    Returns:
-        joints: 3D joint positions (N, 22, 3)
-        video_path: Path to rendered MP4
-    """
+def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
+    """Generate motion from text prompt"""
     if seed is not None:
         torch.manual_seed(seed)
         np.random.seed(seed)
 
-    # Load mean/std for denormalization
     mean = np.load(ONNX_DIR / "mean.npy")
     std = np.load(ONNX_DIR / "std.npy")
 
-    # 1. Tokenize text with CLIP
     tokens = clip.tokenize([text], truncate=True)
 
-    # 2. Encode text with CLIP
     clip_sess = get_session("clip_text")
-    text_emb = clip_sess.run(None, {
+    text_emb = clip_sess.run(None, {"text_tokens": tokens.numpy()})[0]
 
-    # 3. Estimate motion length
     if motion_length <= 0:
         len_sess = get_session("length_estimator")
-        len_logits = len_sess.run(None, {
-        # Sample from distribution
+        len_logits = len_sess.run(None, {"text_embedding": text_emb})[0]
         probs = torch.softmax(torch.from_numpy(len_logits), dim=-1)
         token_len = torch.multinomial(probs, 1).item()
     else:
-        # Convert seconds to tokens (20 fps, 4 frames per token)
         token_len = int(motion_length * 20 / 4)
 
     token_len = max(2, min(token_len, 49))
     m_length = token_len * 4
     max_len = 49
 
     print(f"Generating motion: '{text}' ({m_length} frames, {m_length/20:.1f}s)")
 
-    pad_id = 513  # Pad token ID
+    mask_id = 512
+    pad_id = 513
     ids = torch.full((1, max_len), pad_id, dtype=torch.long)
     ids[:, :token_len] = mask_id
     scores = torch.zeros(1, max_len)
     scores[:, token_len:] = 1e5
 
-    # Create padding mask (True = padded)
     padding_mask = np.zeros((1, max_len), dtype=bool)
     padding_mask[:, token_len:] = True
 
-    # 5. Iterative generation with MaskTransformer
     mask_sess = get_session("mask_transformer")
 
     for step in range(TIMESTEPS):
         t = step / TIMESTEPS
         rand_mask_prob = cosine_schedule(torch.tensor(t)).item()
-
-        # Number of tokens to mask (only in valid region)
         num_masked = max(1, int(rand_mask_prob * token_len))
 
-        # Get lowest scoring positions to mask (only in valid region)
         valid_scores = scores[:, :token_len].clone()
         _, sorted_idx = valid_scores.sort(dim=1)
         mask_pos = sorted_idx[:, :num_masked]
         is_mask = torch.zeros(1, token_len, dtype=torch.bool)
         is_mask.scatter_(1, mask_pos, True)
 
-        # Apply mask only to valid positions
         ids[:, :token_len] = torch.where(is_mask, mask_id, ids[:, :token_len])
 
-        # Run transformer with fixed max_len
         logits = mask_sess.run(None, {
-        })[0]
-
-        logits = torch.from_numpy(logits)  # (1, 514, max_len)
+            "motion_ids": ids.numpy(),
+            "cond_vector": text_emb,
+            "padding_mask": padding_mask
+        })[0]
 
-        logits = logits[:, :512, :token_len]
-        logits = logits.permute(0, 2, 1)
+        logits = torch.from_numpy(logits)
+        logits = logits[:, :512, :token_len]
+        logits = logits.permute(0, 2, 1)
 
-        # Apply temperature and top-k filtering
         filtered_logits = top_k_filter(logits / TEMPERATURE, TOPK_FILTER)
-
-        # Sample new tokens
-        new_ids = gumbel_sample(filtered_logits, TEMPERATURE)  # (1, token_len)
-
-        # Get confidence scores
+        new_ids = gumbel_sample(filtered_logits, TEMPERATURE)
 
         probs = torch.softmax(filtered_logits, dim=-1)
         new_scores = probs.gather(-1, new_ids.unsqueeze(-1)).squeeze(-1)
 
-        # Update only masked positions (in valid region)
         ids[:, :token_len] = torch.where(is_mask, new_ids, ids[:, :token_len])
         scores[:, :token_len] = torch.where(is_mask, new_scores, scores[:, :token_len])
 
-    # 6. Residual refinement with ResidualTransformer
     res_sess = get_session("residual_transformer")
     num_quantizers = 6
 
-    res_token_embed = np.load(ONNX_DIR / "res_token_embed.npy")  # (5, 513, 512)
+    res_token_embed = np.load(ONNX_DIR / "res_token_embed.npy")
 
-    # Initialize all quantizer codes
     all_codes = torch.zeros(1, max_len, num_quantizers, dtype=torch.long)
     all_codes[:, :, 0] = ids
 
-    # Accumulate code embeddings for residual refinement
     history_sum = np.zeros((1, max_len, 512), dtype=np.float32)
     motion_ids = ids.clone()
 
     for q in range(1, num_quantizers):
-        token_embed = res_token_embed[q-1]  # (513, 512)
-        # Gather embeddings for each position (clamp padding to valid range)
+        token_embed = res_token_embed[q-1]
         clamped_ids = np.clip(motion_ids[0].numpy(), 0, 512)
         gathered = token_embed[clamped_ids]
         history_sum += gathered[np.newaxis, :, :]
 
         q_id = np.array([q], dtype=np.int64)
 
         logits = res_sess.run(None, {
+            "motion_codes": history_sum.astype(np.float32),
+            "q_id": q_id,
+            "cond_vector": text_emb,
+            "padding_mask": padding_mask
         })[0]
 
         logits = torch.from_numpy(logits)[:, :512, :token_len].permute(0, 2, 1)
         new_ids_q = gumbel_sample(logits, 1.0)
         all_codes[:, :token_len, q] = new_ids_q
         motion_ids[:, :token_len] = new_ids_q
 
-    # 7. Decode motion with VQVAE (only valid tokens)
     decoder_sess = get_session("vqvae_decoder")
     valid_codes = all_codes[:, :token_len, :].numpy()
     motion = decoder_sess.run(None, {
-    })[0]
+        "code_indices": valid_codes
+    })[0]
 
-    # Upsample to full length (token_len -> m_length via stride=2, down_t=2 -> 4x)
     motion = np.repeat(motion, 4, axis=1)[:, :m_length, :]
-
-    # 8. Denormalize
     motion = motion * std + mean
 
-    # 9. Recover 3D joint positions
     motion_tensor = torch.from_numpy(motion).float()
     joints = recover_from_ric(motion_tensor, JOINTS_NUM)
     joints = joints.squeeze(0).numpy()
 
-    video_path = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name
+    video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
     plot_3d_motion(video_path, joints, text, fps=20)
 
+    bvh_path = None
+    if export_bvh:
+        bvh_path = tempfile.NamedTemporaryFile(suffix=".bvh", delete=False).name
+        joints_to_bvh(joints, bvh_path, fps=20)
+        print(f"BVH exported: {bvh_path}")
 
+    return joints, video_path, bvh_path
 # ============ Gradio Interface ============
 def create_demo():
     import gradio as gr
 
-    def generate_fn(text, length, seed):
+    def generate_fn(text, length, seed, export_bvh):
         if not text or text.strip() == "":
-            return None
+            return None, None
         seed = int(seed) if seed else None
         length = float(length) if length else 0
-        return video_path
+        joints, video_path, bvh_path = generate_motion(text, length, seed, export_bvh)
+        return video_path, bvh_path
 
     with gr.Blocks(title="MoMask") as demo:
         gr.Markdown("## [MoMask](https://github.com/EricGuo5513/momask-codes) - Text to Motion")
-        gr.Markdown("Generate 3D human skeleton animations from text descriptions.")
+        gr.Markdown("Generate 3D human skeleton animations from text descriptions. Download BVH for Blender!")
 
         with gr.Row():
             with gr.Column():
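The length bookkeeping in generate_motion is worth spelling out: 20 fps and 4 motion frames per token, with token_len clamped to [2, 49], i.e. 8 to 196 frames (0.4 to 9.8 s). A tiny self-check mirroring the arithmetic above:

    def seconds_to_tokens(seconds):
        # Mirrors generate_motion: 20 fps / 4 frames per token, clamped to [2, 49].
        return max(2, min(int(seconds * 20 / 4), 49))

    assert seconds_to_tokens(3.0) == 15    # 15 tokens * 4 frames = 60 frames = 3.0 s
    assert seconds_to_tokens(60.0) == 49   # clamped to the model maximum (9.8 s)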
@@ -383,41 +377,42 @@ def create_demo():
                                    info="0 = auto-estimate")
                 seed = gr.Number(label="Seed", value=42,
                                  info="For reproducibility")
+                export_bvh = gr.Checkbox(label="Export BVH for Blender", value=True)
                 btn = gr.Button("Generate", variant="primary")
 
             with gr.Column():
                 video = gr.Video(label="Generated Motion")
+                bvh_file = gr.File(label="BVH Download")
 
         gr.Examples(
             examples=[
-                ["A person walks forward", 0, 42],
-                ["A person is running on a treadmill", 0, 123],
-                ["A person jumps up and then lands", 0, 456],
-                ["A person does a salsa dance", 0, 789],
-                ["A person kicks with their right leg", 0, 101],
+                ["A person walks forward", 0, 42, True],
+                ["A person is running on a treadmill", 0, 123, True],
+                ["A person jumps up and then lands", 0, 456, True],
+                ["A person does a salsa dance", 0, 789, True],
+                ["A person kicks with their right leg", 0, 101, True],
             ],
-            inputs=[text, length, seed],
-            outputs=video,
+            inputs=[text, length, seed, export_bvh],
+            outputs=[video, bvh_file],
             fn=generate_fn,
             cache_examples=False,
         )
 
-        btn.click(fn=generate_fn, inputs=[text, length, seed], outputs=video)
+        btn.click(fn=generate_fn, inputs=[text, length, seed, export_bvh], outputs=[video, bvh_file])
 
     return demo
 
 # ============ CLI ============
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-        # CLI mode: python app.py "motion description" [length] [seed]
         text = sys.argv[1]
         length = float(sys.argv[2]) if len(sys.argv) > 2 else 0
         seed = int(sys.argv[3]) if len(sys.argv) > 3 else 42
 
-        joints, video_path = generate_motion(text, length, seed)
-        print(f"
+        joints, video_path, bvh_path = generate_motion(text, length, seed, export_bvh=True)
+        print(f"Video: {video_path}")
+        print(f"BVH: {bvh_path}")
         print(f"Joints shape: {joints.shape}")
     else:
-        # Gradio mode
         demo = create_demo()
         demo.launch()
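Usage after this commit (a sketch based on the argv handling above; the ONNX models under ONNX_DIR must be present, and bvh_path is None unless export_bvh is set):

    # CLI: python app_new.py "A person walks forward" 0 42
    # Programmatic equivalent:
    joints, video_path, bvh_path = generate_motion(
        "A person walks forward", motion_length=0, seed=42, export_bvh=True)
    print(video_path, bvh_path, joints.shape)  # joints: (m_length, 22, 3)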