"""Gradio app for MLD (Motion Latent Diffusion) text-to-motion generation.

Runs as a Hugging Face Space on ZeroGPU.
"""

from pathlib import Path
import gradio as gr
import torch
from datetime import datetime
import tempfile
from tqdm import tqdm
from textwrap import dedent
import spaces
from motion_latent_diffusion_standalone import MotionLatentDiffusionModel
from visualize import create_video_from_joints
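
# The standalone MLD pipeline couples a motion VAE, a latent-space denoiser,
# and a CLIP text encoder (see the About text rendered at the bottom of the UI).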
model = MotionLatentDiffusionModel(
    vae_repo_id="blanchon/motion-latent-diffusion-standalone-vae",
    denoiser_repo_id="blanchon/motion-latent-diffusion-standalone-denoiser",
    text_encoder_repo_id="openai/clip-vit-large-patch14",
)
model.to("cuda")
model.eval()
model.requires_grad_(False)
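
# On ZeroGPU Spaces, @spaces.GPU requests a GPU only for the duration of each
# call to the decorated function; the model itself is loaded at import time.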
@spaces.GPU
def generate_motion(
    text_prompt: str, motion_length: int, progress=gr.Progress(track_tqdm=True)
) -> tuple[Path, str, Path]:
    try:
        # Create temporary files
        temp_dir = tempfile.gettempdir()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"motion_{timestamp}"
        pt_path = Path(temp_dir) / f"{filename}.pt"
        video_path = Path(temp_dir) / f"{filename}.mp4"
print("π¬ Generating motion...")
with tqdm(
total=motion_length,
desc="Generating motion",
# disable=not progress.is_tracked(),
) as pbar:
def callback_on_step_end(i: int, latents: torch.Tensor):
pbar.update(i)

            # Generate motion (returns PyTorch tensors)
            joints, latent = model.generate(
                text_prompt,
                motion_length,
                return_latent=True,
                callback_on_step_end=callback_on_step_end,
            )

        # Save motion data as a PyTorch tensor
        torch.save(joints, pt_path)
print("π₯ Creating visualization...")
# Create video visualization
video_path = create_video_from_joints(joints, video_path.as_posix(), fps=20)
print("β
Done!")

        # Generate info text (must be an f-string so the fields interpolate)
        info_text = dedent(f"""
            ✅ **Generation Complete!**

            **Prompt:** {text_prompt}
            **Motion Length:** {motion_length} frames ({motion_length / 20:.1f}s at 20fps)
            **Output Shape:** {joints.shape} (frames × joints × coords)

            The video shows a 3D skeleton performing the motion.
            You can download both the video and the raw motion data below.
            """)

        return video_path, info_text, pt_path.as_posix()

    except Exception as e:
        error_msg = f"Error during generation: {str(e)}"
        import traceback

        traceback.print_exc()
        return None, error_msg, None
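
# Minimal non-UI sketch of the same pipeline (assuming `model.generate` returns
# just the joints tensor when `return_latent` is left off):
#   joints = model.generate("a person walks forward", 80)
#   create_video_from_joints(joints, "motion.mp4", fps=20)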


def create_example_prompts():
    """Return example prompts for the interface."""
    return [
        ["a person walks forward slowly", 80],
        ["jumping up and down", 100],
        ["a person waves hello", 60],
        ["running in place", 100],
        ["a person does jumping jacks", 120],
        ["someone performs a cartwheel", 140],
        ["walking backwards carefully", 90],
        ["a person stretches their arms", 80],
    ]
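
# Each row above is [prompt, motion_length]; gr.Examples maps the entries
# positionally onto the widgets passed as `inputs`.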


with gr.Blocks(title="MLD Text-to-Motion Generator", theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown("""
        # 🎬 MLD Text-to-Motion Generator

        Generate realistic human motion animations from text descriptions!
        Powered by Motion Latent Diffusion (MLD).

        ### 💡 Tips for Best Results:
        - Be specific: "a person walks forward slowly" works better than just "walking"
        - Use present tense: "walks" or "is walking"
        - Describe single continuous actions
        - Recommended length: 40-60 frames for short actions, 80-120 for walking/running
        """)

    with gr.Row():
        # Left column - Inputs
        with gr.Column(scale=1):
            gr.Markdown("## 📝 Input")

            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Enter motion description (e.g., 'a person walks forward slowly')",
                lines=3,
                value="a person walks forward",
            )

            with gr.Row():
                length_slider = gr.Slider(
                    minimum=16,
                    maximum=196,
                    value=100,
                    step=1,
                    label="Motion Length (frames)",
                    info="20 frames = 1 second",
                )

            generate_btn = gr.Button("🎬 Generate Motion", variant="primary", size="lg")

            gr.Markdown("### 📋 Example Prompts")
            gr.Examples(
                examples=create_example_prompts(),
                inputs=[text_input, length_slider],
                label=None,
            )

        # Right column - Outputs
        with gr.Column(scale=1):
            gr.Markdown("## 🎥 Output")

            info_output = gr.Markdown(
                "Generate a motion to see the results here.",
                elem_classes=["output-info"],
            )

            video_output = gr.Video(
                label="Generated Motion Video",
                elem_classes=["output-video"],
                autoplay=True,
                show_share_button=True,
            )

            with gr.Row():
                pt_download = gr.File(label="Download Motion Data (.pt)", visible=False)

    # Footer
    gr.Markdown(
        dedent("""
        ---
        ### ℹ️ About

        **Motion Latent Diffusion (MLD)** generates 3D human motion by:
        1. Encoding text with CLIP
        2. Generating motion in latent space via diffusion (50 steps)
        3. Decoding to 3D joint positions (22 joints)
        4. Visualizing as a 3D skeleton animation

        **Citation:** Chen et al., "Executing your Commands via Motion Diffusion in Latent Space", CVPR 2023

        **Repository:** [motion-latent-diffusion](https://github.com/ChenFengYe/motion-latent-diffusion)
        """)
    )

    # Event handlers: wrap generate_motion so the .pt download widget is only
    # shown when a file was actually produced.
    def generate_and_update(text, length):
        video, info, pt = generate_motion(text, length)
        if pt:
            return video, info, gr.update(value=pt, visible=True)
        return video, info, gr.update(visible=False)

    generate_btn.click(
        fn=generate_and_update,
        inputs=[text_input, length_slider],
        outputs=[video_output, info_output, pt_download],
    )

demo.launch(
    server_name="0.0.0.0",  # Allow external access
    server_port=7860,
    share=False,
    show_error=True,
)