Nekochu committed on
Commit
a49cfa0
·
verified ·
1 Parent(s): e8911be

Fix render

Browse files
README.md CHANGED
@@ -8,21 +8,32 @@ sdk_version: "6.1.0"
8
  app_file: app_new.py
9
  pinned: false
10
  python_version: "3.10"
11
- short_description: Text-to-3D motion generation using ONNX INT8 models
12
  ---
13
 
14
  # MoMask: Text-to-Motion Generation
15
 
16
  Generate 3D human skeleton animations from text descriptions using [MoMask](https://github.com/EricGuo5513/momask-codes).
17
 
18
- ## Model Architecture (ONNX INT8, ~151MB total)
19
- | Model | Size | Precision |
20
- |-------|------|-----------|
21
- | CLIP Text Encoder | 62MB | INT8 |
22
- | Mask Transformer | 20MB | INT8 |
23
- | Residual Transformer | 20MB | INT8 |
24
- | VQ-VAE Decoder | 44MB | FP32 |
25
- | Length Estimator | 441KB | INT8 |
 
 
 
 
 
26
 
27
  ## Usage
28
- Enter a text description and click Generate.
 
 
 
 
 
 
 
8
  app_file: app_new.py
9
  pinned: false
10
  python_version: "3.10"
11
+ short_description: Text-to-3D motion generation using ONNX models
12
  ---
13
 
14
  # MoMask: Text-to-Motion Generation
15
 
16
  Generate 3D human skeleton animations from text descriptions using [MoMask](https://github.com/EricGuo5513/momask-codes).
17
 
18
+ ## Features
19
+ - Text-to-motion generation with classifier-free guidance
20
+ - Download BVH files for Blender import
21
+ - ~7 seconds of motion per generation
22
+
23
+ ## Model Architecture (ONNX FP32, ~416MB total)
24
+ | Model | Size | Purpose |
25
+ |-------|------|---------|
26
+ | CLIP Text Encoder | 254MB | Text embedding |
27
+ | Mask Transformer | 56MB | Initial motion tokens |
28
+ | Residual Transformer | 55MB | Refine motion details |
29
+ | VQ-VAE Decoder | 46MB | Decode to motion |
30
+ | Length Estimator | 0.5MB | Predict motion length |
31
 
32
  ## Usage
33
+ 1. Enter a text description (e.g., "A person walks forward")
34
+ 2. Optionally set duration and seed
35
+ 3. Click Generate
36
+ 4. Download MP4 video or BVH for Blender
37
+
38
+ ## Credits
39
+ Based on [MoMask](https://github.com/EricGuo5513/momask-codes) by Chuan Guo et al.
app_new.py CHANGED
@@ -44,7 +44,8 @@ ONNX_DIR = Path(__file__).parent / "onnx_models"
44
  DEVICE = "cpu"
45
  JOINTS_NUM = 22
46
  TIMESTEPS = 18
47
- COND_SCALE = 4
 
48
  TEMPERATURE = 1.0
49
  TOPK_FILTER = 0.9
50
 
@@ -226,8 +227,8 @@ def gumbel_sample(logits, temperature=1.0):
226
  gumbels = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
227
  return ((logits / max(temperature, 1e-10)) + gumbels).argmax(dim=-1)
228
  # ============ Main Generation Pipeline ============
229
- def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
230
- """Generate motion from text prompt"""
231
  if seed is not None:
232
  torch.manual_seed(seed)
233
  np.random.seed(seed)
@@ -239,6 +240,7 @@ def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
239
 
240
  clip_sess = get_session("clip_text")
241
  text_emb = clip_sess.run(None, {"text_tokens": tokens.numpy()})[0]
 
242
 
243
  if motion_length <= 0:
244
  len_sess = get_session("length_estimator")
@@ -279,11 +281,18 @@ def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
279
 
280
  ids[:, :token_len] = torch.where(is_mask, mask_id, ids[:, :token_len])
281
 
282
- logits = mask_sess.run(None, {
 
283
  "motion_ids": ids.numpy(),
284
  "cond_vector": text_emb,
285
  "padding_mask": padding_mask
286
  })[0]
 
 
 
 
 
 
287
 
288
  logits = torch.from_numpy(logits)
289
  logits = logits[:, :512, :token_len]
@@ -316,12 +325,20 @@ def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
316
 
317
  q_id = np.array([q], dtype=np.int64)
318
 
319
- logits = res_sess.run(None, {
 
320
  "motion_codes": history_sum.astype(np.float32),
321
  "q_id": q_id,
322
  "cond_vector": text_emb,
323
  "padding_mask": padding_mask
324
  })[0]
 
 
 
 
 
 
 
325
 
326
  logits = torch.from_numpy(logits)[:, :512, :token_len].permute(0, 2, 1)
327
  new_ids_q = gumbel_sample(logits, 1.0)
@@ -344,23 +361,20 @@ def generate_motion(text, motion_length=0, seed=None, export_bvh=False):
344
  video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
345
  plot_3d_motion(video_path, joints, text, fps=20)
346
 
347
- bvh_path = None
348
- if export_bvh:
349
- bvh_path = tempfile.NamedTemporaryFile(suffix=".bvh", delete=False).name
350
- joints_to_bvh(joints, bvh_path, fps=20)
351
- print(f"BVH exported: {bvh_path}")
352
 
353
  return joints, video_path, bvh_path
354
  # ============ Gradio Interface ============
355
  def create_demo():
356
  import gradio as gr
357
 
358
- def generate_fn(text, length, seed, export_bvh):
359
  if not text or text.strip() == "":
360
  return None, None
361
  seed = int(seed) if seed else None
362
  length = float(length) if length else 0
363
- joints, video_path, bvh_path = generate_motion(text, length, seed, export_bvh)
364
  return video_path, bvh_path
365
 
366
  with gr.Blocks(title="MoMask") as demo:
@@ -377,28 +391,27 @@ def create_demo():
377
  info="0 = auto-estimate")
378
  seed = gr.Number(label="Seed", value=42,
379
  info="For reproducibility")
380
- export_bvh = gr.Checkbox(label="Export BVH for Blender", value=True)
381
  btn = gr.Button("Generate", variant="primary")
382
 
383
  with gr.Column():
384
  video = gr.Video(label="Generated Motion")
385
- bvh_file = gr.File(label="BVH Download")
386
 
387
  gr.Examples(
388
  examples=[
389
- ["A person walks forward", 0, 42, True],
390
- ["A person is running on a treadmill", 0, 123, True],
391
- ["A person jumps up and then lands", 0, 456, True],
392
- ["A person does a salsa dance", 0, 789, True],
393
- ["A person kicks with their right leg", 0, 101, True],
394
  ],
395
- inputs=[text, length, seed, export_bvh],
396
  outputs=[video, bvh_file],
397
  fn=generate_fn,
398
  cache_examples=False,
399
  )
400
 
401
- btn.click(fn=generate_fn, inputs=[text, length, seed, export_bvh], outputs=[video, bvh_file])
402
 
403
  return demo
404
 
@@ -409,7 +422,7 @@ if __name__ == "__main__":
409
  length = float(sys.argv[2]) if len(sys.argv) > 2 else 0
410
  seed = int(sys.argv[3]) if len(sys.argv) > 3 else 42
411
 
412
- joints, video_path, bvh_path = generate_motion(text, length, seed, export_bvh=True)
413
  print(f"Video: {video_path}")
414
  print(f"BVH: {bvh_path}")
415
  print(f"Joints shape: {joints.shape}")
 
44
  DEVICE = "cpu"
45
  JOINTS_NUM = 22
46
  TIMESTEPS = 18
47
+ MASK_COND_SCALE = 4.0 # CFG scale for mask transformer
48
+ RES_COND_SCALE = 5.0 # CFG scale for residual transformer
49
  TEMPERATURE = 1.0
50
  TOPK_FILTER = 0.9
51
 
 
227
  gumbels = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
228
  return ((logits / max(temperature, 1e-10)) + gumbels).argmax(dim=-1)
229
  # ============ Main Generation Pipeline ============
230
+ def generate_motion(text, motion_length=0, seed=None):
231
+ """Generate motion from text prompt with CFG"""
232
  if seed is not None:
233
  torch.manual_seed(seed)
234
  np.random.seed(seed)
 
240
 
241
  clip_sess = get_session("clip_text")
242
  text_emb = clip_sess.run(None, {"text_tokens": tokens.numpy()})[0]
243
+ zero_emb = np.zeros_like(text_emb) # For CFG unconditional path
244
 
245
  if motion_length <= 0:
246
  len_sess = get_session("length_estimator")
 
281
 
282
  ids[:, :token_len] = torch.where(is_mask, mask_id, ids[:, :token_len])
283
 
284
+ # CFG: conditional and unconditional logits
285
+ cond_logits = mask_sess.run(None, {
286
  "motion_ids": ids.numpy(),
287
  "cond_vector": text_emb,
288
  "padding_mask": padding_mask
289
  })[0]
290
+ uncond_logits = mask_sess.run(None, {
291
+ "motion_ids": ids.numpy(),
292
+ "cond_vector": zero_emb,
293
+ "padding_mask": padding_mask
294
+ })[0]
295
+ logits = uncond_logits + (cond_logits - uncond_logits) * MASK_COND_SCALE
296
 
297
  logits = torch.from_numpy(logits)
298
  logits = logits[:, :512, :token_len]
 
325
 
326
  q_id = np.array([q], dtype=np.int64)
327
 
328
+ # CFG for residual transformer
329
+ cond_logits = res_sess.run(None, {
330
  "motion_codes": history_sum.astype(np.float32),
331
  "q_id": q_id,
332
  "cond_vector": text_emb,
333
  "padding_mask": padding_mask
334
  })[0]
335
+ uncond_logits = res_sess.run(None, {
336
+ "motion_codes": history_sum.astype(np.float32),
337
+ "q_id": q_id,
338
+ "cond_vector": zero_emb,
339
+ "padding_mask": padding_mask
340
+ })[0]
341
+ logits = uncond_logits + (cond_logits - uncond_logits) * RES_COND_SCALE
342
 
343
  logits = torch.from_numpy(logits)[:, :512, :token_len].permute(0, 2, 1)
344
  new_ids_q = gumbel_sample(logits, 1.0)
 
361
  video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
362
  plot_3d_motion(video_path, joints, text, fps=20)
363
 
364
+ bvh_path = tempfile.NamedTemporaryFile(suffix=".bvh", delete=False).name
365
+ joints_to_bvh(joints, bvh_path, fps=20)
 
 
 
366
 
367
  return joints, video_path, bvh_path
368
  # ============ Gradio Interface ============
369
  def create_demo():
370
  import gradio as gr
371
 
372
+ def generate_fn(text, length, seed):
373
  if not text or text.strip() == "":
374
  return None, None
375
  seed = int(seed) if seed else None
376
  length = float(length) if length else 0
377
+ joints, video_path, bvh_path = generate_motion(text, length, seed)
378
  return video_path, bvh_path
379
 
380
  with gr.Blocks(title="MoMask") as demo:
 
391
  info="0 = auto-estimate")
392
  seed = gr.Number(label="Seed", value=42,
393
  info="For reproducibility")
 
394
  btn = gr.Button("Generate", variant="primary")
395
 
396
  with gr.Column():
397
  video = gr.Video(label="Generated Motion")
398
+ bvh_file = gr.File(label="BVH Download (for Blender)")
399
 
400
  gr.Examples(
401
  examples=[
402
+ ["A person walks forward", 0, 42],
403
+ ["A person is running on a treadmill", 0, 123],
404
+ ["A person jumps up and then lands", 0, 456],
405
+ ["A person does a salsa dance", 0, 789],
406
+ ["A person kicks with their right leg", 0, 101],
407
  ],
408
+ inputs=[text, length, seed],
409
  outputs=[video, bvh_file],
410
  fn=generate_fn,
411
  cache_examples=False,
412
  )
413
 
414
+ btn.click(fn=generate_fn, inputs=[text, length, seed], outputs=[video, bvh_file])
415
 
416
  return demo
417
 
 
422
  length = float(sys.argv[2]) if len(sys.argv) > 2 else 0
423
  seed = int(sys.argv[3]) if len(sys.argv) > 3 else 42
424
 
425
+ joints, video_path, bvh_path = generate_motion(text, length, seed)
426
  print(f"Video: {video_path}")
427
  print(f"BVH: {bvh_path}")
428
  print(f"Joints shape: {joints.shape}")
onnx_models/clip_text.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75213356a2cca6a6a69cb4ea45142dff121bf0baf5bdad621e0e68fe68355b6a
3
- size 64683509
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee129a5e73595244105a917c8cd6884bd97f04d6a1d09d00b4e715d590fe90e
3
+ size 254389519
onnx_models/mask_transformer.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8154ce0e72fce61b889c86ebde4e08e1976632f943bde9a49117af7d4b9bd95d
3
- size 20297925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb8513f25349c03a7ead2447a2d40d906011ff813905a921a2424544a6e632e9
3
+ size 56169224
onnx_models/residual_transformer.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f703b17e8ea1c7557d8f3ba3951c0ca7e56c22e3d3514df5d2f35c08bf0a00c
3
- size 20039878
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931b5b0bf2b1e507233b48d3108fc0ce89bc49d2fb058d8f6b66c6867b554375
3
+ size 55127345
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- # Minimal requirements for HuggingFace Space (CPU)
 
2
  onnxruntime>=1.16.0
3
  torch>=2.0.0
4
  numpy
 
1
+ # MoMask HuggingFace Space requirements (CPU)
2
+ gradio>=6.1.0
3
  onnxruntime>=1.16.0
4
  torch>=2.0.0
5
  numpy