ZeyuLing committed on
Commit
33179e5
·
verified ·
1 Parent(s): 8454114

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PRISM-TP2M-1.4B
2
+
3
+ PRISM is a diffusion-based 3D human motion generation model for **text-to-motion (T2M)** and **text-and-pose-conditioned motion generation (TP2M)**. This checkpoint (~1.4B parameters) supports both single-segment generation and multi-segment autoregressive generation.
4
+
5
+ ## Model Details
6
+
7
+ - **Architecture**: Flow-matching DiT transformer with causal spatio-temporal Motion VAE
8
+ - **Text encoder**: UMT5 (T5-style)
9
+ - **Output**: SMPL/SMPL-X body parameters (22 joints, rotation_6d, 30 fps)
10
+
11
+ ## Usage
12
+
13
+ ### Load from Hugging Face
14
+
15
+ ```python
16
+ from mmotion.pipelines.prism_from_pretrained import load_prism_pipeline_from_pretrained
17
+
18
+ pipe = load_prism_pipeline_from_pretrained("ZeyuLing/PRISM-TP2M-1.4B")
19
+ ```
20
+
21
+ ### Text-to-Motion (single segment)
22
+
23
+ ```python
24
+ smplx_dict = pipe(
25
+ prompts="A person walks forward and waves.",
26
+ negative_prompt="",
27
+ num_frames_per_segment=129,
28
+ num_joints=23,
29
+ guidance_scale=5.0,
30
+ )
31
+ # smplx_dict: transl, global_orient, body_pose, etc.
32
+ ```
33
+
34
+ ### Sequential Multi-Segment
35
+
36
+ ```python
37
+ smplx_dict = pipe(
38
+ prompts=["A person waves.", "A person walks.", "A person bows."],
39
+ num_frames_per_segment=[97, 129, 97],
40
+ guidance_scale=5.0,
41
+ )
42
+ ```
43
+
44
+ ### Pose-Conditioned (TP2M)
45
+
46
+ Provide the first-frame pose as an `.npz` file to condition the generation:
47
+
48
+ ```python
49
+ smplx_dict = pipe(
50
+ prompts="The person stands up and walks.",
51
+ first_frame_motion_path="/path/to/first_frame.npz",
52
+ num_frames_per_segment=129,
53
+ guidance_scale=5.0,
54
+ )
55
+ ```
56
+
57
+ ## Requirements
58
+
59
+ - Python ≥3.9
60
+ - PyTorch (CUDA recommended)
61
+ - transformers, diffusers, einops, mmengine
62
+ - SMPL/SMPL-X body model (for full mesh rendering)
63
+
64
+ ## Citation
65
+
66
+ If you use PRISM in your research, please cite:
67
+
68
+ ```bibtex
69
+ @inproceedings{prism2026,
70
+ title={PRISM: Text-to-Motion and Sequential Motion Generation},
71
+ booktitle={ECCV},
72
+ year={2026},
73
+ }
74
+ ```
75
+
76
+ ## License
77
+
78
+ See the versatilemotion repository on GitHub for license terms.
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "prism",
3
+ "transformer": {
4
+ "patch_size": [
5
+ 1,
6
+ 1
7
+ ],
8
+ "attention_head_dim": 128,
9
+ "cross_attn_norm": true,
10
+ "added_kv_proj_dim": null,
11
+ "eps": 1e-06,
12
+ "ffn_dim": 8960,
13
+ "freq_dim": 256,
14
+ "in_channels": 16,
15
+ "num_attention_heads": 12,
16
+ "num_layers": 30,
17
+ "out_channels": 16,
18
+ "qk_norm": "rms_norm_across_heads",
19
+ "rope_max_seq_len": 1024,
20
+ "text_dim": 4096
21
+ },
22
+ "vae_scale_factor_temporal": 4,
23
+ "max_text_length": 256
24
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "model_type": "umt5_encoder"
3
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb0c519d265962df648adcc424a0bff12d0bd23c070ab7867166fc03006e84f6
3
+ size 26924267472
transformer/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "patch_size": [
3
+ 1,
4
+ 1
5
+ ],
6
+ "attention_head_dim": 128,
7
+ "cross_attn_norm": true,
8
+ "added_kv_proj_dim": null,
9
+ "eps": 1e-06,
10
+ "ffn_dim": 8960,
11
+ "freq_dim": 256,
12
+ "in_channels": 16,
13
+ "num_attention_heads": 12,
14
+ "num_layers": 30,
15
+ "out_channels": 16,
16
+ "qk_norm": "rms_norm_across_heads",
17
+ "rope_max_seq_len": 1024,
18
+ "text_dim": 4096,
19
+ "model_type": "prism_transformer"
20
+ }
transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ded6e0d97e9edc5f591969accbcca5315c82cd1f9aeb7080682ad5582e07771f
3
+ size 5675480768
vae/config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "autoencoder_kl_prism_2d",
3
+ "base_dim": 96,
4
+ "decoder_base_dim": null,
5
+ "z_dim": 16,
6
+ "dim_mult": [
7
+ 1,
8
+ 2,
9
+ 4,
10
+ 4
11
+ ],
12
+ "num_res_blocks": 2,
13
+ "attn_scales": [],
14
+ "temporal_downsample": [
15
+ false,
16
+ true,
17
+ true
18
+ ],
19
+ "dropout": 0.0,
20
+ "is_residual": false,
21
+ "in_channels": 6,
22
+ "out_channels": 6,
23
+ "scale_factor_temporal": 4,
24
+ "latents_mean": [
25
+ 0.0092,
26
+ -0.0013,
27
+ -0.0052,
28
+ 0.0025,
29
+ -0.0012,
30
+ 0.0024,
31
+ 0.0021,
32
+ -0.0004,
33
+ -0.0015,
34
+ 0.0013,
35
+ -0.0002,
36
+ 0.0014,
37
+ 0.0018,
38
+ -0.0001,
39
+ -0.0008,
40
+ 0.0009
41
+ ],
42
+ "latents_std": [
43
+ 0.993707,
44
+ 1.020968,
45
+ 0.996201,
46
+ 1.025335,
47
+ 0.997547,
48
+ 1.035847,
49
+ 1.008814,
50
+ 0.999811,
51
+ 0.980396,
52
+ 1.000318,
53
+ 1.033794,
54
+ 0.993485,
55
+ 0.998681,
56
+ 1.038657,
57
+ 1.001396,
58
+ 0.997597
59
+ ],
60
+ "use_static": false,
61
+ "use_rollout_trans": true
62
+ }
vae/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae4b33f4a939c07026ced40ebddf49fa06d5e1d070babd6d050e10d2b3ca51f
3
+ size 69661320