Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- config/a2m/a2m_t1d128.yaml +18 -0
- config/a2m/a2m_t1d128_posepre.yaml +29 -0
- config/a2m/a2m_t1d512.yaml +18 -0
- config/a2m/a2m_t1d512_doubleref.yaml +18 -0
- config/a2m/a2m_t1d512_pose.yaml +24 -0
- config/a2m/a2m_t1d512_posepre.yaml +29 -0
- config/a2m/a2m_t1d64.yaml +18 -0
- config/a2m/a2m_t1d64_posepre.yaml +29 -0
- config/a2m/a2m_t2d256.yaml +18 -0
- config/a2m/a2m_t2d256_pose.yaml +24 -0
- config/a2m/a2m_t2d256_posepre.yaml +29 -0
- config/a2m/cross_audio_pose_t1d512_l16_dim1024.yaml +25 -0
- config/a2m/cross_audio_pose_t1d512_l64_dim1024.yaml +25 -0
- config/a2m/cross_audio_pose_t2d256_l16_dim1024.yaml +25 -0
- config/a2m/cross_audio_pose_t4d128_l16_dim1024.yaml +25 -0
- config/a2m/cross_audio_pose_t4d128_l32_dim1024.yaml +25 -0
- config/a2m/cross_audio_posepre_t1d512_l16_dim1024.yaml +30 -0
- config/a2m/cross_audio_posepre_t1d512_l32_dim1024.yaml +30 -0
- config/a2m/cross_audio_posepre_t1d512_l64_dim1024.yaml +30 -0
- config/a2m/cross_audio_posepre_t2d256_l16_dim1024.yaml +30 -0
- config/a2m/cross_audio_posepre_t4d128_l16_dim1024.yaml +30 -0
- config/a2m/cross_audio_t1d512_l16_dim1024.yaml +19 -0
- config/a2m/cross_audio_t2d256_l16_dim1024.yaml +19 -0
- config/a2m/cross_audio_t4d128_l16_dim1024.yaml +19 -0
- config/a2m/cross_audio_t4d128_l32_dim1024.yaml +19 -0
- config/accelerate_config_1.yaml +9 -0
- config/accelerate_config_2.yaml +9 -0
- config/accelerate_config_3.yaml +9 -0
- config/accelerate_config_4.yaml +9 -0
- config/accelerate_config_5.yaml +9 -0
- config/accelerate_config_6.yaml +9 -0
- config/accelerate_config_7.yaml +9 -0
- config/accelerate_config_8.yaml +9 -0
- config/inference/a2m.yaml +12 -0
- config/inference/a2m_wpose.yaml +12 -0
- config/inference/amd-s-t1-d1024-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d128-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d256-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d32-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d512-nonorm-spatial-mask25.yaml +6 -0
- config/inference/amd-s-t1-d512-nonorm-spatial-mask50.yaml +6 -0
- config/inference/amd-s-t1-d512-nonorm-spatial-mask75.yaml +6 -0
- config/inference/amd-s-t1-d512-nonorm-spatial-mask90.yaml +6 -0
- config/inference/amd-s-t1-d512-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d64-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t1-d768-spatial-ablation.yaml +6 -0
- config/inference/amd-s-t8-d64-spatial-ablation.yaml +6 -0
- config/inference/p2m.yaml +8 -0
- config/inference/rec.yaml +5 -0
- config/inference/rec_facevid.yaml +5 -0
config/a2m/a2m_t1d128.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- Audio feature encoder -----------
|
| 11 |
+
intermediate_dim : 1024
|
| 12 |
+
window_size : 32
|
| 13 |
+
encoder_out_dim : 1024
|
| 14 |
+
|
| 15 |
+
# ----------- Diffusion Transformer -----------
|
| 16 |
+
diffusion_attn_head_dim : 64
|
| 17 |
+
diffusion_attn_num_heads : 16
|
| 18 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d128_posepre.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- pose predictor --------
|
| 17 |
+
pose_predictor_attn_head_dim : 64
|
| 18 |
+
pose_predictor_attn_num_heads : 8
|
| 19 |
+
pose_predictor_attn_num_layers : 4
|
| 20 |
+
|
| 21 |
+
# ----------- Audio feature encoder -----------
|
| 22 |
+
intermediate_dim : 1024
|
| 23 |
+
window_size : 32
|
| 24 |
+
encoder_out_dim : 1024
|
| 25 |
+
|
| 26 |
+
# ----------- Diffusion Transformer -----------
|
| 27 |
+
diffusion_attn_head_dim : 64
|
| 28 |
+
diffusion_attn_num_heads : 16
|
| 29 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d512.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- Audio feature encoder -----------
|
| 11 |
+
intermediate_dim : 1024
|
| 12 |
+
window_size : 32
|
| 13 |
+
encoder_out_dim : 1024
|
| 14 |
+
|
| 15 |
+
# ----------- Diffusion Transformer -----------
|
| 16 |
+
diffusion_attn_head_dim : 64
|
| 17 |
+
diffusion_attn_num_heads : 16
|
| 18 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d512_doubleref.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_DoubleRef # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- Audio feature encoder -----------
|
| 11 |
+
intermediate_dim : 1024
|
| 12 |
+
window_size : 32
|
| 13 |
+
encoder_out_dim : 1024
|
| 14 |
+
|
| 15 |
+
# ----------- Diffusion Transformer -----------
|
| 16 |
+
diffusion_attn_head_dim : 64
|
| 17 |
+
diffusion_attn_num_heads : 16
|
| 18 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d512_pose.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- Audio feature encoder -----------
|
| 17 |
+
intermediate_dim : 1024
|
| 18 |
+
window_size : 32
|
| 19 |
+
encoder_out_dim : 1024
|
| 20 |
+
|
| 21 |
+
# ----------- Diffusion Transformer -----------
|
| 22 |
+
diffusion_attn_head_dim : 64
|
| 23 |
+
diffusion_attn_num_heads : 16
|
| 24 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d512_posepre.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- pose predictor --------
|
| 17 |
+
pose_predictor_attn_head_dim : 64
|
| 18 |
+
pose_predictor_attn_num_heads : 8
|
| 19 |
+
pose_predictor_attn_num_layers : 4
|
| 20 |
+
|
| 21 |
+
# ----------- Audio feature encoder -----------
|
| 22 |
+
intermediate_dim : 1024
|
| 23 |
+
window_size : 32
|
| 24 |
+
encoder_out_dim : 1024
|
| 25 |
+
|
| 26 |
+
# ----------- Diffusion Transformer -----------
|
| 27 |
+
diffusion_attn_head_dim : 64
|
| 28 |
+
diffusion_attn_num_heads : 16
|
| 29 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d64.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 64
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- Audio feature encoder -----------
|
| 11 |
+
intermediate_dim : 1024
|
| 12 |
+
window_size : 32
|
| 13 |
+
encoder_out_dim : 1024
|
| 14 |
+
|
| 15 |
+
# ----------- Diffusion Transformer -----------
|
| 16 |
+
diffusion_attn_head_dim : 64
|
| 17 |
+
diffusion_attn_num_heads : 16
|
| 18 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t1d64_posepre.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 64
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- pose predictor --------
|
| 17 |
+
pose_predictor_attn_head_dim : 64
|
| 18 |
+
pose_predictor_attn_num_heads : 8
|
| 19 |
+
pose_predictor_attn_num_layers : 4
|
| 20 |
+
|
| 21 |
+
# ----------- Audio feature encoder -----------
|
| 22 |
+
intermediate_dim : 1024
|
| 23 |
+
window_size : 32
|
| 24 |
+
encoder_out_dim : 1024
|
| 25 |
+
|
| 26 |
+
# ----------- Diffusion Transformer -----------
|
| 27 |
+
diffusion_attn_head_dim : 64
|
| 28 |
+
diffusion_attn_num_heads : 16
|
| 29 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t2d256.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- Audio feature encoder -----------
|
| 11 |
+
intermediate_dim : 1024
|
| 12 |
+
window_size : 32
|
| 13 |
+
encoder_out_dim : 1024
|
| 14 |
+
|
| 15 |
+
# ----------- Diffusion Transformer -----------
|
| 16 |
+
diffusion_attn_head_dim : 64
|
| 17 |
+
diffusion_attn_num_heads : 16
|
| 18 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t2d256_pose.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- Audio feature encoder -----------
|
| 17 |
+
intermediate_dim : 1024
|
| 18 |
+
window_size : 32
|
| 19 |
+
encoder_out_dim : 1024
|
| 20 |
+
|
| 21 |
+
# ----------- Diffusion Transformer -----------
|
| 22 |
+
diffusion_attn_head_dim : 64
|
| 23 |
+
diffusion_attn_num_heads : 16
|
| 24 |
+
diffusion_num_layers : 8
|
config/a2m/a2m_t2d256_posepre.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
num_step : 1000
|
| 9 |
+
|
| 10 |
+
# ----------- pose --------
|
| 11 |
+
pose_height : 32
|
| 12 |
+
pose_width : 32
|
| 13 |
+
pose_inchannel : 4
|
| 14 |
+
pose_patch_size : 2
|
| 15 |
+
|
| 16 |
+
# ----------- pose predictor --------
|
| 17 |
+
pose_predictor_attn_head_dim : 64
|
| 18 |
+
pose_predictor_attn_num_heads : 8
|
| 19 |
+
pose_predictor_attn_num_layers : 4
|
| 20 |
+
|
| 21 |
+
# ----------- Audio feature encoder -----------
|
| 22 |
+
intermediate_dim : 1024
|
| 23 |
+
window_size : 32
|
| 24 |
+
encoder_out_dim : 1024
|
| 25 |
+
|
| 26 |
+
# ----------- Diffusion Transformer -----------
|
| 27 |
+
diffusion_attn_head_dim : 64
|
| 28 |
+
diffusion_attn_num_heads : 16
|
| 29 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_pose_t1d512_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- Audio feature encoder -----------
|
| 18 |
+
intermediate_dim : 1024
|
| 19 |
+
window_size : 32
|
| 20 |
+
encoder_out_dim : 1024
|
| 21 |
+
|
| 22 |
+
# ----------- Diffusion Transformer -----------
|
| 23 |
+
diffusion_attn_head_dim : 64
|
| 24 |
+
diffusion_attn_num_heads : 16
|
| 25 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_pose_t1d512_l64_dim1024.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 64
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- Audio feature encoder -----------
|
| 18 |
+
intermediate_dim : 1024
|
| 19 |
+
window_size : 32
|
| 20 |
+
encoder_out_dim : 1024
|
| 21 |
+
|
| 22 |
+
# ----------- Diffusion Transformer -----------
|
| 23 |
+
diffusion_attn_head_dim : 64
|
| 24 |
+
diffusion_attn_num_heads : 16
|
| 25 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_pose_t2d256_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- Audio feature encoder -----------
|
| 18 |
+
intermediate_dim : 1024
|
| 19 |
+
window_size : 32
|
| 20 |
+
encoder_out_dim : 1024
|
| 21 |
+
|
| 22 |
+
# ----------- Diffusion Transformer -----------
|
| 23 |
+
diffusion_attn_head_dim : 64
|
| 24 |
+
diffusion_attn_num_heads : 16
|
| 25 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_pose_t4d128_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 4
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- Audio feature encoder -----------
|
| 18 |
+
intermediate_dim : 1024
|
| 19 |
+
window_size : 32
|
| 20 |
+
encoder_out_dim : 1024
|
| 21 |
+
|
| 22 |
+
# ----------- Diffusion Transformer -----------
|
| 23 |
+
diffusion_attn_head_dim : 64
|
| 24 |
+
diffusion_attn_num_heads : 16
|
| 25 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_pose_t4d128_l32_dim1024.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_Pose # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 4
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
motion_frames : 32
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- Audio feature encoder -----------
|
| 18 |
+
intermediate_dim : 1024
|
| 19 |
+
window_size : 32
|
| 20 |
+
encoder_out_dim : 1024
|
| 21 |
+
|
| 22 |
+
# ----------- Diffusion Transformer -----------
|
| 23 |
+
diffusion_attn_head_dim : 64
|
| 24 |
+
diffusion_attn_num_heads : 16
|
| 25 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_posepre_t1d512_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- pose predictor --------
|
| 18 |
+
pose_predictor_attn_head_dim : 64
|
| 19 |
+
pose_predictor_attn_num_heads : 8
|
| 20 |
+
pose_predictor_attn_num_layers : 4
|
| 21 |
+
|
| 22 |
+
# ----------- Audio feature encoder -----------
|
| 23 |
+
intermediate_dim : 1024
|
| 24 |
+
window_size : 32
|
| 25 |
+
encoder_out_dim : 1024
|
| 26 |
+
|
| 27 |
+
# ----------- Diffusion Transformer -----------
|
| 28 |
+
diffusion_attn_head_dim : 64
|
| 29 |
+
diffusion_attn_num_heads : 16
|
| 30 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_posepre_t1d512_l32_dim1024.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 32
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- pose predictor --------
|
| 18 |
+
pose_predictor_attn_head_dim : 64
|
| 19 |
+
pose_predictor_attn_num_heads : 8
|
| 20 |
+
pose_predictor_attn_num_layers : 4
|
| 21 |
+
|
| 22 |
+
# ----------- Audio feature encoder -----------
|
| 23 |
+
intermediate_dim : 1024
|
| 24 |
+
window_size : 32
|
| 25 |
+
encoder_out_dim : 1024
|
| 26 |
+
|
| 27 |
+
# ----------- Diffusion Transformer -----------
|
| 28 |
+
diffusion_attn_head_dim : 64
|
| 29 |
+
diffusion_attn_num_heads : 16
|
| 30 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_posepre_t1d512_l64_dim1024.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 64
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- pose predictor --------
|
| 18 |
+
pose_predictor_attn_head_dim : 64
|
| 19 |
+
pose_predictor_attn_num_heads : 8
|
| 20 |
+
pose_predictor_attn_num_layers : 4
|
| 21 |
+
|
| 22 |
+
# ----------- Audio feature encoder -----------
|
| 23 |
+
intermediate_dim : 1024
|
| 24 |
+
window_size : 32
|
| 25 |
+
encoder_out_dim : 1024
|
| 26 |
+
|
| 27 |
+
# ----------- Diffusion Transformer -----------
|
| 28 |
+
diffusion_attn_head_dim : 64
|
| 29 |
+
diffusion_attn_num_heads : 16
|
| 30 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_posepre_t2d256_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- pose predictor --------
|
| 18 |
+
pose_predictor_attn_head_dim : 64
|
| 19 |
+
pose_predictor_attn_num_heads : 8
|
| 20 |
+
pose_predictor_attn_num_layers : 4
|
| 21 |
+
|
| 22 |
+
# ----------- Audio feature encoder -----------
|
| 23 |
+
intermediate_dim : 1024
|
| 24 |
+
window_size : 32
|
| 25 |
+
encoder_out_dim : 1024
|
| 26 |
+
|
| 27 |
+
# ----------- Diffusion Transformer -----------
|
| 28 |
+
diffusion_attn_head_dim : 64
|
| 29 |
+
diffusion_attn_num_heads : 16
|
| 30 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_posepre_t4d128_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio_PosePre # 532M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 4
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- pose --------
|
| 12 |
+
pose_height : 32
|
| 13 |
+
pose_width : 32
|
| 14 |
+
pose_inchannel : 4
|
| 15 |
+
pose_patch_size : 2
|
| 16 |
+
|
| 17 |
+
# ----------- pose predictor --------
|
| 18 |
+
pose_predictor_attn_head_dim : 64
|
| 19 |
+
pose_predictor_attn_num_heads : 8
|
| 20 |
+
pose_predictor_attn_num_layers : 4
|
| 21 |
+
|
| 22 |
+
# ----------- Audio feature encoder -----------
|
| 23 |
+
intermediate_dim : 1024
|
| 24 |
+
window_size : 32
|
| 25 |
+
encoder_out_dim : 1024
|
| 26 |
+
|
| 27 |
+
# ----------- Diffusion Transformer -----------
|
| 28 |
+
diffusion_attn_head_dim : 64
|
| 29 |
+
diffusion_attn_num_heads : 16
|
| 30 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_t1d512_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 1
|
| 7 |
+
motion_in_channel : 512
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- Audio feature encoder -----------
|
| 12 |
+
intermediate_dim : 1024
|
| 13 |
+
window_size : 32
|
| 14 |
+
encoder_out_dim : 1024
|
| 15 |
+
|
| 16 |
+
# ----------- Diffusion Transformer -----------
|
| 17 |
+
diffusion_attn_head_dim : 64
|
| 18 |
+
diffusion_attn_num_heads : 16
|
| 19 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_t2d256_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 2
|
| 7 |
+
motion_in_channel : 256
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- Audio feature encoder -----------
|
| 12 |
+
intermediate_dim : 1024
|
| 13 |
+
window_size : 32
|
| 14 |
+
encoder_out_dim : 1024
|
| 15 |
+
|
| 16 |
+
# ----------- Diffusion Transformer -----------
|
| 17 |
+
diffusion_attn_head_dim : 64
|
| 18 |
+
diffusion_attn_num_heads : 16
|
| 19 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_t4d128_l16_dim1024.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 4
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
motion_frames : 16
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- Audio feature encoder -----------
|
| 12 |
+
intermediate_dim : 1024
|
| 13 |
+
window_size : 32
|
| 14 |
+
encoder_out_dim : 1024
|
| 15 |
+
|
| 16 |
+
# ----------- Diffusion Transformer -----------
|
| 17 |
+
diffusion_attn_head_dim : 64
|
| 18 |
+
diffusion_attn_num_heads : 16
|
| 19 |
+
diffusion_num_layers : 8
|
config/a2m/cross_audio_t4d128_l32_dim1024.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type : A2MModel_CrossAtten_Audio # 200M : 20M + 180M
|
| 2 |
+
model:
|
| 3 |
+
audio_inchannel : 384
|
| 4 |
+
audio_block : 50
|
| 5 |
+
|
| 6 |
+
motion_num_token : 4
|
| 7 |
+
motion_in_channel : 128
|
| 8 |
+
motion_frames : 32
|
| 9 |
+
num_step : 1000
|
| 10 |
+
|
| 11 |
+
# ----------- Audio feature encoder -----------
|
| 12 |
+
intermediate_dim : 1024
|
| 13 |
+
window_size : 32
|
| 14 |
+
encoder_out_dim : 1024
|
| 15 |
+
|
| 16 |
+
# ----------- Diffusion Transformer -----------
|
| 17 |
+
diffusion_attn_head_dim : 64
|
| 18 |
+
diffusion_attn_num_heads : 16
|
| 19 |
+
diffusion_num_layers : 8
|
config/accelerate_config_1.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: NO
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 1
|
| 8 |
+
gpu_ids: 0,
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_2.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 2
|
| 8 |
+
gpu_ids: 0,1
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_3.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 3
|
| 8 |
+
gpu_ids: 0,1,2
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_4.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 4
|
| 8 |
+
gpu_ids: 0,1,2,3
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_5.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 5
|
| 8 |
+
gpu_ids: 0,1,2,3,4
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_6.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 6
|
| 8 |
+
gpu_ids: 0,1,2,3,4,5
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_7.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 7
|
| 8 |
+
gpu_ids: 0,1,2,3,4,5,6
|
| 9 |
+
use_cpu: false
|
config/accelerate_config_8.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
distributed_type: MULTI_GPU
|
| 3 |
+
fsdp_config: {}
|
| 4 |
+
machine_rank: 0
|
| 5 |
+
main_training_function: main
|
| 6 |
+
num_machines: 1
|
| 7 |
+
num_processes: 8
|
| 8 |
+
gpu_ids: 0,1,2,3,4,5,6,7
|
| 9 |
+
use_cpu: false
|
config/inference/a2m.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
whisper_model_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/whisper_tiny.pt
|
| 2 |
+
audio_separator_model_file: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/Kim_Vocal_2.onnx
|
| 3 |
+
cache_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/vocals
|
| 4 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
| 5 |
+
a2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/a2m/a2m_t1d512.yaml
|
| 6 |
+
a2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/a2m.safetensors
|
| 7 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
|
| 8 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
|
| 9 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/amd_sample/sample
|
| 10 |
+
enable_pose: false
|
| 11 |
+
a2m_sample_steps: 4
|
| 12 |
+
amd_sample_steps: 4
|
config/inference/a2m_wpose.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
whisper_model_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/whisper_tiny.pt
|
| 2 |
+
audio_separator_model_file: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/Kim_Vocal_2.onnx
|
| 3 |
+
cache_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/vocals
|
| 4 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
| 5 |
+
a2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/a2m/a2m_t1d512_posepre.yaml
|
| 6 |
+
a2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/a2m/a2m-t1d512-f16-posepre-spatial/checkpoints/checkpoint-103000/model.safetensors
|
| 7 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
|
| 8 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
|
| 9 |
+
a2m_sample_steps: 4
|
| 10 |
+
amd_sample_steps: 4
|
| 11 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample
|
| 12 |
+
enable_pose: true
|
config/inference/amd-s-t1-d1024-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d1024-spatial-ablation/checkpoints/checkpoint-104000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d1024-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d1024-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d128-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d128-spatial-ablation/checkpoints/checkpoint-485000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d128-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d128-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d256-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d256-spatial-ablation/checkpoints/checkpoint-104000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d256-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d256-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d32-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d32-spatial-ablation/checkpoints/checkpoint-365000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d32-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d32-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d512-nonorm-spatial-mask25.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask25/checkpoints/checkpoint-88000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask25/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask25/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d512-nonorm-spatial-mask50.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask50/checkpoints/checkpoint-111000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask50/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask50/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d512-nonorm-spatial-mask75.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask75/checkpoints/checkpoint-189000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask75/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask75/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d512-nonorm-spatial-mask90.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask90/checkpoints/checkpoint-110000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask90/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask90/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d512-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d512-spatial-ablation/checkpoints/checkpoint-140000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d512-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d64-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d64-spatial-ablation/checkpoints/checkpoint-378000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d64-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d64-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t1-d768-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d768-spatial-ablation/checkpoints/checkpoint-102000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d768-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d768-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/amd-s-t8-d64-spatial-ablation.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t8-d64-spatial-ablation/checkpoints/checkpoint-103000/model.safetensors
|
| 2 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t8-d64-spatial-ablation/config.json
|
| 3 |
+
amd_sample_steps: 20
|
| 4 |
+
mask_ratio: 0.0
|
| 5 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t8-d64-spatial-ablation/result/mead
|
| 6 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
config/inference/p2m.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
| 2 |
+
p2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/p2m/p2m_t1d512.yaml
|
| 3 |
+
p2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/p2m.safetensors
|
| 4 |
+
amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
|
| 5 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
|
| 6 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample
|
| 7 |
+
p2m_sample_steps: 4
|
| 8 |
+
amd_sample_steps: 4
|
config/inference/rec.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
| 2 |
+
amd_config_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/exp/newexp/amd_ablation/amd-s-t1-d512-doubleref-ablation/config.json
|
| 3 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/rec.safetensors
|
| 4 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/reconstruction
|
| 5 |
+
amd_sample_steps: 4
|
config/inference/rec_facevid.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
|
| 2 |
+
amd_config_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/exp/newexp/amd_ablation/amd-s-t1-d512-doubleref-ablation/config.json
|
| 3 |
+
amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/rec.safetensors
|
| 4 |
+
output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/reconstruction/facevid_step20
|
| 5 |
+
amd_sample_steps: 20
|