HappyP4nda committed on May 7, 2025

Commit

bd546bf

verified ·

1 Parent(s): 4f9c366

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

config/a2m/a2m_t1d128.yaml +18 -0
config/a2m/a2m_t1d128_posepre.yaml +29 -0
config/a2m/a2m_t1d512.yaml +18 -0
config/a2m/a2m_t1d512_doubleref.yaml +18 -0
config/a2m/a2m_t1d512_pose.yaml +24 -0
config/a2m/a2m_t1d512_posepre.yaml +29 -0
config/a2m/a2m_t1d64.yaml +18 -0
config/a2m/a2m_t1d64_posepre.yaml +29 -0
config/a2m/a2m_t2d256.yaml +18 -0
config/a2m/a2m_t2d256_pose.yaml +24 -0
config/a2m/a2m_t2d256_posepre.yaml +29 -0
config/a2m/cross_audio_pose_t1d512_l16_dim1024.yaml +25 -0
config/a2m/cross_audio_pose_t1d512_l64_dim1024.yaml +25 -0
config/a2m/cross_audio_pose_t2d256_l16_dim1024.yaml +25 -0
config/a2m/cross_audio_pose_t4d128_l16_dim1024.yaml +25 -0
config/a2m/cross_audio_pose_t4d128_l32_dim1024.yaml +25 -0
config/a2m/cross_audio_posepre_t1d512_l16_dim1024.yaml +30 -0
config/a2m/cross_audio_posepre_t1d512_l32_dim1024.yaml +30 -0
config/a2m/cross_audio_posepre_t1d512_l64_dim1024.yaml +30 -0
config/a2m/cross_audio_posepre_t2d256_l16_dim1024.yaml +30 -0
config/a2m/cross_audio_posepre_t4d128_l16_dim1024.yaml +30 -0
config/a2m/cross_audio_t1d512_l16_dim1024.yaml +19 -0
config/a2m/cross_audio_t2d256_l16_dim1024.yaml +19 -0
config/a2m/cross_audio_t4d128_l16_dim1024.yaml +19 -0
config/a2m/cross_audio_t4d128_l32_dim1024.yaml +19 -0
config/accelerate_config_1.yaml +9 -0
config/accelerate_config_2.yaml +9 -0
config/accelerate_config_3.yaml +9 -0
config/accelerate_config_4.yaml +9 -0
config/accelerate_config_5.yaml +9 -0
config/accelerate_config_6.yaml +9 -0
config/accelerate_config_7.yaml +9 -0
config/accelerate_config_8.yaml +9 -0
config/inference/a2m.yaml +12 -0
config/inference/a2m_wpose.yaml +12 -0
config/inference/amd-s-t1-d1024-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d128-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d256-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d32-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d512-nonorm-spatial-mask25.yaml +6 -0
config/inference/amd-s-t1-d512-nonorm-spatial-mask50.yaml +6 -0
config/inference/amd-s-t1-d512-nonorm-spatial-mask75.yaml +6 -0
config/inference/amd-s-t1-d512-nonorm-spatial-mask90.yaml +6 -0
config/inference/amd-s-t1-d512-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d64-spatial-ablation.yaml +6 -0
config/inference/amd-s-t1-d768-spatial-ablation.yaml +6 -0
config/inference/amd-s-t8-d64-spatial-ablation.yaml +6 -0
config/inference/p2m.yaml +8 -0
config/inference/rec.yaml +5 -0
config/inference/rec_facevid.yaml +5 -0

config/a2m/a2m_t1d128.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 128
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d128_posepre.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 128
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d512.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d512_doubleref.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+model_type : A2MModel_CrossAtten_Audio_DoubleRef  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d512_pose.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d512_posepre.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d64.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 64
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t1d64_posepre.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 64
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t2d256.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t2d256_pose.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/a2m_t2d256_posepre.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_pose_t1d512_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_pose_t1d512_l64_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 64
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_pose_t2d256_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_pose_t4d128_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 4
+  motion_in_channel : 128
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_pose_t4d128_l32_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+model_type : A2MModel_CrossAtten_Audio_Pose  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 4
+  motion_in_channel : 128
+  motion_frames  : 32
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_posepre_t1d512_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_posepre_t1d512_l32_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 32
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_posepre_t1d512_l64_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 64
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_posepre_t2d256_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_posepre_t4d128_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+model_type : A2MModel_CrossAtten_Audio_PosePre  # 532M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 4
+  motion_in_channel : 128
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- pose --------
+  pose_height : 32
+  pose_width : 32
+  pose_inchannel : 4
+  pose_patch_size : 2
+  # ----------- pose predictor --------
+  pose_predictor_attn_head_dim : 64
+  pose_predictor_attn_num_heads : 8
+  pose_predictor_attn_num_layers : 4
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_t1d512_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 1
+  motion_in_channel : 512
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_t2d256_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 2
+  motion_in_channel : 256
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_t4d128_l16_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 4
+  motion_in_channel : 128
+  motion_frames  : 16
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/a2m/cross_audio_t4d128_l32_dim1024.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+model_type : A2MModel_CrossAtten_Audio  # 200M : 20M + 180M
+model:
+  audio_inchannel  : 384
+  audio_block : 50
+  motion_num_token : 4
+  motion_in_channel : 128
+  motion_frames  : 32
+  num_step : 1000
+  # ----------- Audio feature encoder -----------
+  intermediate_dim : 1024
+  window_size : 32
+  encoder_out_dim : 1024
+  # ----------- Diffusion Transformer -----------
+  diffusion_attn_head_dim  : 64
+  diffusion_attn_num_heads : 16
+  diffusion_num_layers : 8

config/accelerate_config_1.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: NO
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 1
+gpu_ids: 0,
+use_cpu: false

config/accelerate_config_2.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 2
+gpu_ids: 0,1
+use_cpu: false

config/accelerate_config_3.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 3
+gpu_ids: 0,1,2
+use_cpu: false

config/accelerate_config_4.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 4
+gpu_ids: 0,1,2,3
+use_cpu: false

config/accelerate_config_5.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 5
+gpu_ids: 0,1,2,3,4
+use_cpu: false

config/accelerate_config_6.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 6
+gpu_ids: 0,1,2,3,4,5
+use_cpu: false

config/accelerate_config_7.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 7
+gpu_ids: 0,1,2,3,4,5,6
+use_cpu: false

config/accelerate_config_8.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+machine_rank: 0
+main_training_function: main
+num_machines: 1
+num_processes: 8
+gpu_ids: 0,1,2,3,4,5,6,7
+use_cpu: false

config/inference/a2m.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+whisper_model_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/whisper_tiny.pt
+audio_separator_model_file: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/Kim_Vocal_2.onnx
+cache_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/vocals
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
+a2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/a2m/a2m_t1d512.yaml
+a2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/a2m.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/amd_sample/sample
+enable_pose: false
+a2m_sample_steps: 4
+amd_sample_steps: 4

config/inference/a2m_wpose.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+whisper_model_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/whisper_tiny.pt
+audio_separator_model_file: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/Kim_Vocal_2.onnx
+cache_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/vocals
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
+a2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/a2m/a2m_t1d512_posepre.yaml
+a2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/a2m/a2m-t1d512-f16-posepre-spatial/checkpoints/checkpoint-103000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
+a2m_sample_steps: 4
+amd_sample_steps: 4
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample
+enable_pose: true

config/inference/amd-s-t1-d1024-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d1024-spatial-ablation/checkpoints/checkpoint-104000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d1024-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d1024-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d128-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d128-spatial-ablation/checkpoints/checkpoint-485000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d128-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d128-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d256-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d256-spatial-ablation/checkpoints/checkpoint-104000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d256-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d256-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d32-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d32-spatial-ablation/checkpoints/checkpoint-365000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d32-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d32-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d512-nonorm-spatial-mask25.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask25/checkpoints/checkpoint-88000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask25/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask25/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d512-nonorm-spatial-mask50.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask50/checkpoints/checkpoint-111000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask50/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask50/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d512-nonorm-spatial-mask75.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask75/checkpoints/checkpoint-189000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask75/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask75/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d512-nonorm-spatial-mask90.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask90/checkpoints/checkpoint-110000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial-mask90/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-nonorm-spatial-mask90/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d512-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d512-spatial-ablation/checkpoints/checkpoint-140000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d512-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d512-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d64-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d64-spatial-ablation/checkpoints/checkpoint-378000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d64-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d64-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t1-d768-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d768-spatial-ablation/checkpoints/checkpoint-102000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t1-d768-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t1-d768-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/amd-s-t8-d64-spatial-ablation.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t8-d64-spatial-ablation/checkpoints/checkpoint-103000/model.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd_ablation/amd-s-t8-d64-spatial-ablation/config.json
+amd_sample_steps: 20
+mask_ratio: 0.0
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/test/test_frame2frame_reconstruction/amd-s-t8-d64-spatial-ablation/result/mead
+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse

config/inference/p2m.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
+p2m_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/p2m/p2m_t1d512.yaml
+p2m_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/p2m.safetensors
+amd_config_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/config.json
+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/newexp/amd/amd-s-t1-d512-nonorm-spatial/checkpoints/checkpoint-131000/model.safetensors
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample
+p2m_sample_steps: 4
+amd_sample_steps: 4

config/inference/rec.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
+amd_config_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/exp/newexp/amd_ablation/amd-s-t1-d512-doubleref-ablation/config.json
+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/rec.safetensors
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/reconstruction
+amd_sample_steps: 4

config/inference/rec_facevid.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+vae_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/model-checkpoints/sd-vae-ft-mse
+amd_config_path: /mnt/pfs-mc0p4k/tts/team/digital_avatar_group/sunwenzhang/qiyuan/exp/newexp/amd_ablation/amd-s-t1-d512-doubleref-ablation/config.json
+amd_ckpt_path: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/pretrained_weights/rec.safetensors
+output_dir: /mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/sample/reconstruction/facevid_step20
+amd_sample_steps: 20