# MMEdit / config.yaml
# CocoBro's picture
# Upload folder using huggingface_hub
# 9fd03c9 verified
# Global settings shared by the whole run.
# Audio sample rate; presumably in Hz — confirm with the data pipeline.
sample_rate: 24000
# 24000 / 480 = 50; presumably waveform samples per latent frame — TODO confirm.
downsampling_ratio: 480
# Presumably the RNG seed used for reproducibility — confirm with the trainer.
seed: 42
# Model definition. Each `_target_` is a dotted import path to the class that
# is built from the sibling keys of the same mapping.
#
# NOTE(review): the nesting below was reconstructed from the `_target_`
# boundaries because the flattened form had every key at column 0, which
# produced duplicate keys (`_target_`, `latent_dim`, `embed_dim`, ...) and
# left `model:` / `autoencoder:` empty. Confirm against the class signatures.
model:
  # Waveform autoencoder (encoder + bottleneck + decoder).
  autoencoder:
    _target_: models.autoencoder.waveform.stable_vae.StableVAE
    encoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
      in_channels: 1
      channels: 128
      c_mults:
      - 1
      - 2
      - 4
      - 8
      # 2 * 4 * 6 * 10 = 480, which equals `downsampling_ratio` below.
      strides:
      - 2
      - 4
      - 6
      - 10
      # Twice the decoder/autoencoder latent_dim (128); presumably the
      # bottleneck splits this into two halves — TODO confirm.
      latent_dim: 256
      use_snake: true
    decoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
      out_channels: 1
      channels: 128
      c_mults:
      - 1
      - 2
      - 4
      - 8
      strides:
      - 2
      - 4
      - 6
      - 10
      latent_dim: 128
      use_snake: true
      final_tanh: false
    io_channels: 1
    latent_dim: 128
    # Same values as the top-level `downsampling_ratio` / `sample_rate`.
    downsampling_ratio: 480
    sample_rate: 24000
    pretrained_ckpt: ckpt/mmedit/vae/epoch=13-step=1000000.ckpt
    bottleneck:
      _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck
  # Backbone network operating on the 128-dim latents.
  backbone:
    _target_: models.dit.mmdit_back.MMAudio
    latent_dim: 128
    text_dim: 1024
    hidden_dim: 1024
    depth: 12
    fused_depth: 8
    num_heads: 16
    mlp_ratio: 4.0
    latent_seq_len: 500
    # Same value as `content_encoder.text_encoder.max_length` (320).
    text_seq_len: 320
    ta_context_dim: 128
    ta_context_fusion: concat
    ta_context_norm: false
  # NOTE(review): `content_dim` is placed at model level (next to the other
  # diffusion-wrapper arguments) rather than inside `backbone`; the flattened
  # source cannot disambiguate this — verify against the constructor.
  content_dim: 1024
  noise_scheduler_name: stabilityai/stable-diffusion-2-1
  snr_gamma: 5.0
  cfg_drop_ratio: 0.2
  # Class instantiated for the `model` mapping itself.
  _target_: models.diffusion.SingleTaskCrossAttentionAudioDiffusion
  # Conditioning encoders (text and audio).
  content_encoder:
    _target_: models.content_encoder.content_encoder.ContentEncoder
    embed_dim: 1024
    text_encoder:
      _target_: models.content_encoder.llm_encoder.Qwen2AudioEmbedder
      model_path: ckpt/qwen2-audio-7B-instruct
      embed_dim: 1024
      max_length: 320
    audio_encoder:
      _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
      vae_dim: 128
      embed_dim: 128
# Loss wrapper; `_target_` is nested so it no longer collides with the other
# `_target_` keys in the file.
loss_fn:
  # NOTE(review): "Indentity" looks like a misspelling of "Identity", but this
  # string must match the class name in `losses.base` — do not rename here
  # without renaming the class.
  _target_: losses.base.IndentityWrapper
# Learning-rate warmup settings. Nested so that this `epoch_length` does not
# duplicate the top-level `epoch_length` key.
warmup_params:
  warmup_steps: 1000
  # Explicit nulls: only `warmup_steps` is set in this config.
  warmup_epochs: null
  epoch_length: null
# NOTE(review): kept at top level as a trainer option rather than a warmup
# parameter — confirm with the training entry point.
gradient_accumulation_steps: 1
# Optimizer: class path plus the keyword arguments given to it.
optimizer:
  _target_: torch.optim.AdamW
  lr: 3.0e-05
  weight_decay: 0.01
# LR scheduler factory and the schedule name passed to it.
lr_scheduler:
  _target_: transformers.get_scheduler
  name: linear
# NOTE(review): `epochs` and `epoch_length` are treated as top-level training
# settings, not scheduler arguments — confirm with the training entry point.
epochs: 100
epoch_length: null