sample_rate: 24000
downsampling_ratio: 480
seed: 42
model:
  _target_: models.diffusion.SingleTaskCrossAttentionAudioDiffusion
  autoencoder:
    _target_: models.autoencoder.waveform.stable_vae.StableVAE
    encoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
      in_channels: 1
      channels: 128
      c_mults:
      - 1
      - 2
      - 4
      - 8
      strides:
      - 2
      - 4
      - 6
      - 10
      latent_dim: 256
      use_snake: true
    decoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
      out_channels: 1
      channels: 128
      c_mults:
      - 1
      - 2
      - 4
      - 8
      strides:
      - 2
      - 4
      - 6
      - 10
      latent_dim: 128
      use_snake: true
      final_tanh: false
    bottleneck:
      _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck
    io_channels: 1
    latent_dim: 128
    downsampling_ratio: 480
    sample_rate: 24000
    pretrained_ckpt: ckpt/mmedit/vae/epoch=13-step=1000000.ckpt
  backbone:
    _target_: models.dit.mmdit_back.MMAudio
    latent_dim: 128
    text_dim: 1024
    hidden_dim: 1024
    depth: 12
    fused_depth: 8
    num_heads: 16
    mlp_ratio: 4.0
    latent_seq_len: 500
    text_seq_len: 320
    ta_context_dim: 128
    ta_context_fusion: concat
    ta_context_norm: false
    content_dim: 1024
  noise_scheduler_name: stabilityai/stable-diffusion-2-1
  snr_gamma: 5.0
  cfg_drop_ratio: 0.2
  # NOTE(review): nesting below reconstructed from a flattened dump; confirm
  # whether content_encoder belongs under `model` or at the top level.
  content_encoder:
    _target_: models.content_encoder.content_encoder.ContentEncoder
    embed_dim: 1024
    text_encoder:
      _target_: models.content_encoder.llm_encoder.Qwen2AudioEmbedder
      model_path: ckpt/qwen2-audio-7B-instruct
      embed_dim: 1024
      max_length: 320
    audio_encoder:
      _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
      vae_dim: 128
      embed_dim: 128
loss_fn:
  _target_: losses.base.IndentityWrapper
warmup_params:
  warmup_steps: 1000
  warmup_epochs: null
  epoch_length: null
gradient_accumulation_steps: 1
optimizer:
  _target_: torch.optim.AdamW
  lr: 3.0e-05
  weight_decay: 0.01
lr_scheduler:
  _target_: transformers.get_scheduler
  name: linear
epochs: 100
epoch_length: null