# Source listing metadata: revision 9fd03c9, file size 2,217 bytes.
# Run-wide settings.
# sample_rate and downsampling_ratio carry the same values as the keys of the
# same name under model.autoencoder; which copy each consumer reads is not
# visible from this file, so keep both in agreement when editing.
# sample_rate: presumably in Hz -- TODO confirm against the data pipeline.
sample_rate: 24000
# Same number as the product of the autoencoder strides (2 * 4 * 6 * 10 = 480).
downsampling_ratio: 480
# presumably the global random seed -- confirm against the training entry point
seed: 42
# Model definition. Each `_target_` value is a dotted import path; the sibling
# keys are presumably passed to that target as keyword arguments (Hydra-style
# instantiation -- confirm in the training entry point).
model:
  _target_: models.diffusion.SingleTaskCrossAttentionAudioDiffusion

  # Scalar settings of the diffusion wrapper itself.
  # content_dim has the same value as content_encoder.embed_dim and
  # backbone.text_dim (1024).
  content_dim: 1024
  noise_scheduler_name: stabilityai/stable-diffusion-2-1
  snr_gamma: 5.0
  cfg_drop_ratio: 0.2

  # Waveform autoencoder, initialised from a checkpoint path.
  autoencoder:
    _target_: models.autoencoder.waveform.stable_vae.StableVAE
    pretrained_ckpt: ckpt/mmedit/vae/epoch=13-step=1000000.ckpt
    # Same values as the top-level sample_rate / downsampling_ratio keys.
    sample_rate: 24000
    downsampling_ratio: 480
    io_channels: 1
    latent_dim: 128
    bottleneck:
      _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck
    encoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
      in_channels: 1
      channels: 128
      c_mults: [1, 2, 4, 8]
      # Product of the strides is 480, the configured downsampling_ratio.
      strides: [2, 4, 6, 10]
      # Twice the decoder / autoencoder latent_dim (128).
      # NOTE(review): presumably the bottleneck splits this into two
      # 128-wide halves -- confirm in VAEBottleneck.
      latent_dim: 256
      use_snake: true
    decoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
      out_channels: 1
      channels: 128
      # Same multipliers and strides as the encoder.
      c_mults: [1, 2, 4, 8]
      strides: [2, 4, 6, 10]
      latent_dim: 128
      use_snake: true
      final_tanh: false

  # Diffusion backbone.
  backbone:
    _target_: models.dit.mmdit_back.MMAudio
    # Width settings: latent_dim equals autoencoder.latent_dim (128);
    # text_dim equals content_dim (1024).
    latent_dim: 128
    text_dim: 1024
    hidden_dim: 1024
    # Depth / attention settings.
    depth: 12
    fused_depth: 8
    num_heads: 16
    mlp_ratio: 4.0
    # Sequence lengths: text_seq_len equals the text encoder's max_length (320).
    latent_seq_len: 500
    text_seq_len: 320
    # ta_context_dim equals content_encoder.audio_encoder.embed_dim (128).
    ta_context_dim: 128
    ta_context_fusion: concat
    ta_context_norm: false

  # Conditioning encoders: one text encoder and one audio encoder.
  content_encoder:
    _target_: models.content_encoder.content_encoder.ContentEncoder
    embed_dim: 1024
    text_encoder:
      _target_: models.content_encoder.llm_encoder.Qwen2AudioEmbedder
      model_path: ckpt/qwen2-audio-7B-instruct
      embed_dim: 1024
      max_length: 320
    audio_encoder:
      _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
      # vae_dim equals autoencoder.latent_dim (128).
      vae_dim: 128
      embed_dim: 128
# Loss object, given as a dotted import path with no extra arguments.
# NOTE(review): "Indentity" looks like a misspelling of "Identity", but this
# string must match the class name actually defined in losses.base -- confirm
# there before renaming either side.
loss_fn:
  _target_: losses.base.IndentityWrapper
# Learning-rate warmup settings. Only warmup_steps is set; warmup_epochs and
# epoch_length are explicitly null.
# NOTE(review): presumably the trainer uses warmup_steps directly when it is
# non-null and otherwise derives it from the other two -- confirm in the
# training code.
warmup_params:
  warmup_steps: 1000
  warmup_epochs: null
  epoch_length: null
# Set to 1; presumably this means an optimizer step after every batch (no
# accumulation) -- confirm how the trainer consumes it.
gradient_accumulation_steps: 1
# Optimizer: torch.optim.AdamW. `lr` and `weight_decay` are constructor
# arguments of that class; the parameter list is not given here and is
# presumably supplied by the training code at instantiation time.
optimizer:
  _target_: torch.optim.AdamW
  lr: 3.0e-05
  weight_decay: 0.01
# LR schedule built through transformers.get_scheduler with the "linear"
# schedule name. The optimizer and step counts that get_scheduler also accepts
# are not set here; presumably the trainer passes them (see warmup_params).
lr_scheduler:
  _target_: transformers.get_scheduler
  name: linear
# Training length: 100 epochs. epoch_length is explicitly null here (the key of
# the same name under warmup_params is also null).
# NOTE(review): presumably null means "use the full dataloader length" --
# confirm in the trainer.
epochs: 100
epoch_length: null