CocoBro committed on
Commit
9fd03c9
·
verified ·
1 Parent(s): 190bbc5

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +93 -0
config.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sample_rate: 24000
2
+ downsampling_ratio: 480
3
+ seed: 42
4
+ model:
5
+ autoencoder:
6
+ _target_: models.autoencoder.waveform.stable_vae.StableVAE
7
+ encoder:
8
+ _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
9
+ in_channels: 1
10
+ channels: 128
11
+ c_mults:
12
+ - 1
13
+ - 2
14
+ - 4
15
+ - 8
16
+ strides:
17
+ - 2
18
+ - 4
19
+ - 6
20
+ - 10
21
+ latent_dim: 256
22
+ use_snake: true
23
+ decoder:
24
+ _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
25
+ out_channels: 1
26
+ channels: 128
27
+ c_mults:
28
+ - 1
29
+ - 2
30
+ - 4
31
+ - 8
32
+ strides:
33
+ - 2
34
+ - 4
35
+ - 6
36
+ - 10
37
+ latent_dim: 128
38
+ use_snake: true
39
+ final_tanh: false
40
+ io_channels: 1
41
+ latent_dim: 128
42
+ downsampling_ratio: 480
43
+ sample_rate: 24000
44
+ pretrained_ckpt: ckpt/mmedit/vae/epoch=13-step=1000000.ckpt
45
+ bottleneck:
46
+ _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck
47
+ backbone:
48
+ _target_: models.dit.mmdit_back.MMAudio
49
+ latent_dim: 128
50
+ text_dim: 1024
51
+ hidden_dim: 1024
52
+ depth: 12
53
+ fused_depth: 8
54
+ num_heads: 16
55
+ mlp_ratio: 4.0
56
+ latent_seq_len: 500
57
+ text_seq_len: 320
58
+ ta_context_dim: 128
59
+ ta_context_fusion: concat
60
+ ta_context_norm: false
61
+ content_dim: 1024
62
+ noise_scheduler_name: stabilityai/stable-diffusion-2-1
63
+ snr_gamma: 5.0
64
+ cfg_drop_ratio: 0.2
65
+ _target_: models.diffusion.SingleTaskCrossAttentionAudioDiffusion
66
+ content_encoder:
67
+ _target_: models.content_encoder.content_encoder.ContentEncoder
68
+ embed_dim: 1024
69
+ text_encoder:
70
+ _target_: models.content_encoder.llm_encoder.Qwen2AudioEmbedder
71
+ model_path: ckpt/qwen2-audio-7B-instruct
72
+ embed_dim: 1024
73
+ max_length: 320
74
+ audio_encoder:
75
+ _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
76
+ vae_dim: 128
77
+ embed_dim: 128
78
+ loss_fn:
79
+ _target_: losses.base.IndentityWrapper
80
+ warmup_params:
81
+ warmup_steps: 1000
82
+ warmup_epochs: null
83
+ epoch_length: null
84
+ gradient_accumulation_steps: 1
85
+ optimizer:
86
+ _target_: torch.optim.AdamW
87
+ lr: 3.0e-05
88
+ weight_decay: 0.01
89
+ lr_scheduler:
90
+ _target_: transformers.get_scheduler
91
+ name: linear
92
+ epochs: 100
93
+ epoch_length: null