rsxdalv CypressYang committed on
Commit
0933517
·
verified ·
0 Parent(s):

Duplicate from CypressYang/SongBloom

Browse files

Co-authored-by: CypressYang <CypressYang@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ - zh
6
+ library_name: songbloom
7
+ ---
8
+
9
+ ## Introduction
10
+ We propose SongBloom, a novel framework for full-length song generation that leverages an interleaved paradigm of autoregressive sketching and diffusion-based refinement. SongBloom employs an autoregressive diffusion model that combines the high fidelity of diffusion models with the scalability of language models. Specifically, it gradually extends a musical sketch from short to long and refines the details from coarse to fine. The interleaved generation paradigm effectively integrates prior semantic and acoustic context to guide the generation process. Experimental results demonstrate that SongBloom outperforms existing methods across both subjective and objective metrics and achieves performance comparable to state-of-the-art commercial music generation platforms.
11
+
12
+ ## Model Configuration
13
+ TODO
14
+
15
+ ## Papers
16
+ * [Model Paper](https://huggingface.co/papers/2506.07634)
17
+ * [GitHub Repo](https://github.com/Cypress-Yang/SongBloom)
autoencoder_music_dsp1920.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ccb6c83613781ad32e998a90597ba7eb9292911a224598da1fd53728eb4cd3
3
+ size 674920616
songbloom_full_150s.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaeaee1dc889c8790e53064189ddcb7c66396cbfef3b15794a53e41d17d55fd2
3
+ size 7827256017
songbloom_full_150s.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cfg_file:
2
+ precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
3
+ min_dur: 60
4
+ max_dur: 150
5
+ sr: 48000
6
+
7
+ pretrained_path: ${dynamic_path:???/songbloom_full_150s.pt}
8
+ continue_checkpoint:
9
+
10
+ train_dataset:
11
+ lyric_processor: phoneme
12
+ prompt_len: 10
13
+
14
+ vae:
15
+ vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
16
+ vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
17
+ sr: ${sr}
18
+
19
+ model:
20
+ block_size: 16
21
+ latent_dim: 64
22
+ dim: 1536
23
+ num_heads: 24
24
+ lm_layers: 36
25
+ diff_layers: 12
26
+ num_pitch: 16384
27
+ time_cond_type: prepend
28
+ timestep_features_dim: 256
29
+ diffusion_objective: rectified_flow
30
+ timestep_sampler: logit_normal
31
+ backend: llama
32
+ rotary_base_val: 20000
33
+ init_std: 0.02
34
+ h_dropout: 0.05
35
+
36
+ condition_provider_cfg:
37
+ prompt_wav:
38
+ type: audio_tokenizer_wrapper
39
+ output_dim: ${model.dim}
40
+ audio_tokenizer:
41
+ max_len: 250 # 25 frames/s * 10 s prompt (presumably 48000 Hz / 1920 downsampling = 25 Hz latent rate; confirm)
42
+ lyrics:
43
+ type: phoneme_tokenizer
44
+ output_dim: ${model.dim}
45
+ vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
46
+ max_len: 600
47
+ max_sentence_per_structure: 50
48
+ mode: sum
49
+
50
+
51
+ cfg_dropout: 0.1
52
+ attribute_dropout:
53
+ text:
54
+ lyrics: 0.
55
+ wav:
56
+ prompt_wav: 0.1
57
+
58
+ fuser_cfg:
59
+ cross_attention_pos_emb: false
60
+ cross_attention_pos_emb_scale: 1
61
+ sum: []
62
+ prepend: [lyrics, prompt_wav]
63
+ cross: []
64
+ input_interpolate: []
65
+
66
+
67
+
68
+ inference:
69
+ cfg_coef: 1.5
70
+ temp: 0.9
71
+ diff_temp: 0.95
72
+ top_k: 100
73
+ penalty_repeat: True
74
+ penalty_window: 50
75
+ steps: 36
76
+ dit_cfg_type: h
songbloom_full_150s_dpo.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:734a5f2700ca8fb660d8a904ba513610aef3a46cf6f10e4db365e02df7e31758
3
+ size 7258793371
songbloom_full_150s_dpo.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cfg_file:
2
+ precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
3
+ min_dur: 60
4
+ max_dur: 150
5
+ sr: 48000
6
+
7
+ pretrained_path: ${dynamic_path:???/songbloom_full_150s_dpo.pt}
8
+ continue_checkpoint:
9
+
10
+ train_dataset:
11
+ lyric_processor: phoneme
12
+ prompt_len: 10
13
+
14
+ vae:
15
+ vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
16
+ vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
17
+ sr: ${sr}
18
+
19
+ model:
20
+ block_size: 16
21
+ latent_dim: 64
22
+ dim: 1536
23
+ num_heads: 24
24
+ lm_layers: 36
25
+ diff_layers: 12
26
+ num_pitch: 16384
27
+ time_cond_type: prepend
28
+ timestep_features_dim: 256
29
+ diffusion_objective: rectified_flow
30
+ timestep_sampler: logit_normal
31
+ backend: llama
32
+ rotary_base_val: 20000
33
+ init_std: 0.02
34
+ h_dropout: 0.05
35
+
36
+ condition_provider_cfg:
37
+ prompt_wav:
38
+ type: audio_tokenizer_wrapper
39
+ output_dim: ${model.dim}
40
+ audio_tokenizer:
41
+ max_len: 250 # 25 frames/s * 10 s prompt (presumably 48000 Hz / 1920 downsampling = 25 Hz latent rate; confirm)
42
+ lyrics:
43
+ type: phoneme_tokenizer
44
+ output_dim: ${model.dim}
45
+ vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
46
+ max_len: 600
47
+ max_sentence_per_structure: 50
48
+ mode: sum
49
+
50
+
51
+ cfg_dropout: 0.1
52
+ attribute_dropout:
53
+ text:
54
+ lyrics: 0.
55
+ wav:
56
+ prompt_wav: 0.1
57
+
58
+ fuser_cfg:
59
+ cross_attention_pos_emb: false
60
+ cross_attention_pos_emb_scale: 1
61
+ sum: []
62
+ prepend: [lyrics, prompt_wav]
63
+ cross: []
64
+ input_interpolate: []
65
+
66
+
67
+
68
+ inference:
69
+ cfg_coef: 1.5
70
+ temp: 0.9
71
+ diff_temp: 0.95
72
+ top_k: 100
73
+ penalty_repeat: True
74
+ penalty_window: 50
75
+ steps: 36
76
+ dit_cfg_type: h
stable_audio_1920_vae.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "autoencoder",
3
+ "sample_size": 403200,
4
+ "sample_rate": 48000,
5
+ "audio_channels": 2,
6
+ "model": {
7
+ "encoder": {
8
+ "type": "oobleck",
9
+ "config": {
10
+ "in_channels": 2,
11
+ "channels": 128,
12
+ "c_mults": [1, 2, 4, 8, 16],
13
+ "strides": [2, 4, 4, 6, 10],
14
+ "latent_dim": 128,
15
+ "use_snake": true
16
+ }
17
+ },
18
+ "decoder": {
19
+ "type": "oobleck",
20
+ "config": {
21
+ "out_channels": 2,
22
+ "channels": 128,
23
+ "c_mults": [1, 2, 4, 8, 16],
24
+ "strides": [2, 4, 4, 6, 10],
25
+ "latent_dim": 64,
26
+ "use_snake": true,
27
+ "final_tanh": false
28
+ }
29
+ },
30
+ "bottleneck": {
31
+ "type": "vae"
32
+ },
33
+ "latent_dim": 64,
34
+ "downsampling_ratio": 1920,
35
+ "io_channels": 2
36
+ },
37
+ "training": {
38
+ "learning_rate": 1.5e-4,
39
+ "warmup_steps": 0,
40
+ "use_ema": true,
41
+ "optimizer_configs": {
42
+ "autoencoder": {
43
+ "optimizer": {
44
+ "type": "AdamW",
45
+ "config": {
46
+ "betas": [0.8, 0.99],
47
+ "lr": 1.5e-4,
48
+ "weight_decay": 1e-3
49
+ }
50
+ },
51
+ "scheduler": {
52
+ "type": "InverseLR",
53
+ "config": {
54
+ "inv_gamma": 200000,
55
+ "power": 0.5,
56
+ "warmup": 0.999
57
+ }
58
+ }
59
+ },
60
+ "discriminator": {
61
+ "optimizer": {
62
+ "type": "AdamW",
63
+ "config": {
64
+ "betas": [0.8, 0.99],
65
+ "lr": 3e-4,
66
+ "weight_decay": 1e-3
67
+ }
68
+ },
69
+ "scheduler": {
70
+ "type": "InverseLR",
71
+ "config": {
72
+ "inv_gamma": 200000,
73
+ "power": 0.5,
74
+ "warmup": 0.999
75
+ }
76
+ }
77
+ }
78
+ },
79
+ "loss_configs": {
80
+ "discriminator": {
81
+ "type": "encodec",
82
+ "config": {
83
+ "filters": 64,
84
+ "n_ffts": [2048, 1024, 512, 256, 128],
85
+ "hop_lengths": [512, 256, 128, 64, 32],
86
+ "win_lengths": [2048, 1024, 512, 256, 128]
87
+ },
88
+ "weights": {
89
+ "adversarial": 0.1,
90
+ "feature_matching": 5.0
91
+ }
92
+ },
93
+ "spectral": {
94
+ "type": "mrstft",
95
+ "config": {
96
+ "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32],
97
+ "hop_sizes": [512, 256, 128, 64, 32, 16, 8],
98
+ "win_lengths": [2048, 1024, 512, 256, 128, 64, 32],
99
+ "perceptual_weighting": true
100
+ },
101
+ "weights": {
102
+ "mrstft": 1.0
103
+ }
104
+ },
105
+ "time": {
106
+ "type": "l1",
107
+ "weights": {
108
+ "l1": 0.0
109
+ }
110
+ },
111
+ "bottleneck": {
112
+ "type": "kl",
113
+ "weights": {
114
+ "kl": 1e-4
115
+ }
116
+ }
117
+ },
118
+ "demo": {
119
+ "demo_every": 2000
120
+ }
121
+ }
122
+ }
vocab_g2p.yaml ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - ''
2
+ - '[verse]'
3
+ - '[chorus]'
4
+ - '[bridge]'
5
+ - '[intro]'
6
+ - '[outro]'
7
+ - '[inst]'
8
+ - '[silence]'
9
+ - '!'
10
+ - ','
11
+ - '-'
12
+ - .
13
+ - '?'
14
+ - AA
15
+ - AA0
16
+ - AA1
17
+ - AA2
18
+ - AE0
19
+ - AE1
20
+ - AE2
21
+ - AH0
22
+ - AH1
23
+ - AH2
24
+ - AO0
25
+ - AO1
26
+ - AO2
27
+ - AW0
28
+ - AW1
29
+ - AW2
30
+ - AY0
31
+ - AY1
32
+ - AY2
33
+ - B
34
+ - CH
35
+ - D
36
+ - DH
37
+ - E1
38
+ - E2
39
+ - E3
40
+ - E4
41
+ - E5
42
+ - EE
43
+ - EH0
44
+ - EH1
45
+ - EH2
46
+ - ER
47
+ - ER0
48
+ - ER1
49
+ - ER2
50
+ - EY0
51
+ - EY1
52
+ - EY2
53
+ - En1
54
+ - En2
55
+ - En3
56
+ - En4
57
+ - En5
58
+ - F
59
+ - G
60
+ - HH
61
+ - I
62
+ - IH
63
+ - IH0
64
+ - IH1
65
+ - IH2
66
+ - IY0
67
+ - IY1
68
+ - IY2
69
+ - JH
70
+ - K
71
+ - L
72
+ - M
73
+ - N
74
+ - NG
75
+ - OO
76
+ - OW0
77
+ - OW1
78
+ - OW2
79
+ - OY0
80
+ - OY1
81
+ - OY2
82
+ - P
83
+ - R
84
+ - S
85
+ - SH
86
+ - SP
87
+ - SP2
88
+ - SP3
89
+ - T
90
+ - TH
91
+ - U
92
+ - UH0
93
+ - UH1
94
+ - UH2
95
+ - UNK
96
+ - UW0
97
+ - UW1
98
+ - UW2
99
+ - V
100
+ - W
101
+ - Y
102
+ - Z
103
+ - ZH
104
+ - _
105
+ - a
106
+ - a1
107
+ - a2
108
+ - a3
109
+ - a4
110
+ - a5
111
+ - ai1
112
+ - ai2
113
+ - ai3
114
+ - ai4
115
+ - ai5
116
+ - an1
117
+ - an2
118
+ - an3
119
+ - an4
120
+ - an5
121
+ - ang1
122
+ - ang2
123
+ - ang3
124
+ - ang4
125
+ - ang5
126
+ - ao1
127
+ - ao2
128
+ - ao3
129
+ - ao4
130
+ - ao5
131
+ - b
132
+ - by
133
+ - c
134
+ - ch
135
+ - cl
136
+ - d
137
+ - dy
138
+ - e
139
+ - e1
140
+ - e2
141
+ - e3
142
+ - e4
143
+ - e5
144
+ - ei1
145
+ - ei2
146
+ - ei3
147
+ - ei4
148
+ - ei5
149
+ - en1
150
+ - en2
151
+ - en3
152
+ - en4
153
+ - en5
154
+ - eng1
155
+ - eng2
156
+ - eng3
157
+ - eng4
158
+ - eng5
159
+ - er1
160
+ - er2
161
+ - er3
162
+ - er4
163
+ - er5
164
+ - f
165
+ - g
166
+ - gy
167
+ - h
168
+ - hy
169
+ - i
170
+ - i01
171
+ - i02
172
+ - i03
173
+ - i04
174
+ - i05
175
+ - i1
176
+ - i2
177
+ - i3
178
+ - i4
179
+ - i5
180
+ - ia1
181
+ - ia2
182
+ - ia3
183
+ - ia4
184
+ - ia5
185
+ - ian1
186
+ - ian2
187
+ - ian3
188
+ - ian4
189
+ - ian5
190
+ - iang1
191
+ - iang2
192
+ - iang3
193
+ - iang4
194
+ - iang5
195
+ - iao1
196
+ - iao2
197
+ - iao3
198
+ - iao4
199
+ - iao5
200
+ - ie1
201
+ - ie2
202
+ - ie3
203
+ - ie4
204
+ - ie5
205
+ - in1
206
+ - in2
207
+ - in3
208
+ - in4
209
+ - in5
210
+ - ing1
211
+ - ing2
212
+ - ing3
213
+ - ing4
214
+ - ing5
215
+ - iong1
216
+ - iong2
217
+ - iong3
218
+ - iong4
219
+ - iong5
220
+ - ir1
221
+ - ir2
222
+ - ir3
223
+ - ir4
224
+ - ir5
225
+ - iu1
226
+ - iu2
227
+ - iu3
228
+ - iu4
229
+ - iu5
230
+ - j
231
+ - k
232
+ - ky
233
+ - l
234
+ - m
235
+ - my
236
+ - n
237
+ - ny
238
+ - o
239
+ - o1
240
+ - o2
241
+ - o3
242
+ - o4
243
+ - o5
244
+ - ong1
245
+ - ong2
246
+ - ong3
247
+ - ong4
248
+ - ong5
249
+ - ou1
250
+ - ou2
251
+ - ou3
252
+ - ou4
253
+ - ou5
254
+ - p
255
+ - py
256
+ - q
257
+ - r
258
+ - ry
259
+ - s
260
+ - sh
261
+ - t
262
+ - ts
263
+ - u
264
+ - u1
265
+ - u2
266
+ - u3
267
+ - u4
268
+ - u5
269
+ - ua1
270
+ - ua2
271
+ - ua3
272
+ - ua4
273
+ - ua5
274
+ - uai1
275
+ - uai2
276
+ - uai3
277
+ - uai4
278
+ - uai5
279
+ - uan1
280
+ - uan2
281
+ - uan3
282
+ - uan4
283
+ - uan5
284
+ - uang1
285
+ - uang2
286
+ - uang3
287
+ - uang4
288
+ - uang5
289
+ - ui1
290
+ - ui2
291
+ - ui3
292
+ - ui4
293
+ - ui5
294
+ - un1
295
+ - un2
296
+ - un3
297
+ - un4
298
+ - un5
299
+ - uo1
300
+ - uo2
301
+ - uo3
302
+ - uo4
303
+ - uo5
304
+ - v
305
+ - v1
306
+ - v2
307
+ - v3
308
+ - v4
309
+ - v5
310
+ - van1
311
+ - van2
312
+ - van3
313
+ - van4
314
+ - van5
315
+ - ve1
316
+ - ve2
317
+ - ve3
318
+ - ve4
319
+ - ve5
320
+ - vn1
321
+ - vn2
322
+ - vn3
323
+ - vn4
324
+ - vn5
325
+ - w
326
+ - x
327
+ - y
328
+ - z
329
+ - zh
330
+ - "\u2026"