liweiche committed on
Commit
76698d9
·
verified ·
1 Parent(s): 525a5df

Upload folder using huggingface_hub

Browse files
kmeans_v200.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148b638a3f392906bb0ba92c398d2e6c252f186e99db4fa38501e428c58c1de9
3
+ size 819328
vae-gslm/hp.yaml ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ train:
3
+ batch_size: 48
4
+ bits_per_second: 18500
5
+ min_audio_length: 3.0
6
+ num_workers: 6
7
+ path: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/tokens_v200.txt
8
+ post_pad:
9
+ mel:
10
+ length: 12.8
11
+ tokens:
12
+ num_tokens: 640
13
+ preprocess_mels: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/mels
14
+ preprocess_mels_recursive_dir: true
15
+ random_crop_mel_utt:
16
+ max_seg_sec: 4.0
17
+ min_seg_sec: 2.0
18
+ sample_rate: 16000
19
+ sampler:
20
+ shuffle: true
21
+ type: standard
22
+ token_segment_size: 640
23
+ wavdir: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/
24
+ with_text: false
25
+ with_tokens: true
26
+ val:
27
+ batch_size: 8
28
+ bits_per_second: 18500
29
+ min_audio_length: 3.2
30
+ num_workers: 2
31
+ pad:
32
+ mode: constant
33
+ multiple_of: 400
34
+ path: /usr3/liweiche/LibriSpeech-960/dev/tokens_v200_libri-light.txt
35
+ random_crop_mel_utt:
36
+ max_seg_sec: 5.0
37
+ min_seg_sec: 1.0
38
+ sample_rate: 16000
39
+ sampler:
40
+ shuffle: true
41
+ type: standard
42
+ token_segment_size: 150
43
+ wavdir: /usr3/liweiche/LibriSpeech-960/dev
44
+ with_text: false
45
+ with_tokens: true
46
+ hubert:
47
+ sample_rate: 50
48
+ logging:
49
+ log_dir: outputs/libri-light/lvtr
50
+ num_samples: 10
51
+ plot_attn: false
52
+ sample_length: 7.0
53
+ sample_prior_length: 2.0
54
+ temperature: 1.0
55
+ model:
56
+ decoder:
57
+ cond_unet:
58
+ time_embedding:
59
+ activation:
60
+ identifier: SiLU
61
+ dim: 256
62
+ maxpos: 1000
63
+ unet:
64
+ condition_dim: 32
65
+ conditional:
66
+ - false
67
+ - true
68
+ - true
69
+ - true
70
+ - true
71
+ - false
72
+ connection_type: concat
73
+ final_norm: true
74
+ hidden_channels:
75
+ - 2048
76
+ - 2048
77
+ - 2048
78
+ - 2048
79
+ - 2048
80
+ - 2048
81
+ init_channel: 512
82
+ layer:
83
+ activation:
84
+ identifier: SiLU
85
+ aux_in_channels: 32
86
+ causal_padding: true
87
+ condition_type: concat
88
+ hidden_channels: 2048
89
+ in_channels: 512
90
+ in_dim: 32
91
+ kernel_size: 7
92
+ norm:
93
+ eps: 1.0e-06
94
+ identifier: InstanceNorm
95
+ time_dim: 256
96
+ num_layers: 6
97
+ out_channels:
98
+ - 512
99
+ - 512
100
+ - 512
101
+ - 512
102
+ - 512
103
+ - 512
104
+ resample_ksize:
105
+ - 1
106
+ - 1
107
+ - 1
108
+ - 1
109
+ - 1
110
+ - 1
111
+ resample_rates:
112
+ - 1
113
+ - 1
114
+ - 1
115
+ - 1
116
+ - 1
117
+ - 1
118
+ skip_connection:
119
+ - null
120
+ - null
121
+ - null
122
+ - 2
123
+ - 1
124
+ - 0
125
+ time_dim: 256
126
+ upward_layer:
127
+ activation:
128
+ identifier: SiLU
129
+ aux_in_channels: 0
130
+ boundary: 3
131
+ condition_type: concat
132
+ future_padding: true
133
+ hidden_channels: 2048
134
+ in_channels: 512
135
+ in_dim: 32
136
+ kernel_size: 7
137
+ norm:
138
+ eps: 1.0e-06
139
+ identifier: InstanceNorm
140
+ time_dim: 256
141
+ diffusion:
142
+ beta_schedule:
143
+ identifier: cosine
144
+ clamp_range:
145
+ - -3.0
146
+ - 1.2
147
+ ddim_sampling_eta: 1.0
148
+ identifier: ConditionalBottleNeckUNet
149
+ input_scale: 5.0
150
+ loss_type: l1
151
+ objective: pred_noise
152
+ timesteps: 1000
153
+ encoder:
154
+ final_norm: true
155
+ hidden_channels:
156
+ - 2048
157
+ - 2048
158
+ - 2048
159
+ identifier: BottleNeckResNet
160
+ init_channel: 512
161
+ layer:
162
+ activation:
163
+ identifier: ReLU
164
+ aux_in_channels: 0
165
+ causal_padding: true
166
+ hidden_channels: 2048
167
+ in_channels: 512
168
+ kernel_size: 7
169
+ norm:
170
+ eps: 1.0e-06
171
+ identifier: InstanceNorm
172
+ num_layers: 3
173
+ out_channels:
174
+ - 512
175
+ - 512
176
+ - 512
177
+ resample_ksize:
178
+ - 1
179
+ - 1
180
+ - 1
181
+ resample_rates:
182
+ - 1
183
+ - 1
184
+ - 1
185
+ latent_dim: 4
186
+ tokens:
187
+ embedding_dim: 64
188
+ vocab_size: 200
189
+ transformer:
190
+ bias: false
191
+ flow:
192
+ conditional: true
193
+ layer:
194
+ activation:
195
+ identifier: GELU
196
+ hidden_dim: 64
197
+ mean_only: false
198
+ norm:
199
+ eps: 1.0e-06
200
+ identifier: LayerNorm
201
+ scale_range:
202
+ - 0.5
203
+ - 2.0
204
+ num_layers: 4
205
+ layer:
206
+ activation:
207
+ identifier: GELU
208
+ dim: 1024
209
+ ffd_size: 4096
210
+ norm:
211
+ eps: 1.0e-06
212
+ identifier: RMSNorm
213
+ self_attn:
214
+ causal: true
215
+ nheads: 16
216
+ num_layers: 16
217
+ rpe:
218
+ identifier: ALiBi
219
+ maxpos: 1024
220
+ utterance_encoder:
221
+ embedding_dim: 128
222
+ init_channel: 64
223
+ layer:
224
+ activation:
225
+ identifier: ReLU
226
+ in_channels: 256
227
+ kernel_size: 4
228
+ norm:
229
+ eps: 1.0e-06
230
+ identifier: InstanceNorm
231
+ out_channels: 512
232
+ stride: -2
233
+ num_layers: 3
234
+ out_channels:
235
+ - 128
236
+ - 256
237
+ - 512
238
+ resample_ksize:
239
+ - 4
240
+ - 4
241
+ - 4
242
+ resample_rates:
243
+ - -2
244
+ - -2
245
+ - -2
246
+ trainer:
247
+ compile:
248
+ mode: default
249
+ ddp_strategy: ddp
250
+ distributed: true
251
+ identifier: trainers.speech.lvtr.LVTRTrainer
252
+ limit_val_batches: 500
253
+ precision: 16-mixed
254
+ save_every_n_epoch: 1
255
+ total_steps: 1200000
256
+ val_check_interval: 10000
257
+ training:
258
+ fixed_beta: 0.04
259
+ gradient_accumulation: 2
260
+ mel_rescale:
261
+ mean: -1.5
262
+ std: 2.0
263
+ optimizer:
264
+ beta1: 0.9
265
+ beta2: 0.98
266
+ exclude_norm_and_bias_from_weight_decay: true
267
+ identifier: AdamW
268
+ lr: 0.0005
269
+ weight_decay: 0.1
270
+ scale_rec_beta: false
271
+ scheduler:
272
+ flat_steps: 30000
273
+ identifier: cosine
274
+ min_lr: 5.0e-05
275
+ warmup_kld: 30000
276
+ token_kld_weight: 0.5
277
+ vocoder:
278
+ path: ./vocoder_ckpt
vae-gslm/last-cpt.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e59383ebf8c510af9f2390aae9485696fac16c4e83e0d0b460731270095310
3
+ size 908011386
vocoder/hp.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ trainer:
2
+ identifier: "trainers.vocoder.hfgan.HiFiGANTrainer"
3
+ total_steps: 1600000 # Total Steps * 2 (GANs)
4
+ check_val_every_n_epoch: 2
5
+ save_every_n_epoch: 2
6
+ limit_val_batches: 500
7
+ precision: "32"
8
+ distributed: false
9
+
10
+ logging:
11
+ log_dir: "outputs/hfgan_50hz_librispeech"
12
+ num_samples: 10
13
+
14
+ feature:
15
+ sample_rate: 16000
16
+ n_fft: 1025
17
+ win_length: 1024
18
+ hop_length: 320
19
+ n_mels: 80
20
+ f_min: 0
21
+ f_max: 8000
22
+ power: 1.0
23
+ log_scale: true
24
+
25
+ model:
26
+ generator:
27
+ weight_norm: true
28
+ upsample_rates: [5, 4, 2, 2, 2, 2]
29
+ upsample_kernel_sizes: [10, 8, 4, 4, 4, 4]
30
+ upsample_initial_channel: 512
31
+ resblock_kernel_sizes: [3, 7, 11]
32
+ resblock_dilation_sizes:
33
+ - [1, 3, 5]
34
+ - [1, 3, 5]
35
+ - [1, 3, 5]
36
+ in_channels: 80
37
+ kernel_size: 7
38
+
39
+ mrd:
40
+ weight_norm: true
41
+ resolutions:
42
+ - [1024, 120, 600]
43
+ - [2048, 240, 1200]
44
+ - [512, 50, 240]
45
+
46
+ mpd:
47
+ weight_norm: true
48
+ periods: [2, 3, 5, 7, 11]
49
+
50
+ training:
51
+ generator:
52
+ optimizer:
53
+ identifier: "Adam"
54
+ lr: 0.0001
55
+ beta1: 0.8
56
+ beta2: 0.98
57
+ scheduler:
58
+ identifier: "triangle"
59
+ warmup_steps: 0
60
+ flat_steps: 100000
61
+
62
+ discriminator:
63
+ optimizer:
64
+ identifier: "Adam"
65
+ lr: 0.0001
66
+ beta1: 0.8
67
+ beta2: 0.98
68
+ scheduler:
69
+ identifier: "triangle"
70
+ warmup_steps: 0
71
+ flat_steps: 100000
72
+
73
+ mel_loss_weight: 40.0
74
+
75
+ data:
76
+ train:
77
+ path: "/usr2/liweiche/LibriSpeech-960/train/metadata.txt"
78
+ wavdir: "/usr2/liweiche/LibriSpeech-960/train"
79
+ segment_size: 1.0
80
+ sample_rate: 16000
81
+ dither: true
82
+ with_text: false
83
+ num_workers: 32
84
+ batch_size: 24
85
+ min_audio_length: 1.5
86
+ bits_per_second: 18500
87
+
88
+ sampler:
89
+ type: "standard"
90
+ shuffle: true
91
+ val:
92
+ path: "/usr2/liweiche/LibriSpeech-960/dev/metadata.txt"
93
+ wavdir: "/usr2/liweiche/LibriSpeech-960/dev"
94
+ sample_rate: 16000
95
+ segment_size: 7.0
96
+ with_text: false
97
+ num_workers: 8
98
+ batch_size: 4
99
+ min_audio_length: 4.0
100
+ bits_per_second: 18500
101
+
102
+ sampler:
103
+ type: "standard"
104
+ shuffle: false
vocoder/last-cpt.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0497a846126b5b0a7df7f2004f303bb17010f11ba424ec0132d5b14cfffbac8c
3
+ size 51877178