he-shuwei committed on
Commit
233a12e
·
verified ·
1 Parent(s): bf79a57

Upload checkpoints/pretrain_decoder/config.yaml with huggingface_hub

Browse files
checkpoints/pretrain_decoder/config.yaml ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: true
3
+ attention_dropout: 0.1
4
+ audio_num_mel_bins: 80
5
+ audio_sample_rate: 16000
6
+ base_config:
7
+ - configs/m2se_vtts.yaml
8
+ binary_data_dir: data/binary_data_pretrain_decoder
9
+ cfg_guidance_scale: 2.0
10
+ clip_grad_norm: 1.0
11
+ ddim_eta: 0.0
12
+ ddim_steps: 100
13
+ debug: false
14
+ dec_ffn_kernel_size: 9
15
+ dec_layers: 4
16
+ decoder_pretrain_mask_span: 10
17
+ decoder_pretrain_mask_start_prob: 0.065
18
+ decoder_pretrain_mel_mask: true
19
+ decoder_pretrain_spec_augment: true
20
+ default_num_caption_tokens: 16
21
+ diff_decoder_type: transformer-F5Base
22
+ diff_loss_type: l2
23
+ dit_attn_backend: torch
24
+ dit_attn_mask_enabled: true
25
+ dit_checkpoint_activations: true
26
+ dit_drop_path_rate: 0.15
27
+ dit_dropout: 0.15
28
+ dit_long_skip_connection: true
29
+ dit_pe_attn_head: 1
30
+ dit_qk_norm: null
31
+ dropout: 0.1
32
+ ds_workers: 4
33
+ dur_loss: mse
34
+ dur_predictor_kernel: 3
35
+ dur_predictor_layers: 2
36
+ ema_decay: 0.9999
37
+ enc_ffn_kernel_size: 9
38
+ enc_layers: 4
39
+ eval_audio_num_samples: 10
40
+ ffn_act: gelu
41
+ ffn_hidden_size: 2048
42
+ ffn_padding: SAME
43
+ fft_size: 1024
44
+ fmax: 8000
45
+ fmin: 0
46
+ hidden_size: 512
47
+ hop_size: 256
48
+ infer: false
49
+ keep_bins: 80
50
+ lambda_energy: 0.05
51
+ lambda_f0: 0.5
52
+ lambda_uv: 0.5
53
+ lgsu_iterations: 2
54
+ load_clip: false
55
+ lr: 0.0001
56
+ max_epochs: 1000
57
+ max_frames: 8000
58
+ max_sentences: 128
59
+ max_tokens: 128000
60
+ max_updates: 160000
61
+ max_valid_sentences: 128
62
+ max_valid_tokens: 128000
63
+ mfa_output_dir: data/processed_data/mfa/outputs
64
+ min_snr_gamma: 5
65
+ num_ckpt_keep: 3
66
+ num_heads: 8
67
+ num_sanity_val_steps: 1
68
+ num_valid_plots: 10
69
+ optimizer_adam_beta2: 0.999
70
+ persistent_workers: true
71
+ phone_set_path: data/binary_data/phone_set.json
72
+ pitch_loss: l1
73
+ pitch_norm: standard
74
+ pitch_type: frame
75
+ predictor_dropout: 0.5
76
+ predictor_grad: 0.1
77
+ predictor_hidden: 256
78
+ predictor_kernel: 5
79
+ predictor_layers: 3
80
+ prefetch_factor: 4
81
+ pretrained_decoder_path: null
82
+ pretrained_encoder_path: null
83
+ print_nan_grads: false
84
+ processed_data_dir: data/processed_data
85
+ raw_data_dir: data/raw_data/soundspaces_speech
86
+ resume_from_checkpoint: 0
87
+ rmvpe_ckpt: checkpoints/RMVPE/rmvpe.pt
88
+ save_best: true
89
+ save_codes: []
90
+ schedule_type: cosine
91
+ scheduler_type: cosine
92
+ seed: 1234
93
+ self_condition: false
94
+ sort_by_len: true
95
+ spatial_num_heads: 16
96
+ spec_aug_prob: 0.5
97
+ spec_augment_freq_masks: 2
98
+ spec_augment_freq_width: 10
99
+ spec_augment_time_masks: 2
100
+ spec_augment_time_width: 50
101
+ spec_max:
102
+ - 2.1879
103
+ - 1.8991
104
+ - 2.1358
105
+ - 2.1123
106
+ - 2.1055
107
+ - 2.1296
108
+ - 2.2195
109
+ - 2.136
110
+ - 2.089
111
+ - 2.0317
112
+ - 2.182
113
+ - 2.0508
114
+ - 1.9991
115
+ - 2.0789
116
+ - 2.1077
117
+ - 1.9954
118
+ - 2.0502
119
+ - 2.0491
120
+ - 1.9095
121
+ - 1.8531
122
+ - 1.9297
123
+ - 1.8946
124
+ - 1.844
125
+ - 1.9792
126
+ - 1.8273
127
+ - 1.9192
128
+ - 1.7508
129
+ - 1.7955
130
+ - 1.6119
131
+ - 1.6795
132
+ - 1.7442
133
+ - 1.5747
134
+ - 1.5096
135
+ - 1.6116
136
+ - 1.3568
137
+ - 1.579
138
+ - 1.2652
139
+ - 1.3127
140
+ - 1.5129
141
+ - 1.3126
142
+ - 1.3471
143
+ - 1.0709
144
+ - 1.0851
145
+ - 1.1595
146
+ - 0.8298
147
+ - 0.7789
148
+ - 0.9075
149
+ - 0.767
150
+ - 0.9798
151
+ - 0.7773
152
+ - 0.5978
153
+ - 0.8436
154
+ - 0.7244
155
+ - 0.8123
156
+ - 0.9104
157
+ - 0.8252
158
+ - 0.8225
159
+ - 0.7235
160
+ - 0.6883
161
+ - 0.8559
162
+ - 0.8016
163
+ - 0.783
164
+ - 0.8467
165
+ - 0.6792
166
+ - 0.8935
167
+ - 0.8483
168
+ - 0.571
169
+ - 0.7259
170
+ - 0.7561
171
+ - 0.8435
172
+ - 0.6317
173
+ - 0.6531
174
+ - 0.4406
175
+ - 0.3391
176
+ - 0.3603
177
+ - 0.2577
178
+ - 0.3985
179
+ - 0.538
180
+ - -0.0428
181
+ - -0.9947
182
+ spec_min:
183
+ - -11.5129
184
+ - -11.5129
185
+ - -11.5129
186
+ - -11.5129
187
+ - -11.5129
188
+ - -11.5129
189
+ - -11.5129
190
+ - -11.5129
191
+ - -11.5129
192
+ - -11.5129
193
+ - -11.5129
194
+ - -11.5129
195
+ - -11.5129
196
+ - -11.5129
197
+ - -11.5129
198
+ - -11.5129
199
+ - -11.5129
200
+ - -11.5129
201
+ - -11.5129
202
+ - -11.5129
203
+ - -11.5129
204
+ - -11.5129
205
+ - -11.5129
206
+ - -11.5129
207
+ - -11.5129
208
+ - -11.5129
209
+ - -11.5129
210
+ - -11.5129
211
+ - -11.5129
212
+ - -11.5129
213
+ - -11.5129
214
+ - -11.5129
215
+ - -11.5129
216
+ - -11.5129
217
+ - -11.5129
218
+ - -11.5129
219
+ - -11.5129
220
+ - -11.5129
221
+ - -11.5129
222
+ - -11.5129
223
+ - -11.5129
224
+ - -11.5129
225
+ - -11.5129
226
+ - -11.5129
227
+ - -11.5129
228
+ - -11.5129
229
+ - -11.5129
230
+ - -11.5129
231
+ - -11.5129
232
+ - -11.5129
233
+ - -11.5129
234
+ - -11.5129
235
+ - -11.5129
236
+ - -11.5129
237
+ - -11.5129
238
+ - -11.5129
239
+ - -11.5129
240
+ - -11.5129
241
+ - -11.5129
242
+ - -11.5129
243
+ - -11.5129
244
+ - -11.5129
245
+ - -11.5129
246
+ - -11.5129
247
+ - -11.5129
248
+ - -11.5129
249
+ - -11.5129
250
+ - -11.5129
251
+ - -11.5129
252
+ - -11.5129
253
+ - -11.5129
254
+ - -11.5129
255
+ - -11.5129
256
+ - -11.5129
257
+ - -11.5129
258
+ - -11.5129
259
+ - -11.5129
260
+ - -11.5129
261
+ - -11.5129
262
+ - -11.5129
263
+ spk_embed_dim: 192
264
+ task_cls: m2se_vtts.tasks.pretrain_task.DecoderPretrainTask
265
+ tb_log_interval: 10
266
+ test_input_dir: ''
267
+ test_num: 100
268
+ test_set_name: test_seen
269
+ text_dim: 768
270
+ timesteps: 1000
271
+ top_k_regions: 140
272
+ uncond_prob: 0.15
273
+ use_cfg_inference: true
274
+ use_controlnet_finetune: false
275
+ use_ddim: true
276
+ use_ema: true
277
+ use_energy_embed: true
278
+ use_gt_dur: false
279
+ use_gt_f0: false
280
+ use_pitch_embed: true
281
+ use_pos_embed: true
282
+ use_spec_augment: false
283
+ use_spk_embed: true
284
+ use_spk_id: false
285
+ use_uv: true
286
+ use_visual: false
287
+ uv_label_smoothing: 0.1
288
+ val_check_interval: 2000
289
+ val_prefixes:
290
+ - valid
291
+ - test_seen
292
+ - test_unseen
293
+ valid_monitor_key: val_loss
294
+ valid_monitor_mode: min
295
+ vision_dim: 1024
296
+ vocoder: bigvgan
297
+ vocoder_ckpt: checkpoints/bigvgan/g_00076000
298
+ vocoder_config: null
299
+ vt_enc_layers: 3
300
+ warmup_updates: 4000
301
+ weight_decay: 0.08
302
+ win_size: 1024
303
+ work_dir: checkpoints/pretrain_decoder_emilia