he-shuwei committed on
Commit
24e8bdd
·
verified ·
1 Parent(s): 02364e1

Add pretrain encoder config

Browse files
Files changed (1) hide show
  1. pretrain_encoder/config.yaml +297 -0
pretrain_encoder/config.yaml ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: true
3
+ attention_dropout: 0.1
4
+ audio_num_mel_bins: 80
5
+ audio_sample_rate: 16000
6
+ base_config:
7
+ - configs/m2se_vtts.yaml
8
+ binary_data_dir: data/binary_data_pretrain_encoder
9
+ cfg_guidance_scale: 2.0
10
+ clip_grad_norm: 1.0
11
+ ddim_eta: 0.0
12
+ ddim_steps: 100
13
+ debug: false
14
+ dec_ffn_kernel_size: 9
15
+ dec_layers: 4
16
+ default_num_caption_tokens: 16
17
+ diff_decoder_type: transformer-Base
18
+ diff_loss_type: l2
19
+ dit_attn_backend: torch
20
+ dit_attn_mask_enabled: true
21
+ dit_checkpoint_activations: true
22
+ dit_drop_path_rate: 0.15
23
+ dit_dropout: 0.15
24
+ dit_long_skip_connection: true
25
+ dit_pe_attn_head: 1
26
+ dit_qk_norm: null
27
+ dropout: 0.1
28
+ ds_workers: 4
29
+ dur_loss: mse
30
+ dur_loss_weight: 0.0
31
+ dur_predictor_kernel: 3
32
+ dur_predictor_layers: 2
33
+ ema_decay: 0.9999
34
+ enc_ffn_kernel_size: 9
35
+ enc_layers: 4
36
+ eval_audio_num_samples: 10
37
+ ffn_act: gelu
38
+ ffn_hidden_size: 2048
39
+ ffn_padding: SAME
40
+ fft_size: 1024
41
+ fmax: 8000
42
+ fmin: 0
43
+ hidden_size: 512
44
+ hop_size: 256
45
+ infer: false
46
+ keep_bins: 80
47
+ lambda_energy: 0.05
48
+ lambda_f0: 0.5
49
+ lambda_uv: 0.5
50
+ lgsu_iterations: 2
51
+ load_clip: false
52
+ lr: 1.0
53
+ max_epochs: 1000
54
+ max_frames: 8000
55
+ max_sentences: 1024
56
+ max_tokens: 200000
57
+ max_updates: 1000000
58
+ max_valid_sentences: 16
59
+ max_valid_tokens: 25000
60
+ mfa_output_dir: data/processed_data/mfa/outputs
61
+ mlm_loss_weight: 1.0
62
+ mlm_mask_prob: 0.15
63
+ num_ckpt_keep: 5
64
+ num_heads: 8
65
+ num_sanity_val_steps: 1
66
+ num_valid_plots: 10
67
+ optimizer_adam_beta2: 0.999
68
+ persistent_workers: true
69
+ phone_set_path: data/binary_data/phone_set.json
70
+ pitch_loss: l1
71
+ pitch_norm: standard
72
+ pitch_type: frame
73
+ predictor_dropout: 0.5
74
+ predictor_grad: 0.1
75
+ predictor_hidden: 256
76
+ predictor_kernel: 5
77
+ predictor_layers: 3
78
+ prefetch_factor: 4
79
+ pretrained_decoder_path: null
80
+ pretrained_encoder_path: null
81
+ print_nan_grads: false
82
+ processed_data_dir: data/processed_data
83
+ raw_data_dir: data/raw_data/soundspaces_speech
84
+ resume_from_checkpoint: 0
85
+ rmvpe_ckpt: checkpoints/RMVPE/rmvpe.pt
86
+ save_best: true
87
+ save_codes: []
88
+ schedule_type: cosine
89
+ scheduler_type: cosine
90
+ seed: 1234
91
+ self_condition: false
92
+ sort_by_len: true
93
+ spatial_num_heads: 16
94
+ spec_aug_prob: 0.5
95
+ spec_max:
96
+ - 2.1879
97
+ - 1.8991
98
+ - 2.1358
99
+ - 2.1123
100
+ - 2.1055
101
+ - 2.1296
102
+ - 2.2195
103
+ - 2.136
104
+ - 2.089
105
+ - 2.0317
106
+ - 2.182
107
+ - 2.0508
108
+ - 1.9991
109
+ - 2.0789
110
+ - 2.1077
111
+ - 1.9954
112
+ - 2.0502
113
+ - 2.0491
114
+ - 1.9095
115
+ - 1.8531
116
+ - 1.9297
117
+ - 1.8946
118
+ - 1.844
119
+ - 1.9792
120
+ - 1.8273
121
+ - 1.9192
122
+ - 1.7508
123
+ - 1.7955
124
+ - 1.6119
125
+ - 1.6795
126
+ - 1.7442
127
+ - 1.5747
128
+ - 1.5096
129
+ - 1.6116
130
+ - 1.3568
131
+ - 1.579
132
+ - 1.2652
133
+ - 1.3127
134
+ - 1.5129
135
+ - 1.3126
136
+ - 1.3471
137
+ - 1.0709
138
+ - 1.0851
139
+ - 1.1595
140
+ - 0.8298
141
+ - 0.7789
142
+ - 0.9075
143
+ - 0.767
144
+ - 0.9798
145
+ - 0.7773
146
+ - 0.5978
147
+ - 0.8436
148
+ - 0.7244
149
+ - 0.8123
150
+ - 0.9104
151
+ - 0.8252
152
+ - 0.8225
153
+ - 0.7235
154
+ - 0.6883
155
+ - 0.8559
156
+ - 0.8016
157
+ - 0.783
158
+ - 0.8467
159
+ - 0.6792
160
+ - 0.8935
161
+ - 0.8483
162
+ - 0.571
163
+ - 0.7259
164
+ - 0.7561
165
+ - 0.8435
166
+ - 0.6317
167
+ - 0.6531
168
+ - 0.4406
169
+ - 0.3391
170
+ - 0.3603
171
+ - 0.2577
172
+ - 0.3985
173
+ - 0.538
174
+ - -0.0428
175
+ - -0.9947
176
+ spec_min:
177
+ - -11.5129
178
+ - -11.5129
179
+ - -11.5129
180
+ - -11.5129
181
+ - -11.5129
182
+ - -11.5129
183
+ - -11.5129
184
+ - -11.5129
185
+ - -11.5129
186
+ - -11.5129
187
+ - -11.5129
188
+ - -11.5129
189
+ - -11.5129
190
+ - -11.5129
191
+ - -11.5129
192
+ - -11.5129
193
+ - -11.5129
194
+ - -11.5129
195
+ - -11.5129
196
+ - -11.5129
197
+ - -11.5129
198
+ - -11.5129
199
+ - -11.5129
200
+ - -11.5129
201
+ - -11.5129
202
+ - -11.5129
203
+ - -11.5129
204
+ - -11.5129
205
+ - -11.5129
206
+ - -11.5129
207
+ - -11.5129
208
+ - -11.5129
209
+ - -11.5129
210
+ - -11.5129
211
+ - -11.5129
212
+ - -11.5129
213
+ - -11.5129
214
+ - -11.5129
215
+ - -11.5129
216
+ - -11.5129
217
+ - -11.5129
218
+ - -11.5129
219
+ - -11.5129
220
+ - -11.5129
221
+ - -11.5129
222
+ - -11.5129
223
+ - -11.5129
224
+ - -11.5129
225
+ - -11.5129
226
+ - -11.5129
227
+ - -11.5129
228
+ - -11.5129
229
+ - -11.5129
230
+ - -11.5129
231
+ - -11.5129
232
+ - -11.5129
233
+ - -11.5129
234
+ - -11.5129
235
+ - -11.5129
236
+ - -11.5129
237
+ - -11.5129
238
+ - -11.5129
239
+ - -11.5129
240
+ - -11.5129
241
+ - -11.5129
242
+ - -11.5129
243
+ - -11.5129
244
+ - -11.5129
245
+ - -11.5129
246
+ - -11.5129
247
+ - -11.5129
248
+ - -11.5129
249
+ - -11.5129
250
+ - -11.5129
251
+ - -11.5129
252
+ - -11.5129
253
+ - -11.5129
254
+ - -11.5129
255
+ - -11.5129
256
+ - -11.5129
257
+ spk_embed_dim: 192
258
+ task_cls: m2se_vtts.tasks.pretrain_task.EncoderPretrainTask
259
+ tb_log_interval: 10
260
+ test_input_dir: ''
261
+ test_num: 100
262
+ test_set_name: test_seen
263
+ text_dim: 768
264
+ timesteps: 1000
265
+ top_k_regions: 140
266
+ uncond_prob: 0.15
267
+ use_cfg_inference: true
268
+ use_controlnet_finetune: false
269
+ use_ddim: true
270
+ use_ema: true
271
+ use_energy_embed: false
272
+ use_gt_dur: false
273
+ use_gt_f0: false
274
+ use_pitch_embed: false
275
+ use_pos_embed: true
276
+ use_spec_augment: false
277
+ use_spk_embed: true
278
+ use_spk_id: false
279
+ use_uv: true
280
+ use_visual: false
281
+ uv_label_smoothing: 0.1
282
+ val_check_interval: 2000
283
+ val_prefixes:
284
+ - valid
285
+ - test_seen
286
+ - test_unseen
287
+ valid_monitor_key: val_loss
288
+ valid_monitor_mode: min
289
+ vision_dim: 1024
290
+ vocoder: bigvgan
291
+ vocoder_ckpt: checkpoints/bigvgan/g_00076000
292
+ vocoder_config: null
293
+ vt_enc_layers: 3
294
+ warmup_updates: 4000
295
+ weight_decay: 0.08
296
+ win_size: 1024
297
+ work_dir: checkpoints/pretrain_encoder