hik63382 committed on
Commit
8cd9b6b
·
verified ·
1 Parent(s): cca84d5

Full NZG NZG 73 model: t3_turbo_v1.yaml

Browse files
Files changed (1) hide show
  1. t3_turbo_v1.yaml +387 -0
t3_turbo_v1.yaml ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ align_f0: false
2
+ align_loss_weight: 1.0
3
+ asc_loss_weight: 0.02
4
+ attention_mechanism: graves
5
+ augment_sr: false
6
+ base_model: null
7
+ bit_depth: 9
8
+ causal_convs: false
9
+ causal_decoder: false
10
+ clap_dims: 512
11
+ compat_dcnar_f0_std_cond: false
12
+ conv_stack_dilation: !!python/tuple
13
+ - 1
14
+ - 3
15
+ - 9
16
+ - 27
17
+ convbn_bias: false
18
+ cudnn_deterministic: false
19
+ dcnar_1d_discrim: false
20
+ dcnar_aligner_kernel: 5
21
+ dcnar_aligner_type: null
22
+ dcnar_allow_trivial_speaker_table: true
23
+ dcnar_batch_size: 24
24
+ dcnar_conformer: false
25
+ dcnar_conformer_attn_chunk_size: null
26
+ dcnar_conformer_attn_dim_head: 64
27
+ dcnar_conformer_attn_ff_mult: 4
28
+ dcnar_conformer_attn_win_size: null
29
+ dcnar_conv_weight_grouping: 1
30
+ dcnar_df0_loss_weight: 0.5
31
+ dcnar_dim_lrg: 512
32
+ dcnar_dim_sml: 256
33
+ dcnar_dim_style: 32
34
+ dcnar_discrim_tanh: false
35
+ dcnar_dtw_loss_weight: 1
36
+ dcnar_dur_loss_weight: 0.1
37
+ dcnar_dur_pred_scale: linear
38
+ dcnar_f0_cond_mel_decoding: false
39
+ dcnar_f0_cond_mel_decoding_teacher_forcing: true
40
+ dcnar_f0_loss_weight: 0.5
41
+ dcnar_gan_dims: 64
42
+ dcnar_global_style: true
43
+ dcnar_hard_gumbel_tones: false
44
+ dcnar_hubert_downsample: 1
45
+ dcnar_inpaint_vae: false
46
+ dcnar_inpaint_vae_kld_loss_weight: 0
47
+ dcnar_inpaint_vae_latent_dim: 32
48
+ dcnar_inpaint_vae_warmup_steps: 5000
49
+ dcnar_inpaint_vae_weight_step_size: 0.0002
50
+ dcnar_local_f0: false
51
+ dcnar_local_intensity: false
52
+ dcnar_local_style: false
53
+ dcnar_lr: 0.0001
54
+ dcnar_mel_adv: false
55
+ dcnar_mel_loss_weight: 10.0
56
+ dcnar_mixed_sr_loss: false
57
+ dcnar_n_terminal_tones: 0
58
+ dcnar_ph_f0_loss_weight: 1.0
59
+ dcnar_ph_hubert_loss_weight: 1.0
60
+ dcnar_ph_intensity_loss_weight: 1.0
61
+ dcnar_pitch_adv: false
62
+ dcnar_prosody_adv: false
63
+ dcnar_prosody_stats_cond: false
64
+ dcnar_pstat_weight_f0_mean: 10
65
+ dcnar_pstat_weight_f0_std: 100
66
+ dcnar_pstat_weight_intensity_mean: 10
67
+ dcnar_pstat_weight_intensity_std: 0
68
+ dcnar_pstat_weight_phdur_mean: 1
69
+ dcnar_pstat_weight_phdur_std: 1
70
+ dcnar_reverb_label: false
71
+ dcnar_sampler: default
72
+ dcnar_sr_label: false
73
+ dcnar_terminal_tone_usl_weight: 0
74
+ dcnar_terminal_tone_weight: 0
75
+ dcnar_upsampling: gaussian
76
+ dcnar_use_log_f0_frames: false
77
+ dcnar_use_toucan_utt_embs: false
78
+ dcnar_usl_mfcc: false
79
+ dcnar_usl_mfcc_deltas: false
80
+ dcnar_usl_mfcc_dim: 12
81
+ dcnar_usl_mfcc_var_dec: false
82
+ dcnar_usl_slim: false
83
+ dcnar_usl_slim_dim: 16
84
+ dcnar_usl_with_f0: false
85
+ dcnar_utt_dur_loss_weight: 0
86
+ dcnar_vc_local_hubert: false
87
+ dcnar_vc_mode: nn
88
+ dcnar_vc_text_predict: false
89
+ dcnar_vuv_loss_weight: 0.5
90
+ dcvoc_causal: false
91
+ dcvoc_causal_lookahead: 3
92
+ dcvoc_channel_downsample_mode: interleave
93
+ dcvoc_convs_per_scale: 8
94
+ dcvoc_disc_duplicates: 1
95
+ dcvoc_disc_mpwd: true
96
+ dcvoc_disc_mrsd: false
97
+ dcvoc_disc_pdd: true
98
+ dcvoc_disc_phase_aug: false
99
+ dcvoc_discriminator_bound: 1.01
100
+ dcvoc_groups_init: 8
101
+ dcvoc_halfres_conv: true
102
+ dcvoc_hidden_init: 1024
103
+ dcvoc_hop: 8
104
+ dcvoc_kernel: 7
105
+ dcvoc_mel_bneck: 256
106
+ dcvoc_smpwd_hidden_max: 1024
107
+ dcvoc_smpwd_periods:
108
+ - 2
109
+ - 3
110
+ - 5
111
+ - 7
112
+ - 9
113
+ - 11
114
+ - 13
115
+ dcvoc_upsample_method: linear
116
+ denoise: false
117
+ dfd_clip_stft: 1.0e-09
118
+ dfd_ramdisk_path: /mnt/ramdisk
119
+ ema_coeff: 0.99995
120
+ emo_embedded_speaker_id: false
121
+ emotion_adv: false
122
+ enable_eos_bos_chars: true
123
+ encoder_type: voice_encoder
124
+ eval_crosslang: false
125
+ eval_langs: dataset
126
+ eval_max_ref_samples: 192
127
+ eval_max_repeats: 1
128
+ eval_max_runs: 10
129
+ eval_max_sentences: 192
130
+ eval_mbnet_name: null
131
+ eval_models_dir: saved_models
132
+ eval_n_plots: 2
133
+ eval_n_wavs: 4
134
+ eval_reference: train
135
+ eval_syn_batch_size: 64
136
+ eval_text_source: default
137
+ eval_ve_name: universal/ve_v2
138
+ eval_voc_max_frames: 2000
139
+ eval_voc_name: null
140
+ f0_mode: praat
141
+ flatten_lstm_params: true
142
+ fmax: 16000
143
+ fmin: 0
144
+ frames_per_framegroup: 10
145
+ freeze_mel_head: false
146
+ gmvae_ema_lr: 0.0001
147
+ gmvae_latent_dim: 16
148
+ gmvae_num_components: 0
149
+ gpt_masked_loss: false
150
+ gpt_prod_max_text: 200
151
+ gpt_speaker_ref_type: same_speaker
152
+ gpt_transformer_type: gpt2-medium
153
+ hifigan_channels: 256
154
+ hooli_enc_dims: 256
155
+ hooli_filter_size: 257
156
+ hooli_inv_no_uv: false
157
+ hooli_inv_pitch_diff_reg_weight: 0
158
+ hooli_inv_pitch_shift_reg_weight: 0
159
+ hooli_nfft: 16
160
+ hooli_osc_freq_cutoff: 0.15
161
+ hooli_safe_step: true
162
+ hooli_tv_fir: false
163
+ hooli_wn_dims: 64
164
+ hooligan_discriminators: univnet
165
+ hooligan_istft: true
166
+ hop_size: 320
167
+ input_pos_emb: handled_internally_by_backbone
168
+ is_lora: false
169
+ language_embed_size: 16
170
+ legacy_gpt_hidden_size: 1024
171
+ lfcc_nfilts: 128
172
+ llama_config_name: Llama_520M
173
+ lora_alpha: 64
174
+ lora_dropout: 0.05
175
+ lora_r: 32
176
+ lossynet_bsize: 25
177
+ lossynet_clip_stft: 1.0e-09
178
+ lossynet_lr: 0.001
179
+ lossynet_n_out_classes: 2
180
+ lowest_sr: 8000
181
+ max_LR: 0.001
182
+ max_conditioning_inputs: 2
183
+ max_decoder_frames: 2000
184
+ max_f0_freq: 600
185
+ max_speech_tokens: 604
186
+ max_text_tokens: 402
187
+ max_total_tokens: 8196
188
+ mel_pad_difference: 1
189
+ mel_power: 1.0
190
+ mel_type: db
191
+ min_LR: 1.0e-06
192
+ min_f0_freq: 75
193
+ mpbert_n_freeze: 0
194
+ mpbert_tokenizer: null
195
+ mpbert_type: transformer
196
+ mu_law: true
197
+ n_cqcc_bins: 96
198
+ n_cqt_bins: 84
199
+ n_fft: 2048
200
+ n_gpt_channels: 1024
201
+ n_reverbs: 256
202
+ n_spk_cond_samples: 2
203
+ n_state_per_symbol: 1
204
+ n_transformer_heads: 16
205
+ n_transformer_layers: 30
206
+ normalize_loudness: false
207
+ normalized_mels: true
208
+ num_ceps: 29
209
+ num_diacritcs: 512
210
+ num_freq: 1025
211
+ num_heads: 4
212
+ num_mels: 256
213
+ num_style_tokens: 0
214
+ num_tones: 16
215
+ onehot_language: false
216
+ onehot_speaker: false
217
+ pf_word_boundaries: false
218
+ phonemizer_backend: espeak
219
+ preemphasis: 0.97
220
+ preemphasize_voc_target: false
221
+ prenet_type: original
222
+ project_conditioning: false
223
+ prosody_embed_size: 0
224
+ r_schedule:
225
+ - - 1
226
+ - -1
227
+ rvc_emb_channels: 768
228
+ rvc_enc_spk_input: false
229
+ rvc_f0_up: 0
230
+ rvc_f0_voc: true
231
+ rvc_filter_channels: 768
232
+ rvc_gin_channels: 256
233
+ rvc_hidden_channels: 192
234
+ rvc_inter_channels: 192
235
+ rvc_kernel_size: 3
236
+ rvc_mel_bins: 80
237
+ rvc_n_heads: 2
238
+ rvc_n_layers: 6
239
+ rvc_p_dropout: 0
240
+ rvc_resblock: '1'
241
+ rvc_resblock_dilation_sizes:
242
+ - - 1
243
+ - 3
244
+ - 5
245
+ - - 1
246
+ - 3
247
+ - 5
248
+ - - 1
249
+ - 3
250
+ - 5
251
+ rvc_resblock_kernel_sizes:
252
+ - 3
253
+ - 7
254
+ - 11
255
+ rvc_seg_enc_size_frames: 370
256
+ rvc_seg_enc_size_samples: 118400
257
+ rvc_seg_voc_size_frames: 40
258
+ rvc_seg_voc_size_samples: 12800
259
+ rvc_speaker_enc: table
260
+ rvc_speaker_enc_type: V1
261
+ rvc_speaker_pitch: null
262
+ rvc_spec_channels: 513
263
+ rvc_spk_embed_dim: 109
264
+ rvc_stft_filter_len: 1024
265
+ rvc_stft_win_len: 1024
266
+ rvc_train_kl_weight: 1.0
267
+ rvc_train_mel_weight: 45
268
+ rvc_upsample_initial_channel: 512
269
+ rvc_upsample_kernel_sizes:
270
+ - 20
271
+ - 16
272
+ - 4
273
+ - 4
274
+ rvc_upsample_rates:
275
+ - 10
276
+ - 8
277
+ - 2
278
+ - 2
279
+ rvc_use_f0: true
280
+ sample_rate: 32000
281
+ scheduler_max_total_steps: 200000
282
+ seed: 0
283
+ self_conditioning: false
284
+ separate_stopnet: false
285
+ singing_dim: 4
286
+ speaker_embed_size: 256
287
+ speech_cond_prompt_len: 250
288
+ speech_token_type: tortoise
289
+ speech_tokens_dict_size: 6563
290
+ speed_scale: 0.1
291
+ start_speech_token: 6561
292
+ start_text_token: 255
293
+ stepwise_sigmoid_noise: 2.0
294
+ stft_magnitude_min: 0.0001
295
+ stop_speech_token: 6562
296
+ stop_text_token: 0
297
+ stop_threshold: 0.25
298
+ style_embed_size: 256
299
+ supports_cfg: false
300
+ symbol_type: tortoise/data/gpt2_medium.json
301
+ syn_ar_f0_predict: true
302
+ syn_batch_frames: 16000
303
+ syn_batch_size: 32
304
+ syn_mel_scale: 1
305
+ syn_predict_f0: true
306
+ syn_sampler: binnedlength
307
+ syn_symmetric_mel: false
308
+ syn_train_max_frames: 700
309
+ syn_train_min_duration: 1
310
+ taco1_postnet: true
311
+ taco_decoder_att_rnn_dim: 1024
312
+ taco_decoder_prenet_dim: 256
313
+ taco_decoder_rnn_dim: 1024
314
+ taco_disjoint_conditioning: true
315
+ taco_encoder_dim: 512
316
+ taco_grad_clip: 1
317
+ taco_loss_masking: true
318
+ taco_lr: 0.0001
319
+ taco_weight_decay: 1.0e-06
320
+ target_loudness: -18
321
+ text_loss_weight: 0.1
322
+ text_preproc: none
323
+ text_tokens_dict_size: 50276
324
+ ti_vocoder: false
325
+ toucan_utt_emb_dim: 704
326
+ trim_silence: true
327
+ upsample_factors: !!python/tuple
328
+ - 5
329
+ - 8
330
+ - 8
331
+ upsample_rate: null
332
+ upsamplenet_dropout: false
333
+ upsamplenet_lr: 1.0e-05
334
+ use_adv_speaker_classifier: false
335
+ use_clap_embeds: false
336
+ use_diacritic: false
337
+ use_emotion_table: false
338
+ use_lamb_optimizer: false
339
+ use_language_table: false
340
+ use_monotonic_alignment: false
341
+ use_mpbert: false
342
+ use_one_cycle_lr: false
343
+ use_perceiver_resampler: false
344
+ use_pf: false
345
+ use_ph_durations: false
346
+ use_singing_labels: false
347
+ use_snr_labels: false
348
+ use_speaker_table: false
349
+ use_speech_codes_as_input: true
350
+ use_sv2tts: false
351
+ use_tb: false
352
+ use_tone: false
353
+ use_tpgst: false
354
+ use_wandb: false
355
+ vad_algo: webrtc
356
+ vad_margin: 0.1
357
+ validate_sr: true
358
+ validate_wav_len: true
359
+ vc_mel2f0: false
360
+ vc_soft_gt_pitch: false
361
+ vc_soft_units: true
362
+ ve_final_relu: false
363
+ ve_hidden_size: 768
364
+ ve_lr: 0.0001
365
+ ve_min_samples: 20
366
+ ve_partial_frames: 128
367
+ ve_spk_batch_size: 128
368
+ ve_utt_batch_size: 10
369
+ voc_future_horizon: 11
370
+ voc_lvc: false
371
+ voc_lvc_dims: 8
372
+ voc_noise_fir: true
373
+ voc_subscale: 0
374
+ voc_train_max_duration: 30
375
+ voc_train_min_duration: 1.5
376
+ voc_voiced_logits_scale: 0
377
+ vocoder_bsize: 16
378
+ vocoder_fc_dims: 512
379
+ vocoder_hidden_size: 512
380
+ vocoder_input_length: 16000
381
+ vocoder_input_pad: 0
382
+ vocoder_lr: 0.0001
383
+ vocoder_mode: MOL
384
+ wandb_watch_model: false
385
+ webrtc_mode: 2
386
+ weight_init: false
387
+ win_size: 2048