# Hyper-parameters for the task class named in `task_cls`.
# NOTE(review): `base_config` presumably names a parent config whose values
# this file overrides -- confirm against the config loader.
base_config: configs/tts/base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task

# Model
hidden_size: 256
dropout: 0.1
encoder_type: fft
encoder_K: 8
decoder_type: fft
use_pos_embed: true

# Predictors
# NOTE(review): -1 looks like a sentinel rather than a real width -- confirm
# how the consumer resolves it.
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5

# Pitch and energy
use_pitch_embed: true
pitch_type: ph
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8

pitch_ar: false

pitch_loss: l1
pitch_norm: log
use_energy_embed: false

# Speaker conditioning and auxiliary encoders
# (grouping inferred from key names -- confirm against the model code)
use_spk_id: false
use_split_spk_id: false
use_spk_embed: false
use_var_enc: false
lambda_commit: 0.25
ref_norm_layer: bn
# One comma-separated triple per list entry; quoted so every loader returns
# them as strings. Judging by the key name the fields are presumably
# hidden,stride,kernel -- TODO confirm.
pitch_enc_hidden_stride_kernel:
  - '0,2,5'
  - '0,2,5'
  - '0,2,5'
dur_enc_hidden_stride_kernel:
  - '0,2,3'
  - '0,2,3'
  - '0,1,3'

# Mel loss
# Quoted because the value contains ':' and '|'. It reads like '|'-separated
# name:weight pairs -- confirm with the code that parses it.
mel_loss: 'l1:0.5|ssim:0.5'

# Loss weights
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1

# Training, validation and testing
pretrain_fs_ckpt: ''
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_eval_sentences: 1
max_updates: 120000
num_valid_plots: 5
num_test_samples: 0
test_ids: []
use_gt_dur: false
use_gt_f0: false

# Duration loss and normalization type
dur_loss: mse
norm_type: gn