---
# Flat mapping of configuration values; keys are listed in ASCII-sorted order.
#
# NOTE(review): this copy was recovered from a web file-viewer capture. The
# viewer banner ("File size: 2,720 Bytes"), the token "0a454f4" (presumably
# the revision hash - confirm) and the line-number gutter were fused into the
# first key, and a stray " |" trailed the last value. They have been removed
# so that `acous_params` is a real key and `z_clamp` is a float again.

# Two three-element integer lists.
# NOTE(review): the second list (240, 1200, 160) matches hop_size, fft_size
# and audio_num_mel_bins below - presumably (hop, fft, mel bins); confirm
# against the code that reads this key.
acous_params:
- - 480
  - 1200
  - 80
- - 240
  - 1200
  - 160
amp: false
audio_num_mel_bins: 160
audio_sample_rate: 24000
balance_sil: true
c_spk_enc: 512
char_dict_size: 15000
conv_use_pos: false
dec0_dilations:
- 1
- 2
- 4
- 1
- 2
- 4
- 1
dec0_kernel_size: 3
# NOTE(review): five entries here while dec_layers below is 4 (enc_dilations
# has four entries and enc_layers is 4) - confirm this is intentional.
dec_dilations:
- 1
- 2
- 1
- 2
- 1
dec_ffn_kernel_size: 9
dec_hidden_size: 2048
dec_inp_add_noise: false
dec_kernel_size: 5
dec_layers: 4
dec_num_heads: 8
dec_post_net_kernel: 3
decoder_rnn_dim: 0
decoder_type: conv
dropout: 0.0
ds_add_pitch_embed: false
dur_alpha: 1.0
dur_code_size: 128
dur_context_enc: true
dur_log: true
dur_model_hidden_size: 512
dur_model_layers: 8
dur_model_type: ar_mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
dur_txt_hs: 512
dur_use_char: true
dur_use_spk: true
enc_dec_norm: ln
enc_dilations:
- 1
- 1
- 1
- 1
enc_ffn_kernel_size: 3
enc_hidden_size: 256
enc_kernel_size: 5
enc_layers: 4
enc_post_net_kernel: 3
enc_pre_ln: true
enc_prenet: true
encoder_K: 8
encoder_type: rel_fft
f0_max: 600
f0_min: 60
ffn_act: gelu
ffn_hidden_size: 1024
fft_size: 1200
fg_spk_enc_hidden: 256
flatten_dec: true
# Equals audio_sample_rate / 2 (24000 / 2).
fmax: 12000
fmin: 0
frames_multiple: 8
hidden_size: 512
hop_size: 240
ignore_begin_end_sil: false
lat_for_dur: false
latent_size: 256
layers_in_block: 2
# NOTE(review): nine sizes are listed here while ling_labels below has a
# single entry - confirm how the consumer pairs the two lists.
ling_label_dict_size:
- 20
- 4
- 5
- 2
- 3
- 3
- 3
- 6
- 15
ling_labels:
- tone
lm_num_layers: 24
lm_use_enc: true
loud_norm: false
max_tokens: 6000
mel_vmax: 0.5
mel_vmin: -6
min_frames: 0
mix_melout_timbre: true
mix_ph_timbre: false
model_type: 1
multistage: false
no_text_enc: false
num_heads: 2
out_wav_norm: true
pad_frames: false
precision: fp16
predict_pitch: false
predictor_dropout: 0.0
predictor_grad: 1.0
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 5
print_nan_grads: true
# Same value as audio_num_mel_bins above.
ref_mel_bins: 160
ref_size_max: 2000
ref_size_min: 1000
remove_sil: false
shuffle_ref: false
split_ref: true
temperature: 0.8
# Explicit empty string (not null).
tone_percep_ckpt: ''
train_spk_embed_only: false
use_bert_input: false
use_char: true
use_cur_global: false
use_cur_global_dec: true
use_dur_embed: true
use_dur_mask_embed: true
use_finegrained_spk: false
use_global_lat: false
use_gpt: true
use_gt_dur: false
use_gt_f0: false
use_mix_spk_embed: false
use_new_vae: false
use_ph_level_f0: false
use_ph_pos_embed: true
use_pitch_embed: false
use_pitch_embed_dec: false
use_pitch_pred: true
use_pos_embed: false
use_post_ln: false
use_random_spk_embed: false
use_rot_embed: true
use_spk_embed: false
use_spk_enc: false
use_spk_id: false
use_text_postnet: true
use_uv: true
use_vae: true
use_vqvae: true
use_word_encoder: true
use_word_input: false
vae_dur_grad: 0.1
vae_enc_hidden_size: 384
vae_word_conder_layers: 0
vq_stride: 8
w_nonsil: 10.0
w_sil: 1.0
word_dict_size: 10000
z_channels: 64
z_clamp: 2.0