# Hyperparameters for a FastSpeech2 model and its companion SPN predictor.
# Objects tagged with `!new:` are instantiated from the named SpeechBrain
# classes; `!ref <key>` reuses the value of another key in this file.
#
# NOTE(review): the nesting below was reconstructed from a table-formatted
# dump in which indentation had been lost. Keys, values and tags are
# unchanged; confirm the structure against the upstream file.

# ---------------------------------------------------------------------------
# Symbol inventory
# ---------------------------------------------------------------------------
# Looks like ARPAbet phoneme labels plus `spn` -- TODO confirm.
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - spn

# NOTE(review): the lexicon above lists 40 entries while n_symbols is 42.
# Presumably the text encoder adds extra symbols (e.g. padding / unknown)
# at load time -- verify against the code that populates `input_encoder`.
n_symbols: 42
padding_idx: 0
n_mel_channels: 80
sample_rate: 22050

# ---------------------------------------------------------------------------
# Encoder parameters
# ---------------------------------------------------------------------------
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2

# ---------------------------------------------------------------------------
# Decoder parameters
# ---------------------------------------------------------------------------
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2

# ---------------------------------------------------------------------------
# Postnet parameters
# ---------------------------------------------------------------------------
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# ---------------------------------------------------------------------------
# Settings shared by the encoder/decoder layers (passed to both objects below)
# ---------------------------------------------------------------------------
normalize_before: True
ffn_type: 1dcnn
ffn_cnn_kernel_size_list: [9, 1]

# ---------------------------------------------------------------------------
# Variance predictor parameters (duration, pitch, energy)
# ---------------------------------------------------------------------------
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# ---------------------------------------------------------------------------
# SPN predictor: built from the encoder settings only
# ---------------------------------------------------------------------------
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    padding_idx: !ref <padding_idx>

# ---------------------------------------------------------------------------
# FastSpeech2 model: encoder, decoder, postnet and variance predictors
# ---------------------------------------------------------------------------
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>

# ---------------------------------------------------------------------------
# Text encoder, module registry and pretrained-parameter loading
# ---------------------------------------------------------------------------
# Created with no arguments here; presumably populated from `lexicon` by the
# consuming code -- TODO confirm.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Both entries point at the objects defined above (same instances via !ref).
modules:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>

# The same two objects are registered as loadables of the Pretrainer.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        spn_predictor: !ref <spn_predictor>
        model: !ref <model>