update with spn_predictor
#1
by
pradnya-hf-dev - opened
- hyperparams.yaml +23 -4
hyperparams.yaml
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
# Model: Fastspeech2 for TTS
|
| 3 |
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
|
| 4 |
# ################################
|
|
|
|
| 5 |
# Input parameters
|
| 6 |
lexicon:
|
| 7 |
- AA
|
|
@@ -45,7 +46,7 @@ lexicon:
|
|
| 45 |
- ZH
|
| 46 |
- spn
|
| 47 |
|
| 48 |
-
n_symbols: 41 #fixed
|
| 49 |
padding_idx: 0
|
| 50 |
n_mel_channels: 80
|
| 51 |
|
|
@@ -73,18 +74,34 @@ postnet_kernel_size: 5
|
|
| 73 |
postnet_n_convolutions: 5
|
| 74 |
postnet_dropout: 0.5
|
| 75 |
|
| 76 |
-
#
|
| 77 |
normalize_before: True
|
| 78 |
ffn_type: 1dcnn #1dcnn or ffn
|
| 79 |
ffn_cnn_kernel_size_list: [9, 1]
|
| 80 |
|
| 81 |
-
#
|
| 82 |
dur_pred_kernel_size: 3
|
| 83 |
pitch_pred_kernel_size: 3
|
| 84 |
energy_pred_kernel_size: 3
|
| 85 |
variance_predictor_dropout: 0.5
|
| 86 |
|
| 87 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
| 89 |
enc_num_layers: !ref <enc_num_layers>
|
| 90 |
enc_num_head: !ref <enc_num_head>
|
|
@@ -119,8 +136,10 @@ model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
|
| 119 |
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
|
| 120 |
|
| 121 |
modules:
|
|
|
|
| 122 |
model: !ref <model>
|
| 123 |
|
| 124 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
| 125 |
loadables:
|
|
|
|
| 126 |
model: !ref <model>
|
|
|
|
| 2 |
# Model: Fastspeech2 for TTS
|
| 3 |
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
|
| 4 |
# ################################
|
| 5 |
+
|
| 6 |
# Input parameters
|
| 7 |
lexicon:
|
| 8 |
- AA
|
|
|
|
| 46 |
- ZH
|
| 47 |
- spn
|
| 48 |
|
| 49 |
+
n_symbols: 41 # fixed: number of symbols in the lexicon, +1 for a dummy symbol used for padding
|
| 50 |
padding_idx: 0
|
| 51 |
n_mel_channels: 80
|
| 52 |
|
|
|
|
| 74 |
postnet_n_convolutions: 5
|
| 75 |
postnet_dropout: 0.5
|
| 76 |
|
| 77 |
+
# common
|
| 78 |
normalize_before: True
|
| 79 |
ffn_type: 1dcnn #1dcnn or ffn
|
| 80 |
ffn_cnn_kernel_size_list: [9, 1]
|
| 81 |
|
| 82 |
+
# variance predictor
|
| 83 |
dur_pred_kernel_size: 3
|
| 84 |
pitch_pred_kernel_size: 3
|
| 85 |
energy_pred_kernel_size: 3
|
| 86 |
variance_predictor_dropout: 0.5
|
| 87 |
|
| 88 |
+
# SPN predictor
|
| 89 |
+
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
|
| 90 |
+
enc_num_layers: !ref <enc_num_layers>
|
| 91 |
+
enc_num_head: !ref <enc_num_head>
|
| 92 |
+
enc_d_model: !ref <enc_d_model>
|
| 93 |
+
enc_ffn_dim: !ref <enc_ffn_dim>
|
| 94 |
+
enc_k_dim: !ref <enc_k_dim>
|
| 95 |
+
enc_v_dim: !ref <enc_v_dim>
|
| 96 |
+
enc_dropout: !ref <enc_dropout>
|
| 97 |
+
normalize_before: !ref <normalize_before>
|
| 98 |
+
ffn_type: !ref <ffn_type>
|
| 99 |
+
ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
|
| 100 |
+
n_char: !ref <n_symbols>
|
| 101 |
+
padding_idx: !ref <padding_idx>
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# model
|
| 105 |
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
| 106 |
enc_num_layers: !ref <enc_num_layers>
|
| 107 |
enc_num_head: !ref <enc_num_head>
|
|
|
|
| 136 |
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
|
| 137 |
|
| 138 |
modules:
|
| 139 |
+
spn_predictor: !ref <spn_predictor>
|
| 140 |
model: !ref <model>
|
| 141 |
|
| 142 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
| 143 |
loadables:
|
| 144 |
+
spn_predictor: !ref <spn_predictor>
|
| 145 |
model: !ref <model>
|