# ############################################################################
# Hyperparameters for a pipeline that combines a VAD model (CNN + GRU + DNN)
# with an ECAPA-TDNN embedding model, plus the Pretrainer that loads them.
# ############################################################################

# Feature parameters shared by the pipeline
sample_rate: 16000
time_resolution: 0.01
n_fft: 400
n_mels_vad: 40
batch_size: 512

# VAD model parameters
cnn1_channels: 16
cnn2_channels: 32
cnn_kernelsize: (3, 3)
rnn_layers: 2
rnn_neurons: 32
rnn_bidirectional: True
# NOTE(review): dnn_blocks is not referenced in this file; the `dnn` container
# below declares two DNN blocks explicitly.
dnn_blocks: 1
dnn_neurons: 16
output_neurons_vad: 1

# ECAPA-TDNN parameters
n_mels_ecapa: 80
# NOTE(review): out_neurons_ecapa is not referenced in this file.
out_neurons_ecapa: 7205
emb_dim: 192

dataloader_opts:
    batch_size: !ref <batch_size>

# ------------------------------- VAD branch --------------------------------
compute_fbank_vad: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels_vad>
    # time_resolution scaled by 1000 (presumably seconds -> milliseconds;
    # confirm the unit expected by Fbank.hop_length).
    hop_length: !ref <time_resolution> * 1000

mean_var_norm_vad: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence

cnn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <n_mels_vad>]
    norm1: !name:speechbrain.nnet.normalization.LayerNorm
    cnn1: !name:speechbrain.lobes.models.CRDNN.CNN_Block
        channels: !ref <cnn1_channels>
        kernel_size: !ref <cnn_kernelsize>
    cnn2: !name:speechbrain.lobes.models.CRDNN.CNN_Block
        channels: !ref <cnn2_channels>
        kernel_size: !ref <cnn_kernelsize>

rnn: !new:speechbrain.nnet.RNN.GRU
    # NOTE(review): 320 is hard-coded; presumably the flattened feature size
    # produced by `cnn` -- confirm if n_mels_vad or cnn2_channels change.
    input_shape: [null, null, 320]
    hidden_size: !ref <rnn_neurons>
    num_layers: !ref <rnn_layers>
    bidirectional: !ref <rnn_bidirectional>

dnn: !new:speechbrain.nnet.containers.Sequential
    # Twice rnn_neurons (presumably because the GRU is bidirectional; must be
    # revisited if rnn_bidirectional is set to False).
    input_shape: [null, null, !ref <rnn_neurons> * 2]
    dnn1: !name:speechbrain.lobes.models.CRDNN.DNN_Block
        neurons: !ref <dnn_neurons>
    dnn2: !name:speechbrain.lobes.models.CRDNN.DNN_Block
        neurons: !ref <dnn_neurons>
    lin: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <output_neurons_vad>
        bias: False

# ------------------------------ ECAPA branch -------------------------------
compute_fbank_ecapa: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels_ecapa>

mean_var_norm_ecapa: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <n_mels_ecapa>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    attention_channels: 128
    # Tied to emb_dim (192) so the embedding size is declared in one place.
    lin_neurons: !ref <emb_dim>

mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    std_norm: False

# The three VAD sub-networks grouped into one ModuleList; this is the object
# the pretrainer loads under the `vad` key.
vad: !new:torch.nn.ModuleList
    - [!ref <cnn>, !ref <rnn>, !ref <dnn>]

# Objects exposed under `modules`
modules:
    compute_fbank_vad: !ref <compute_fbank_vad>
    compute_fbank_ecapa: !ref <compute_fbank_ecapa>
    cnn: !ref <cnn>
    rnn: !ref <rnn>
    dnn: !ref <dnn>
    mean_var_norm_vad: !ref <mean_var_norm_vad>
    mean_var_norm_ecapa: !ref <mean_var_norm_ecapa>
    embedding_model: !ref <embedding_model>
    mean_var_norm_emb: !ref <mean_var_norm_emb>

# NOTE(review): mean_var_norm_ecapa is not listed in `loadables`; confirm that
# it needs no stored statistics.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        vad: !ref <vad>
        embedding_model: !ref <embedding_model>
        mean_var_norm_vad: !ref <mean_var_norm_vad>
        mean_var_norm_emb: !ref <mean_var_norm_emb>