|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run-level settings.
# `device` and `sample_rate` are reused by `f0Compute` through `!ref`.
# NOTE(review): `seed` is not referenced anywhere else in this file;
# presumably it is read by the script that loads these hparams — confirm.
seed: 200
device: cpu
# NOTE(review): presumably in Hz (16 kHz audio) — confirm against the data.
sample_rate: 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Width of the final linear layer (`output_lin.n_neurons`).
output_neurons: 4
# NOTE(review): not referenced elsewhere in this file; presumably the CTC
# blank id used by the calling script — confirm.
blank_index: 0
|
|
|
|
|
|
|
|
# Number of prototypes; feeds both `proto.n_prototypes` and
# `output_lin.input_size`, so the two always agree.
n_prototypes: 10
|
|
|
|
|
|
|
|
# Last dimension of the features fed to `enc` (see `enc.input_shape`).
# NOTE(review): presumably the hidden size of the HuBERT encoder declared
# under `wav2vec2` — confirm if the `source` checkpoint changes.
emb_dim: 768
|
|
|
|
|
|
|
|
# GRU encoder sizing, consumed by `enc.num_layers` and `enc.hidden_size`.
rnn_layers: 2
rnn_neurons: 512
|
|
|
|
|
|
|
|
# Sizing of the `dec` feed-forward stack. `dnn_neurons` is also the last
# input dimension of `pitch_dec` and the `latent_dims` of `proto`.
dnn_blocks: 2
dnn_neurons: 512
|
|
|
|
|
|
|
|
# Sizing passed to `pitch_dec`. Both values are one-element lists, not
# plain integers, and are forwarded as lists.
dec_dnn_blocks: [1]
dec_dnn_neurons: [128]
|
|
|
|
|
|
|
|
# Activation handed to `dec`. Declared with `!name:` (not `!new:`), so the
# class itself is passed rather than an instance.
activation: !name:torch.nn.LeakyReLU
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Speech encoder built from SpeechBrain's HuBERT wrapper.
# The key is called `wav2vec2` although the class is HuBERT; the name is
# referenced by `modules` and `pretrainer`, so it is kept as is.
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: "Orange/SSA-HuBERT-base-60k"
    output_norm: True
    freeze: False
    # NOTE(review): presumably the local directory where the pretrained
    # checkpoint is stored — confirm against the HuBERT wrapper.
    save_path: whubert_checkpoint
|
|
|
|
|
|
|
|
# F0 extractor from the project-local `modules` package, configured with
# the top-level `device` and `sample_rate`.
f0Compute: !new:modules.F0Extractor
    device: !ref <device>
    sample_rate: !ref <sample_rate>
|
|
|
|
|
|
|
|
# Bidirectional GRU encoder over inputs whose last dimension is `emb_dim`.
enc: !new:speechbrain.nnet.RNN.GRU
    input_shape: [null, null, !ref <emb_dim>]
    hidden_size: !ref <rnn_neurons>
    num_layers: !ref <rnn_layers>
    bidirectional: True
    dropout: 0.15
|
|
|
|
|
|
|
|
# Feed-forward stack (VanillaNN) sized by `dnn_blocks` / `dnn_neurons`.
dec: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    # NOTE(review): 1024 is hard-coded. It looks like 2 * `rnn_neurons`
    # (512) coming from the bidirectional `enc`; if so it must be updated
    # by hand whenever `rnn_neurons` or `enc.bidirectional` changes — confirm.
    input_shape: [null, null, 1024]
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
|
|
|
|
|
|
|
|
# Pitch decoder from the project-local `modules` package. Its input's last
# dimension is `dnn_neurons`, the same value `dec` is built with.
pitch_dec: !new:modules.PitchDecoderLayer
    input_shape: [null, null, !ref <dnn_neurons>]
    dnn_blocks: !ref <dec_dnn_blocks>
    dnn_neurons: !ref <dec_dnn_neurons>
|
|
|
|
|
|
|
|
# Prototype layer from the project-local `modules` package:
# `n_prototypes` prototypes with `latent_dims` equal to `dnn_neurons`.
proto: !new:modules.PrototypeLayer
    n_prototypes: !ref <n_prototypes>
    latent_dims: !ref <dnn_neurons>
|
|
|
|
|
|
|
|
# Final linear layer: `n_prototypes` inputs -> `output_neurons` outputs,
# with a bias term.
output_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <n_prototypes>
    n_neurons: !ref <output_neurons>
    bias: True
|
|
|
|
|
|
|
|
# SpeechBrain Softmax activation configured with `apply_log: True`.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
|
|
|
|
|
|
|
|
# Label encoder, created with no constructor arguments. It is registered
# under the name `tokenizer` in `pretrainer.loadables`.
label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder
|
|
|
|
|
|
|
|
|
|
|
# Name -> object mapping of the network components declared in this file.
# `f0Compute`, `log_softmax` and `label_encoder` are not listed here.
modules:
    wav2vec2: !ref <wav2vec2>
    enc: !ref <enc>
    dec: !ref <dec>
    pitch_dec: !ref <pitch_dec>
    proto: !ref <proto>
    output_lin: !ref <output_lin>
|
|
|
|
|
|
|
|
# All components except `wav2vec2`, grouped into one ModuleList so that
# `pretrainer` can treat them as a single `model` loadable.
# The outer `-` makes the inner list the single positional argument.
model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <dec>, !ref <proto>, !ref <output_lin>, !ref <pitch_dec>]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pretrainer wiring: each name under `loadables` is paired with the
# checkpoint file of the same name under `paths`. All three files are
# looked up inside `save_folder`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        wav2vec2: !ref <wav2vec2>
        tokenizer: !ref <label_encoder>
    paths:
        model: !ref <save_folder>/model.ckpt
        wav2vec2: !ref <save_folder>/wav2vec2.ckpt
        tokenizer: !ref <save_folder>/tokenizer.ckpt
|
|
|
|
|
|
|
|
|
|
|
# Checkpoint directory used to build every entry of `pretrainer.paths`.
# NOTE(review): this key is declared after the `pretrainer` block that
# references it — confirm the loader in use resolves forward `!ref`s.
save_folder: ./CKPT+2025-10-20+08-19-07+00
|
|
|