| # ############################################################################ | |
| # Model: SSL-based Transformer regression with a WavLM base model (microsoft/wavlm-large) | |
| # Authors: Artem Ploujnikov, Yingzhi Wang | |
| # ############################################################################ | |
| # Seed needs to be set at top of yaml, before objects with parameters are instantiated | |
| seed: 42 | |
| # !apply calls torch.manual_seed(<seed>) while this file is being loaded | |
| __set_seed: !apply:torch.manual_seed [!ref <seed>] | |
| # --- Training / data-loading settings (consumed by the training script, not shown here) --- | |
| train_regression_metric: True | |
| batch_size: 4 | |
| num_workers: 4 | |
| # Presumably sample rates in Hz, with audio resampled from src to tgt -- TODO confirm in the training script | |
| src_sample_rate: 24000 | |
| tgt_sample_rate: 16000 | |
| # NOTE(review): meaning of this flag is not visible in this file -- confirm against the training script | |
| contrastive: False | |
| lr: 0.00001 | |
| number_of_epochs: 10 | |
| ckpt_interval_minutes: 15 | |
| # --- Transformer settings, referenced by the `model` entry below --- | |
| # !name passes the LeakyReLU callable itself rather than an instantiated module | |
| activation: !name:torch.nn.LeakyReLU | |
| d_model: 512 | |
| d_ffn: 2048 | |
| num_layers: 3 | |
| nhead: 4 | |
| dropout: 0.5 | |
| # --- WavLM base model settings, referenced by the `wavlm` entry below --- | |
| # Presumably a Hugging Face model identifier -- verify | |
| wavlm_source: microsoft/wavlm-large | |
| # "." is the current working directory at run time | |
| wavlm_save_path: . | |
| # --- Dataset settings; presumably read by the data-preparation step -- verify against caller --- | |
| splits: ["train", "valid", "test"] | |
| subset: "full" | |
| skip_prep: False | |
| # Base model: !new instantiates the SpeechBrain WavLM wrapper with the three arguments below. | |
| # NOTE(review): source/output_norm/save_path appear without indentation in this view; | |
| # they must be nested under `wavlm` in the actual YAML -- confirm the file on disk. | |
| wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM | |
| source: !ref <wavlm_source> | |
| output_norm: True | |
| save_path: !ref <wavlm_save_path> | |
| # Regression model: !new instantiates TransformerRegression, passing the `wavlm` object as | |
| # base_model and the Transformer settings defined earlier in this file. | |
| # NOTE(review): the arguments below appear without indentation in this view; taken literally | |
| # they would duplicate the top-level keys (d_model, d_ffn, ...) and reference themselves, so | |
| # they must be nested under `model` in the actual YAML -- confirm the file on disk. | |
| model: !new:speechbrain.lobes.models.eval.ssl.TransformerRegression | |
| base_model: !ref <wavlm> | |
| d_model: !ref <d_model> | |
| d_ffn: !ref <d_ffn> | |
| num_layers: !ref <num_layers> | |
| nhead: !ref <nhead> | |
| dropout: !ref <dropout> | |
| activation: !ref <activation> | |
| # Mapping of module names to objects; the only entry is the regression model defined above. | |
| # NOTE(review): `model` must be nested under `modules` in the actual YAML -- indentation is not visible here. | |
| modules: | |
| model: !ref <model> | |
| # Pretrainer: !new instantiates SpeechBrain's Pretrainer with `model` as its only loadable. | |
| # NOTE(review): where the parameters are loaded from is not specified in this view -- | |
| # presumably supplied by the calling script; verify. `loadables`/`model` must be nested in the actual YAML. | |
| pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer | |
| loadables: | |
| model: !ref <model> | |