sinarashidi committed on
Commit
0a10a75
·
verified ·
1 Parent(s): ce272ba

Create hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +99 -0
hyperparams.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pretrained_path: sinarashidi/30epoch
2
+ sample_rate: 16000
3
+
4
+ # URL for the HuggingFace model we want to load as encoder
5
+ wav2vec2_hub: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
6
+
7
+ # Outputs
8
+ vocab_size: 100
9
+ blank_index: 99
10
+ bos_index: 97
11
+ eos_index: 98
12
+ pad_index: 99
13
+ label_smoothing: 0.0
14
+
15
+ # Encoder
16
+ features_dim: 1024
17
+
18
+ # Length Regulator
19
+ enc_kernel_size: 3
20
+ enc_stride: 2
21
+
22
+ # Transformer decoder
23
+ embedding_size: 512
24
+ d_model: 512
25
+ nhead: 8
26
+ num_encoder_layers: 0
27
+ num_decoder_layers: 6
28
+ d_ffn: 2048
29
+ transformer_dropout: 0.1
30
+ activation: !name:torch.nn.GELU
31
+ output_neurons: !ref <vocab_size>
32
+ attention_type: "RelPosMHAXL"
33
+
34
+ # Decoding parameters
35
+ min_decode_ratio: 0.0
36
+ max_decode_ratio: 1.0
37
+
38
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
39
+ source: !ref <wav2vec2_hub>
40
+ output_norm: True
41
+ freeze: True
42
+ freeze_feature_extractor: True
43
 + apply_spec_augment: True
44
+ save_path: wav2vec2_checkpoints
45
+
46
+ length_regulator: !new:speechbrain.nnet.CNN.Conv1d
47
+ input_shape: [null, null, !ref <features_dim>]
48
+ out_channels: !ref <embedding_size>
49
+ kernel_size: !ref <enc_kernel_size>
50
+ stride: !ref <enc_stride>
51
+
52
+ transformer_decoder: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
53
+ input_size: !ref <embedding_size>
54
+ tgt_vocab: !ref <output_neurons>
55
+ d_model: !ref <d_model>
56
+ nhead: !ref <nhead>
57
+ num_encoder_layers: !ref <num_encoder_layers>
58
+ num_decoder_layers: !ref <num_decoder_layers>
59
+ d_ffn: !ref <d_ffn>
60
+ dropout: !ref <transformer_dropout>
61
+ activation: !ref <activation>
62
+ attention_type: !ref <attention_type>
63
+ normalize_before: True
64
+ causal: False
65
+
66
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
67
+ apply_log: True
68
+
69
+ seq_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ model: !new:torch.nn.ModuleList
74
+ - [!ref <length_regulator>, !ref <transformer_decoder>, !ref <seq_lin>]
75
+
76
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
77
+ wav2vec2: !ref <wav2vec2>
78
+ length_regulator: !ref <length_regulator>
79
+
80
+ decoder_beamsearch: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
81
+ modules: [!ref <transformer_decoder>, !ref <seq_lin>]
82
+ bos_index: !ref <bos_index>
83
+ eos_index: !ref <eos_index>
84
+ min_decode_ratio: !ref <min_decode_ratio>
85
+ max_decode_ratio: !ref <max_decode_ratio>
86
+ beam_size: 10
87
+ temperature: 1.0
88
+
89
+ modules:
90
+ encoder: !ref <encoder>
91
+ decoder: !ref <decoder_beamsearch>
92
+
93
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
94
+ loadables:
95
+ model: !ref <model>
96
+ wav2vec2: !ref <wav2vec2>
97
+ paths:
98
+ wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
99
+ model: !ref <pretrained_path>/model.ckpt