datasetsANDmodels committed on
Commit
1182c80
·
verified ·
1 Parent(s): fd719ad

Upload 4 files

Browse files
Files changed (4) hide show
  1. hyperparams.yaml +188 -0
  2. model.ckpt +3 -0
  3. normalizer.ckpt +3 -0
  4. tokenizer.ckpt +3 -0
hyperparams.yaml ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: Streaming E2E Conformer-Transducer ASR
3
+ # Encoder: Conformer
4
+ # Decoder: LSTM + greedy search
5
+ # Tokens: BPE with unigram
6
+ # Losses: Transducer + CTC (optional) + CE (optional)
7
+ # Training: Librispeech 960h
8
+ # Authors: Sylvain de Langen 2023, Titouan Parcollet 2023
9
+ # ############################################################################
10
+
11
+ save_folder: !ref librispeech-streaming-conformer-transducer
12
+
13
+ # Feature parameters
14
+ sample_rate: 16000
15
+ n_fft: 512
16
+ n_mels: 80
17
+ win_length: 32
18
+
19
+ # Streaming
20
+ streaming: True # controls all Dynamic Chunk Training & chunk size & left context mechanisms
21
+
22
+ # Model parameters
23
+ # Transformer
24
+ d_model: 512
25
+ joint_dim: 640
26
+ nhead: 8
27
+ num_encoder_layers: 12
28
+ num_decoder_layers: 0
29
+ d_ffn: 2048
30
+ transformer_dropout: 0.1
31
+ activation: !name:torch.nn.GELU
32
+ output_neurons: 1000
33
+ dec_dim: 512
34
+ dec_emb_dropout: 0.2
35
+ dec_dropout: 0.1
36
+
37
+ # Decoding parameters
38
+ blank_index: 0
39
+ bos_index: 0
40
+ eos_index: 0
41
+ pad_index: 0
42
+ beam_size: 10
43
+ nbest: 1
44
+ # by default {state,expand}_beam = 2.3, as mentioned in the paper
45
+ # https://arxiv.org/abs/1904.02619
46
+ state_beam: 2.3
47
+ expand_beam: 2.3
48
+ lm_weight: 0.50
49
+
50
+ normalize: !new:speechbrain.processing.features.InputNormalization
51
+ norm_type: global
52
+ update_until_epoch: 4
53
+
54
+ compute_features: !new:speechbrain.lobes.features.Fbank
55
+ sample_rate: !ref <sample_rate>
56
+ n_fft: !ref <n_fft>
57
+ n_mels: !ref <n_mels>
58
+ win_length: !ref <win_length>
59
+
60
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
61
+ input_shape: (8, 10, 80)
62
+ num_blocks: 2
63
+ num_layers_per_block: 1
64
+ out_channels: (64, 32)
65
+ kernel_sizes: (3, 3)
66
+ strides: (2, 2)
67
+ residuals: (False, False)
68
+
69
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
70
+ input_size: 640
71
+ tgt_vocab: !ref <output_neurons>
72
+ d_model: !ref <d_model>
73
+ nhead: !ref <nhead>
74
+ num_encoder_layers: !ref <num_encoder_layers>
75
+ num_decoder_layers: !ref <num_decoder_layers>
76
+ d_ffn: !ref <d_ffn>
77
+ dropout: !ref <transformer_dropout>
78
+ activation: !ref <activation>
79
+ encoder_module: conformer
80
+ attention_type: RelPosMHAXL
81
+ normalize_before: True
82
+ causal: False
83
+
84
+ # We must call an encoder wrapper so the decoder isn't run (there is none: num_decoder_layers is 0)
85
+ enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
86
+ transformer: !ref <Transformer>
87
+
88
+ # For MTL CTC over the encoder
89
+ proj_ctc: !new:speechbrain.nnet.linear.Linear
90
+ input_size: !ref <joint_dim>
91
+ n_neurons: !ref <output_neurons>
92
+
93
+ # Define some projection layers to make sure that enc and dec
94
+ # output dim are the same before joining
95
+ proj_enc: !new:speechbrain.nnet.linear.Linear
96
+ input_size: !ref <d_model>
97
+ n_neurons: !ref <joint_dim>
98
+ bias: False
99
+
100
+ proj_dec: !new:speechbrain.nnet.linear.Linear
101
+ input_size: !ref <dec_dim>
102
+ n_neurons: !ref <joint_dim>
103
+ bias: False
104
+
105
+ emb: !new:speechbrain.nnet.embedding.Embedding
106
+ num_embeddings: !ref <output_neurons>
107
+ consider_as_one_hot: True
108
+ blank_id: !ref <blank_index>
109
+
110
+ dec: !new:speechbrain.nnet.RNN.LSTM
111
+ input_shape: [null, null, !ref <output_neurons> - 1]
112
+ hidden_size: !ref <dec_dim>
113
+ num_layers: 1
114
+ re_init: True
115
+
116
+ Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
117
+ joint: sum # joint [sum | concat]
118
+ nonlinearity: !ref <activation>
119
+
120
+ transducer_lin: !new:speechbrain.nnet.linear.Linear
121
+ input_size: !ref <joint_dim>
122
+ n_neurons: !ref <output_neurons>
123
+ bias: False
124
+
125
+ modules:
126
+ CNN: !ref <CNN>
127
+ enc: !ref <enc>
128
+ emb: !ref <emb>
129
+ dec: !ref <dec>
130
+ Tjoint: !ref <Tjoint>
131
+ transducer_lin: !ref <transducer_lin>
132
+ normalize: !ref <normalize>
133
+ proj_ctc: !ref <proj_ctc>
134
+ proj_dec: !ref <proj_dec>
135
+ proj_enc: !ref <proj_enc>
136
+
137
+ model: !new:torch.nn.ModuleList
138
+ - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]
139
+
140
+ # Tokenizer initialization
141
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
142
+
143
+ Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
144
+ decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
145
+ tjoint: !ref <Tjoint>
146
+ classifier_network: [!ref <transducer_lin>]
147
+ blank_id: !ref <blank_index>
148
+ beam_size: 1
149
+ nbest: 1
150
+
151
+ Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
152
+ decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
153
+ tjoint: !ref <Tjoint>
154
+ classifier_network: [!ref <transducer_lin>]
155
+ blank_id: !ref <blank_index>
156
+ beam_size: !ref <beam_size>
157
+ nbest: !ref <nbest>
158
+ # lm_module: !ref <lm_model>
159
+ # lm_weight: !ref <lm_weight>
160
+ state_beam: !ref <state_beam>
161
+ expand_beam: !ref <expand_beam>
162
+
163
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
164
+ collect_in: !ref <save_folder>
165
+ loadables:
166
+ model: !ref <model>
167
+ normalizer: !ref <normalize>
168
+ tokenizer: !ref <tokenizer>
169
+
170
+ # Inference-only entries: streaming contexts, decoding function, streaming feature extractor
171
+
172
+ make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
173
+ tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space
174
+
175
+ make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor
176
+ decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
177
+ - !ref <Greedysearcher> # self
178
+
179
+ fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
180
+ module: !new:speechbrain.nnet.containers.LengthsCapableSequential
181
+ - !ref <compute_features>
182
+ - !ref <normalize>
183
+ - !ref <CNN>
184
+ # don't consider normalization as part of the input filter chain.
185
+ # normalization will operate at chunk level, which mismatches training
186
+ # somewhat, but does not appear to result in noticeable degradation.
187
+ properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
188
+ - [!ref <compute_features>, !ref <CNN>]
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d7ae523e01d0ced372b1cc7b3c607659ca90b941989f1e2fadfb8fd22aa1e90
3
+ size 334293097
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee5191b7683876c706dac861ae68b44e5860faff8e17fd130ef821b9c1f2fe58
3
+ size 1779
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a6cba34cd520b33fd83612d5efc8ba7e351166541eb2726642bb3032234d31
3
+ size 253217