jes3275 committed on
Commit
9ac1a2f
·
1 Parent(s): 7dda99b

Update consolidated README and meta.yaml with results

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. data_respin/bh/nlsyms.txt +3 -0
  2. data_respin/bn/nlsyms.txt +5 -0
  3. data_respin/ch/nlsyms.txt +4 -0
  4. data_respin/hi/nlsyms.txt +5 -0
  5. data_respin/kn/nlsyms.txt +5 -0
  6. data_respin/mg/nlsyms.txt +4 -0
  7. data_respin/mr/nlsyms.txt +4 -0
  8. data_respin/mt/nlsyms.txt +4 -0
  9. data_respin/te/nlsyms.txt +4 -0
  10. exp_small/exp_bh/README.md +405 -0
  11. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md +27 -0
  12. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml +304 -0
  13. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png +0 -0
  14. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png +0 -0
  15. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png +0 -0
  16. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png +0 -0
  17. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png +0 -0
  18. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png +0 -0
  19. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png +0 -0
  21. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png +0 -0
  22. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png +0 -0
  23. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png +0 -0
  24. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png +0 -0
  25. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png +0 -0
  26. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png +0 -0
  27. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png +0 -0
  28. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png +0 -0
  29. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png +0 -0
  30. exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +3 -0
  31. exp_small/exp_bh/meta.yaml +8 -0
  32. exp_small/exp_bn/README.md +399 -0
  33. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md +27 -0
  34. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml +298 -0
  35. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png +0 -0
  36. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png +0 -0
  37. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png +0 -0
  38. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png +0 -0
  39. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png +0 -0
  40. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png +0 -0
  41. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png +0 -0
  42. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png +0 -0
  43. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png +0 -0
  44. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png +0 -0
  45. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png +0 -0
  46. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png +0 -0
  47. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png +0 -0
  48. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png +0 -0
  49. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png +0 -0
  50. exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png +0 -0
data_respin/bh/nlsyms.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
data_respin/bn/nlsyms.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5
data_respin/ch/nlsyms.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
data_respin/hi/nlsyms.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5
data_respin/kn/nlsyms.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5
data_respin/mg/nlsyms.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
data_respin/mr/nlsyms.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
data_respin/mt/nlsyms.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
data_respin/te/nlsyms.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1
2
+ 2
3
+ 3
4
+ 4
exp_small/exp_bh/README.md ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ language: bh
7
+ datasets:
8
+ - respin_small
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ASR model
13
+
14
+ ### `SpireLab/spire_respin_baselines_espnet`
15
+
16
+ This model was trained by wtc7 using the respin_small recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+
26
+ pip install -e .
27
+ cd egs2/respin_small/asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Sun May 25 02:31:02 IST 2025`
35
+ - python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]`
36
+ - espnet version: `espnet 202412`
37
+ - pytorch version: `pytorch 2.3.0+cu121`
38
+ - Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c`
39
+ - Commit date: `Tue Jan 14 20:06:15 2025 -0500`
40
+
41
+ ## exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |decode_lid_asr_model_valid.acc.ave/test_bh|2220|22453|85.9|13.4|0.7|1.1|15.2|73.2|
47
+
48
+ ### CER
49
+
50
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
51
+ |---|---|---|---|---|---|---|---|---|
52
+ |decode_lid_asr_model_valid.acc.ave/test_bh|2220|104745|96.8|2.1|1.1|1.2|4.4|73.2|
53
+
54
+ ### TER
55
+
56
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
+ |---|---|---|---|---|---|---|---|---|
58
+
59
+ ## ASR config
60
+
61
+ <details><summary>expand</summary>
62
+
63
+ ```
64
+ config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml
65
+ print_config: false
66
+ log_level: INFO
67
+ drop_last_iter: false
68
+ dry_run: false
69
+ iterator_type: sequence
70
+ valid_iterator_type: null
71
+ output_dir: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
72
+ ngpu: 1
73
+ seed: 2022
74
+ num_workers: 8
75
+ num_att_plot: 3
76
+ dist_backend: nccl
77
+ dist_init_method: env://
78
+ dist_world_size: null
79
+ dist_rank: null
80
+ local_rank: 0
81
+ dist_master_addr: null
82
+ dist_master_port: null
83
+ dist_launcher: null
84
+ multiprocessing_distributed: false
85
+ unused_parameters: false
86
+ sharded_ddp: false
87
+ use_deepspeed: false
88
+ deepspeed_config: null
89
+ cudnn_enabled: true
90
+ cudnn_benchmark: false
91
+ cudnn_deterministic: true
92
+ use_tf32: false
93
+ collect_stats: false
94
+ write_collected_feats: false
95
+ max_epoch: 70
96
+ patience: 5
97
+ val_scheduler_criterion:
98
+ - valid
99
+ - loss
100
+ early_stopping_criterion:
101
+ - valid
102
+ - loss
103
+ - min
104
+ best_model_criterion:
105
+ - - valid
106
+ - acc
107
+ - max
108
+ keep_nbest_models: 5
109
+ nbest_averaging_interval: 0
110
+ grad_clip: 5.0
111
+ grad_clip_type: 2.0
112
+ grad_noise: false
113
+ accum_grad: 1
114
+ no_forward_run: false
115
+ resume: true
116
+ train_dtype: float32
117
+ use_amp: true
118
+ log_interval: null
119
+ use_matplotlib: true
120
+ use_tensorboard: true
121
+ create_graph_in_tensorboard: false
122
+ use_wandb: false
123
+ wandb_project: null
124
+ wandb_id: null
125
+ wandb_entity: null
126
+ wandb_name: null
127
+ wandb_model_log_interval: -1
128
+ detect_anomaly: false
129
+ use_adapter: false
130
+ adapter: lora
131
+ save_strategy: all
132
+ adapter_conf: {}
133
+ pretrain_path: null
134
+ init_param: []
135
+ ignore_init_mismatch: false
136
+ freeze_param: []
137
+ num_iters_per_epoch: null
138
+ batch_size: 20
139
+ valid_batch_size: null
140
+ batch_bins: 6000000
141
+ valid_batch_bins: null
142
+ category_sample_size: 10
143
+ train_shape_file:
144
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/speech_shape
145
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/text_shape.char
146
+ valid_shape_file:
147
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/speech_shape
148
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/text_shape.char
149
+ batch_type: numel
150
+ valid_batch_type: null
151
+ fold_length:
152
+ - 80000
153
+ - 150
154
+ sort_in_batch: descending
155
+ shuffle_within_batch: false
156
+ sort_batch: descending
157
+ multiple_iterator: false
158
+ chunk_length: 500
159
+ chunk_shift_ratio: 0.5
160
+ num_cache_chunks: 1024
161
+ chunk_excluded_key_prefixes: []
162
+ chunk_default_fs: null
163
+ chunk_max_abs_length: null
164
+ chunk_discard_short_samples: true
165
+ train_data_path_and_name_and_type:
166
+ - - dump/bh/raw/train_bh_sp/wav.scp
167
+ - speech
168
+ - sound
169
+ - - dump/bh/raw/train_bh_sp/text
170
+ - text
171
+ - text
172
+ valid_data_path_and_name_and_type:
173
+ - - dump/bh/raw/dev_bh/wav.scp
174
+ - speech
175
+ - sound
176
+ - - dump/bh/raw/dev_bh/text
177
+ - text
178
+ - text
179
+ multi_task_dataset: false
180
+ allow_variable_data_keys: false
181
+ max_cache_size: 0.0
182
+ max_cache_fd: 32
183
+ allow_multi_rates: false
184
+ valid_max_cache_size: null
185
+ exclude_weight_decay: false
186
+ exclude_weight_decay_conf: {}
187
+ optim: adam
188
+ optim_conf:
189
+ lr: 0.002
190
+ weight_decay: 1.0e-06
191
+ scheduler: warmuplr
192
+ scheduler_conf:
193
+ warmup_steps: 15000
194
+ token_list:
195
+ - <blank>
196
+ - <unk>
197
+ - <space>
198
+ - ा
199
+ - े
200
+ - क
201
+ - र
202
+ - ल
203
+ - स
204
+ - न
205
+ - म
206
+ - त
207
+ - ी
208
+ - ि
209
+ - ह
210
+ - ब
211
+ - ्
212
+ - प
213
+ - व
214
+ - ज
215
+ - ं
216
+ - ो
217
+ - द
218
+ - ख
219
+ - य
220
+ - ग
221
+ - ट
222
+ - ु
223
+ - अ
224
+ - ई
225
+ - इ
226
+ - च
227
+ - भ
228
+ - आ
229
+ - ू
230
+ - उ
231
+ - ए
232
+ - श
233
+ - ै
234
+ - ध
235
+ - ड
236
+ - फ
237
+ - ड़
238
+ - ौ
239
+ - .
240
+ - छ
241
+ - ण
242
+ - ष
243
+ - थ
244
+ - ओ
245
+ - ढ़
246
+ - घ
247
+ - ठ
248
+ - ॉ
249
+ - ृ
250
+ - ढ
251
+ - ऑ
252
+ - ँ
253
+ - ऊ
254
+ - ऋ
255
+ - औ
256
+ - झ
257
+ - ज़
258
+ - फ़
259
+ - ऐ
260
+ - ञ
261
+ - ऽ
262
+ - ख़
263
+ - क़
264
+ - ़
265
+ - ः
266
+ - ॅ
267
+ - ऱ
268
+ - <sos/eos>
269
+ init: null
270
+ input_size: null
271
+ ctc_conf:
272
+ dropout_rate: 0.0
273
+ ctc_type: builtin
274
+ reduce: true
275
+ ignore_nan_grad: null
276
+ zero_infinity: true
277
+ brctc_risk_strategy: exp
278
+ brctc_group_strategy: end
279
+ brctc_risk_factor: 0.0
280
+ joint_net_conf: null
281
+ use_preprocessor: true
282
+ use_lang_prompt: false
283
+ use_nlp_prompt: false
284
+ token_type: char
285
+ bpemodel: null
286
+ non_linguistic_symbols: null
287
+ cleaner: null
288
+ g2p: null
289
+ speech_volume_normalize: null
290
+ rir_scp: null
291
+ rir_apply_prob: 1.0
292
+ noise_scp: null
293
+ noise_apply_prob: 1.0
294
+ noise_db_range: '13_15'
295
+ short_noise_thres: 0.5
296
+ aux_ctc_tasks: []
297
+ frontend: default
298
+ frontend_conf:
299
+ n_fft: 512
300
+ win_length: 400
301
+ hop_length: 160
302
+ fs: 16k
303
+ specaug: specaug
304
+ specaug_conf:
305
+ apply_time_warp: true
306
+ time_warp_window: 5
307
+ time_warp_mode: bicubic
308
+ apply_freq_mask: true
309
+ freq_mask_width_range:
310
+ - 0
311
+ - 27
312
+ num_freq_mask: 2
313
+ apply_time_mask: true
314
+ time_mask_width_ratio_range:
315
+ - 0.0
316
+ - 0.05
317
+ num_time_mask: 5
318
+ normalize: utterance_mvn
319
+ normalize_conf: {}
320
+ model: espnet
321
+ model_conf:
322
+ ctc_weight: 0.3
323
+ lsm_weight: 0.1
324
+ length_normalized_loss: false
325
+ preencoder: null
326
+ preencoder_conf: {}
327
+ encoder: e_branchformer
328
+ encoder_conf:
329
+ output_size: 256
330
+ attention_heads: 4
331
+ attention_layer_type: rel_selfattn
332
+ pos_enc_layer_type: rel_pos
333
+ rel_pos_type: latest
334
+ cgmlp_linear_units: 1024
335
+ cgmlp_conv_kernel: 31
336
+ use_linear_after_conv: false
337
+ gate_activation: identity
338
+ num_blocks: 8
339
+ dropout_rate: 0.1
340
+ positional_dropout_rate: 0.1
341
+ attention_dropout_rate: 0.1
342
+ input_layer: conv2d2
343
+ layer_drop_rate: 0.0
344
+ linear_units: 1024
345
+ positionwise_layer_type: linear
346
+ use_ffn: true
347
+ macaron_ffn: true
348
+ merge_conv_kernel: 31
349
+ postencoder: null
350
+ postencoder_conf: {}
351
+ decoder: transformer
352
+ decoder_conf:
353
+ attention_heads: 4
354
+ linear_units: 2048
355
+ num_blocks: 6
356
+ dropout_rate: 0.1
357
+ positional_dropout_rate: 0.1
358
+ self_attention_dropout_rate: 0.1
359
+ src_attention_dropout_rate: 0.1
360
+ layer_drop_rate: 0.0
361
+ preprocessor: default
362
+ preprocessor_conf: {}
363
+ required:
364
+ - output_dir
365
+ - token_list
366
+ version: '202409'
367
+ distributed: false
368
+ ```
369
+
370
+ </details>
371
+
372
+
373
+
374
+ ### Citing ESPnet
375
+
376
+ ```bibtex
377
+ @inproceedings{watanabe2018espnet,
378
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
379
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
380
+ year={2018},
381
+ booktitle={Proceedings of Interspeech},
382
+ pages={2207--2211},
383
+ doi={10.21437/Interspeech.2018-1456},
384
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
385
+ }
386
+
387
+
388
+
389
+
390
+
391
+
392
+ ```
393
+
394
+ or arXiv:
395
+
396
+ ```bibtex
397
+ @misc{watanabe2018espnet,
398
+ title={ESPnet: End-to-End Speech Processing Toolkit},
399
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
400
+ year={2018},
401
+ eprint={1804.00015},
402
+ archivePrefix={arXiv},
403
+ primaryClass={cs.CL}
404
+ }
405
+ ```
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sun May 25 02:31:02 IST 2025`
5
+ - python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.3.0+cu121`
8
+ - Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c`
9
+ - Commit date: `Tue Jan 14 20:06:15 2025 -0500`
10
+
11
+ ## exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |decode_lid_asr_model_valid.acc.ave/test_bh|2220|22453|85.9|13.4|0.7|1.1|15.2|73.2|
17
+
18
+ ### CER
19
+
20
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
21
+ |---|---|---|---|---|---|---|---|---|
22
+ |decode_lid_asr_model_valid.acc.ave/test_bh|2220|104745|96.8|2.1|1.1|1.2|4.4|73.2|
23
+
24
+ ### TER
25
+
26
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
27
+ |---|---|---|---|---|---|---|---|---|
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
9
+ ngpu: 1
10
+ seed: 2022
11
+ num_workers: 8
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: true
29
+ use_tf32: false
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 70
33
+ patience: 5
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - acc
44
+ - max
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 5.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: true
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_adapter: false
67
+ adapter: lora
68
+ save_strategy: all
69
+ adapter_conf: {}
70
+ pretrain_path: null
71
+ init_param: []
72
+ ignore_init_mismatch: false
73
+ freeze_param: []
74
+ num_iters_per_epoch: null
75
+ batch_size: 20
76
+ valid_batch_size: null
77
+ batch_bins: 6000000
78
+ valid_batch_bins: null
79
+ category_sample_size: 10
80
+ train_shape_file:
81
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/speech_shape
82
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/text_shape.char
83
+ valid_shape_file:
84
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/speech_shape
85
+ - exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/text_shape.char
86
+ batch_type: numel
87
+ valid_batch_type: null
88
+ fold_length:
89
+ - 80000
90
+ - 150
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 500
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ chunk_max_abs_length: null
101
+ chunk_discard_short_samples: true
102
+ train_data_path_and_name_and_type:
103
+ - - dump/bh/raw/train_bh_sp/wav.scp
104
+ - speech
105
+ - sound
106
+ - - dump/bh/raw/train_bh_sp/text
107
+ - text
108
+ - text
109
+ valid_data_path_and_name_and_type:
110
+ - - dump/bh/raw/dev_bh/wav.scp
111
+ - speech
112
+ - sound
113
+ - - dump/bh/raw/dev_bh/text
114
+ - text
115
+ - text
116
+ multi_task_dataset: false
117
+ allow_variable_data_keys: false
118
+ max_cache_size: 0.0
119
+ max_cache_fd: 32
120
+ allow_multi_rates: false
121
+ valid_max_cache_size: null
122
+ exclude_weight_decay: false
123
+ exclude_weight_decay_conf: {}
124
+ optim: adam
125
+ optim_conf:
126
+ lr: 0.002
127
+ weight_decay: 1.0e-06
128
+ scheduler: warmuplr
129
+ scheduler_conf:
130
+ warmup_steps: 15000
131
+ token_list:
132
+ - <blank>
133
+ - <unk>
134
+ - <space>
135
+ - ा
136
+ - े
137
+ - क
138
+ - र
139
+ - ल
140
+ - स
141
+ - न
142
+ - म
143
+ - त
144
+ - ी
145
+ - ि
146
+ - ह
147
+ - ब
148
+ - ्
149
+ - प
150
+ - व
151
+ - ज
152
+ - ं
153
+ - ो
154
+ - द
155
+ - ख
156
+ - य
157
+ - ग
158
+ - ट
159
+ - ु
160
+ - अ
161
+ - ई
162
+ - इ
163
+ - च
164
+ - भ
165
+ - आ
166
+ - ू
167
+ - उ
168
+ - ए
169
+ - श
170
+ - ै
171
+ - ध
172
+ - ड
173
+ - फ
174
+ - ड़
175
+ - ौ
176
+ - .
177
+ - छ
178
+ - ण
179
+ - ष
180
+ - थ
181
+ - ओ
182
+ - ढ़
183
+ - घ
184
+ - ठ
185
+ - ॉ
186
+ - ृ
187
+ - ढ
188
+ - ऑ
189
+ - ँ
190
+ - ऊ
191
+ - ऋ
192
+ - औ
193
+ - झ
194
+ - ज़
195
+ - फ़
196
+ - ऐ
197
+ - ञ
198
+ - ऽ
199
+ - ख़
200
+ - क़
201
+ - ़
202
+ - ः
203
+ - ॅ
204
+ - ऱ
205
+ - <sos/eos>
206
+ init: null
207
+ input_size: null
208
+ ctc_conf:
209
+ dropout_rate: 0.0
210
+ ctc_type: builtin
211
+ reduce: true
212
+ ignore_nan_grad: null
213
+ zero_infinity: true
214
+ brctc_risk_strategy: exp
215
+ brctc_group_strategy: end
216
+ brctc_risk_factor: 0.0
217
+ joint_net_conf: null
218
+ use_preprocessor: true
219
+ use_lang_prompt: false
220
+ use_nlp_prompt: false
221
+ token_type: char
222
+ bpemodel: null
223
+ non_linguistic_symbols: null
224
+ cleaner: null
225
+ g2p: null
226
+ speech_volume_normalize: null
227
+ rir_scp: null
228
+ rir_apply_prob: 1.0
229
+ noise_scp: null
230
+ noise_apply_prob: 1.0
231
+ noise_db_range: '13_15'
232
+ short_noise_thres: 0.5
233
+ aux_ctc_tasks: []
234
+ frontend: default
235
+ frontend_conf:
236
+ n_fft: 512
237
+ win_length: 400
238
+ hop_length: 160
239
+ fs: 16k
240
+ specaug: specaug
241
+ specaug_conf:
242
+ apply_time_warp: true
243
+ time_warp_window: 5
244
+ time_warp_mode: bicubic
245
+ apply_freq_mask: true
246
+ freq_mask_width_range:
247
+ - 0
248
+ - 27
249
+ num_freq_mask: 2
250
+ apply_time_mask: true
251
+ time_mask_width_ratio_range:
252
+ - 0.0
253
+ - 0.05
254
+ num_time_mask: 5
255
+ normalize: utterance_mvn
256
+ normalize_conf: {}
257
+ model: espnet
258
+ model_conf:
259
+ ctc_weight: 0.3
260
+ lsm_weight: 0.1
261
+ length_normalized_loss: false
262
+ preencoder: null
263
+ preencoder_conf: {}
264
+ encoder: e_branchformer
265
+ encoder_conf:
266
+ output_size: 256
267
+ attention_heads: 4
268
+ attention_layer_type: rel_selfattn
269
+ pos_enc_layer_type: rel_pos
270
+ rel_pos_type: latest
271
+ cgmlp_linear_units: 1024
272
+ cgmlp_conv_kernel: 31
273
+ use_linear_after_conv: false
274
+ gate_activation: identity
275
+ num_blocks: 8
276
+ dropout_rate: 0.1
277
+ positional_dropout_rate: 0.1
278
+ attention_dropout_rate: 0.1
279
+ input_layer: conv2d2
280
+ layer_drop_rate: 0.0
281
+ linear_units: 1024
282
+ positionwise_layer_type: linear
283
+ use_ffn: true
284
+ macaron_ffn: true
285
+ merge_conv_kernel: 31
286
+ postencoder: null
287
+ postencoder_conf: {}
288
+ decoder: transformer
289
+ decoder_conf:
290
+ attention_heads: 4
291
+ linear_units: 2048
292
+ num_blocks: 6
293
+ dropout_rate: 0.1
294
+ positional_dropout_rate: 0.1
295
+ self_attention_dropout_rate: 0.1
296
+ src_attention_dropout_rate: 0.1
297
+ layer_drop_rate: 0.0
298
+ preprocessor: default
299
+ preprocessor_conf: {}
300
+ required:
301
+ - output_dir
302
+ - token_list
303
+ version: '202409'
304
+ distributed: false
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png ADDED
exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd89d459111cb4a487d4809ee4278936f8a8eeec09385890a3a4ac45df5eabe
3
+ size 112628010
exp_small/exp_bh/meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202412'
2
+ files:
3
+ asr_model_file: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth
4
+ python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]"
5
+ timestamp: 1748120464.687666
6
+ torch: 2.3.0+cu121
7
+ yaml_files:
8
+ asr_train_config: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml
exp_small/exp_bn/README.md ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ language: bn
7
+ datasets:
8
+ - respin_small
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ASR model
13
+
14
+ ### `SpireLab/spire_respin_baselines_espnet`
15
+
16
+ This model was trained by wtc7 using the respin_small recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+
26
+ pip install -e .
27
+ cd egs2/respin_small/asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Sun May 25 02:31:23 IST 2025`
35
+ - python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]`
36
+ - espnet version: `espnet 202412`
37
+ - pytorch version: `pytorch 2.3.0+cu121`
38
+ - Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c`
39
+ - Commit date: `Tue Jan 14 20:06:15 2025 -0500`
40
+
41
+ ## exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |decode_lid_asr_model_valid.acc.ave/test_bn|2174|20534|86.3|12.6|1.1|1.2|15.0|65.7|
47
+
48
+ ### CER
49
+
50
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
51
+ |---|---|---|---|---|---|---|---|---|
52
+ |decode_lid_asr_model_valid.acc.ave/test_bn|2174|114101|97.1|1.6|1.2|1.2|4.1|65.7|
53
+
54
+ ### TER
55
+
56
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
+ |---|---|---|---|---|---|---|---|---|
58
+
59
+ ## ASR config
60
+
61
+ <details><summary>expand</summary>
62
+
63
+ ```
64
+ config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml
65
+ print_config: false
66
+ log_level: INFO
67
+ drop_last_iter: false
68
+ dry_run: false
69
+ iterator_type: sequence
70
+ valid_iterator_type: null
71
+ output_dir: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
72
+ ngpu: 1
73
+ seed: 2022
74
+ num_workers: 8
75
+ num_att_plot: 3
76
+ dist_backend: nccl
77
+ dist_init_method: env://
78
+ dist_world_size: null
79
+ dist_rank: null
80
+ local_rank: 0
81
+ dist_master_addr: null
82
+ dist_master_port: null
83
+ dist_launcher: null
84
+ multiprocessing_distributed: false
85
+ unused_parameters: false
86
+ sharded_ddp: false
87
+ use_deepspeed: false
88
+ deepspeed_config: null
89
+ cudnn_enabled: true
90
+ cudnn_benchmark: false
91
+ cudnn_deterministic: true
92
+ use_tf32: false
93
+ collect_stats: false
94
+ write_collected_feats: false
95
+ max_epoch: 70
96
+ patience: 5
97
+ val_scheduler_criterion:
98
+ - valid
99
+ - loss
100
+ early_stopping_criterion:
101
+ - valid
102
+ - loss
103
+ - min
104
+ best_model_criterion:
105
+ - - valid
106
+ - acc
107
+ - max
108
+ keep_nbest_models: 5
109
+ nbest_averaging_interval: 0
110
+ grad_clip: 5.0
111
+ grad_clip_type: 2.0
112
+ grad_noise: false
113
+ accum_grad: 1
114
+ no_forward_run: false
115
+ resume: true
116
+ train_dtype: float32
117
+ use_amp: true
118
+ log_interval: null
119
+ use_matplotlib: true
120
+ use_tensorboard: true
121
+ create_graph_in_tensorboard: false
122
+ use_wandb: false
123
+ wandb_project: null
124
+ wandb_id: null
125
+ wandb_entity: null
126
+ wandb_name: null
127
+ wandb_model_log_interval: -1
128
+ detect_anomaly: false
129
+ use_adapter: false
130
+ adapter: lora
131
+ save_strategy: all
132
+ adapter_conf: {}
133
+ pretrain_path: null
134
+ init_param: []
135
+ ignore_init_mismatch: false
136
+ freeze_param: []
137
+ num_iters_per_epoch: null
138
+ batch_size: 20
139
+ valid_batch_size: null
140
+ batch_bins: 6000000
141
+ valid_batch_bins: null
142
+ category_sample_size: 10
143
+ train_shape_file:
144
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/speech_shape
145
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/text_shape.char
146
+ valid_shape_file:
147
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/speech_shape
148
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/text_shape.char
149
+ batch_type: numel
150
+ valid_batch_type: null
151
+ fold_length:
152
+ - 80000
153
+ - 150
154
+ sort_in_batch: descending
155
+ shuffle_within_batch: false
156
+ sort_batch: descending
157
+ multiple_iterator: false
158
+ chunk_length: 500
159
+ chunk_shift_ratio: 0.5
160
+ num_cache_chunks: 1024
161
+ chunk_excluded_key_prefixes: []
162
+ chunk_default_fs: null
163
+ chunk_max_abs_length: null
164
+ chunk_discard_short_samples: true
165
+ train_data_path_and_name_and_type:
166
+ - - dump/bn/raw/train_bn_sp/wav.scp
167
+ - speech
168
+ - sound
169
+ - - dump/bn/raw/train_bn_sp/text
170
+ - text
171
+ - text
172
+ valid_data_path_and_name_and_type:
173
+ - - dump/bn/raw/dev_bn/wav.scp
174
+ - speech
175
+ - sound
176
+ - - dump/bn/raw/dev_bn/text
177
+ - text
178
+ - text
179
+ multi_task_dataset: false
180
+ allow_variable_data_keys: false
181
+ max_cache_size: 0.0
182
+ max_cache_fd: 32
183
+ allow_multi_rates: false
184
+ valid_max_cache_size: null
185
+ exclude_weight_decay: false
186
+ exclude_weight_decay_conf: {}
187
+ optim: adam
188
+ optim_conf:
189
+ lr: 0.002
190
+ weight_decay: 1.0e-06
191
+ scheduler: warmuplr
192
+ scheduler_conf:
193
+ warmup_steps: 15000
194
+ token_list:
195
+ - <blank>
196
+ - <unk>
197
+ - <space>
198
+ - া
199
+ - ে
200
+ - র
201
+ - ক
202
+ - ্
203
+ - ি
204
+ - ন
205
+ - ব
206
+ - ল
207
+ - য
208
+ - ম
209
+ - স
210
+ - ত
211
+ - প
212
+ - ট
213
+ - য়
214
+ - হ
215
+ - ু
216
+ - দ
217
+ - ো
218
+ - জ
219
+ - ই
220
+ - গ
221
+ - চ
222
+ - ছ
223
+ - শ
224
+ - আ
225
+ - থ
226
+ - ভ
227
+ - এ
228
+ - ষ
229
+ - ধ
230
+ - ী
231
+ - উ
232
+ - ফ
233
+ - খ
234
+ - ড
235
+ - অ
236
+ - ং
237
+ - ও
238
+ - ড়
239
+ - ণ
240
+ - ঙ
241
+ - ঁ
242
+ - ৃ
243
+ - .
244
+ - ঠ
245
+ - ৈ
246
+ - ূ
247
+ - ৎ
248
+ - ঞ
249
+ - ঘ
250
+ - ঋ
251
+ - ঝ
252
+ - ৌ
253
+ - ঢ
254
+ - ়
255
+ - ঢ়
256
+ - ঃ
257
+ - ঊ
258
+ - ঐ
259
+ - ঔ
260
+ - ঈ
261
+ - ৠ
262
+ - <sos/eos>
263
+ init: null
264
+ input_size: null
265
+ ctc_conf:
266
+ dropout_rate: 0.0
267
+ ctc_type: builtin
268
+ reduce: true
269
+ ignore_nan_grad: null
270
+ zero_infinity: true
271
+ brctc_risk_strategy: exp
272
+ brctc_group_strategy: end
273
+ brctc_risk_factor: 0.0
274
+ joint_net_conf: null
275
+ use_preprocessor: true
276
+ use_lang_prompt: false
277
+ use_nlp_prompt: false
278
+ token_type: char
279
+ bpemodel: null
280
+ non_linguistic_symbols: null
281
+ cleaner: null
282
+ g2p: null
283
+ speech_volume_normalize: null
284
+ rir_scp: null
285
+ rir_apply_prob: 1.0
286
+ noise_scp: null
287
+ noise_apply_prob: 1.0
288
+ noise_db_range: '13_15'
289
+ short_noise_thres: 0.5
290
+ aux_ctc_tasks: []
291
+ frontend: default
292
+ frontend_conf:
293
+ n_fft: 512
294
+ win_length: 400
295
+ hop_length: 160
296
+ fs: 16k
297
+ specaug: specaug
298
+ specaug_conf:
299
+ apply_time_warp: true
300
+ time_warp_window: 5
301
+ time_warp_mode: bicubic
302
+ apply_freq_mask: true
303
+ freq_mask_width_range:
304
+ - 0
305
+ - 27
306
+ num_freq_mask: 2
307
+ apply_time_mask: true
308
+ time_mask_width_ratio_range:
309
+ - 0.0
310
+ - 0.05
311
+ num_time_mask: 5
312
+ normalize: utterance_mvn
313
+ normalize_conf: {}
314
+ model: espnet
315
+ model_conf:
316
+ ctc_weight: 0.3
317
+ lsm_weight: 0.1
318
+ length_normalized_loss: false
319
+ preencoder: null
320
+ preencoder_conf: {}
321
+ encoder: e_branchformer
322
+ encoder_conf:
323
+ output_size: 256
324
+ attention_heads: 4
325
+ attention_layer_type: rel_selfattn
326
+ pos_enc_layer_type: rel_pos
327
+ rel_pos_type: latest
328
+ cgmlp_linear_units: 1024
329
+ cgmlp_conv_kernel: 31
330
+ use_linear_after_conv: false
331
+ gate_activation: identity
332
+ num_blocks: 8
333
+ dropout_rate: 0.1
334
+ positional_dropout_rate: 0.1
335
+ attention_dropout_rate: 0.1
336
+ input_layer: conv2d2
337
+ layer_drop_rate: 0.0
338
+ linear_units: 1024
339
+ positionwise_layer_type: linear
340
+ use_ffn: true
341
+ macaron_ffn: true
342
+ merge_conv_kernel: 31
343
+ postencoder: null
344
+ postencoder_conf: {}
345
+ decoder: transformer
346
+ decoder_conf:
347
+ attention_heads: 4
348
+ linear_units: 2048
349
+ num_blocks: 6
350
+ dropout_rate: 0.1
351
+ positional_dropout_rate: 0.1
352
+ self_attention_dropout_rate: 0.1
353
+ src_attention_dropout_rate: 0.1
354
+ layer_drop_rate: 0.0
355
+ preprocessor: default
356
+ preprocessor_conf: {}
357
+ required:
358
+ - output_dir
359
+ - token_list
360
+ version: '202409'
361
+ distributed: false
362
+ ```
363
+
364
+ </details>
365
+
366
+
367
+
368
+ ### Citing ESPnet
369
+
370
+ ```bibtex
371
+ @inproceedings{watanabe2018espnet,
372
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
373
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
374
+ year={2018},
375
+ booktitle={Proceedings of Interspeech},
376
+ pages={2207--2211},
377
+ doi={10.21437/Interspeech.2018-1456},
378
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
379
+ }
380
+
381
+
382
+
383
+
384
+
385
+
386
+ ```
387
+
388
+ or arXiv:
389
+
390
+ ```bibtex
391
+ @misc{watanabe2018espnet,
392
+ title={ESPnet: End-to-End Speech Processing Toolkit},
393
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
394
+ year={2018},
395
+ eprint={1804.00015},
396
+ archivePrefix={arXiv},
397
+ primaryClass={cs.CL}
398
+ }
399
+ ```
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sun May 25 02:31:23 IST 2025`
5
+ - python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.3.0+cu121`
8
+ - Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c`
9
+ - Commit date: `Tue Jan 14 20:06:15 2025 -0500`
10
+
11
+ ## exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |decode_lid_asr_model_valid.acc.ave/test_bn|2174|20534|86.3|12.6|1.1|1.2|15.0|65.7|
17
+
18
+ ### CER
19
+
20
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
21
+ |---|---|---|---|---|---|---|---|---|
22
+ |decode_lid_asr_model_valid.acc.ave/test_bn|2174|114101|97.1|1.6|1.2|1.2|4.1|65.7|
23
+
24
+ ### TER
25
+
26
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
27
+ |---|---|---|---|---|---|---|---|---|
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1
9
+ ngpu: 1
10
+ seed: 2022
11
+ num_workers: 8
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: true
29
+ use_tf32: false
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 70
33
+ patience: 5
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - acc
44
+ - max
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 5.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: true
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_adapter: false
67
+ adapter: lora
68
+ save_strategy: all
69
+ adapter_conf: {}
70
+ pretrain_path: null
71
+ init_param: []
72
+ ignore_init_mismatch: false
73
+ freeze_param: []
74
+ num_iters_per_epoch: null
75
+ batch_size: 20
76
+ valid_batch_size: null
77
+ batch_bins: 6000000
78
+ valid_batch_bins: null
79
+ category_sample_size: 10
80
+ train_shape_file:
81
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/speech_shape
82
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/text_shape.char
83
+ valid_shape_file:
84
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/speech_shape
85
+ - exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/text_shape.char
86
+ batch_type: numel
87
+ valid_batch_type: null
88
+ fold_length:
89
+ - 80000
90
+ - 150
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 500
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ chunk_max_abs_length: null
101
+ chunk_discard_short_samples: true
102
+ train_data_path_and_name_and_type:
103
+ - - dump/bn/raw/train_bn_sp/wav.scp
104
+ - speech
105
+ - sound
106
+ - - dump/bn/raw/train_bn_sp/text
107
+ - text
108
+ - text
109
+ valid_data_path_and_name_and_type:
110
+ - - dump/bn/raw/dev_bn/wav.scp
111
+ - speech
112
+ - sound
113
+ - - dump/bn/raw/dev_bn/text
114
+ - text
115
+ - text
116
+ multi_task_dataset: false
117
+ allow_variable_data_keys: false
118
+ max_cache_size: 0.0
119
+ max_cache_fd: 32
120
+ allow_multi_rates: false
121
+ valid_max_cache_size: null
122
+ exclude_weight_decay: false
123
+ exclude_weight_decay_conf: {}
124
+ optim: adam
125
+ optim_conf:
126
+ lr: 0.002
127
+ weight_decay: 1.0e-06
128
+ scheduler: warmuplr
129
+ scheduler_conf:
130
+ warmup_steps: 15000
131
+ token_list:
132
+ - <blank>
133
+ - <unk>
134
+ - <space>
135
+ - া
136
+ - ে
137
+ - র
138
+ - ক
139
+ - ্
140
+ - ি
141
+ - ন
142
+ - ব
143
+ - ল
144
+ - য
145
+ - ম
146
+ - স
147
+ - ত
148
+ - প
149
+ - ট
150
+ - য়
151
+ - হ
152
+ - ু
153
+ - দ
154
+ - ো
155
+ - জ
156
+ - ই
157
+ - গ
158
+ - চ
159
+ - ছ
160
+ - শ
161
+ - আ
162
+ - থ
163
+ - ভ
164
+ - এ
165
+ - ষ
166
+ - ধ
167
+ - ী
168
+ - উ
169
+ - ফ
170
+ - খ
171
+ - ড
172
+ - অ
173
+ - ং
174
+ - ও
175
+ - ড়
176
+ - ণ
177
+ - ঙ
178
+ - ঁ
179
+ - ৃ
180
+ - .
181
+ - ঠ
182
+ - ৈ
183
+ - ূ
184
+ - ৎ
185
+ - ঞ
186
+ - ঘ
187
+ - ঋ
188
+ - ঝ
189
+ - ৌ
190
+ - ঢ
191
+ - ়
192
+ - ঢ়
193
+ - ঃ
194
+ - ঊ
195
+ - ঐ
196
+ - ঔ
197
+ - ঈ
198
+ - ৠ
199
+ - <sos/eos>
200
+ init: null
201
+ input_size: null
202
+ ctc_conf:
203
+ dropout_rate: 0.0
204
+ ctc_type: builtin
205
+ reduce: true
206
+ ignore_nan_grad: null
207
+ zero_infinity: true
208
+ brctc_risk_strategy: exp
209
+ brctc_group_strategy: end
210
+ brctc_risk_factor: 0.0
211
+ joint_net_conf: null
212
+ use_preprocessor: true
213
+ use_lang_prompt: false
214
+ use_nlp_prompt: false
215
+ token_type: char
216
+ bpemodel: null
217
+ non_linguistic_symbols: null
218
+ cleaner: null
219
+ g2p: null
220
+ speech_volume_normalize: null
221
+ rir_scp: null
222
+ rir_apply_prob: 1.0
223
+ noise_scp: null
224
+ noise_apply_prob: 1.0
225
+ noise_db_range: '13_15'
226
+ short_noise_thres: 0.5
227
+ aux_ctc_tasks: []
228
+ frontend: default
229
+ frontend_conf:
230
+ n_fft: 512
231
+ win_length: 400
232
+ hop_length: 160
233
+ fs: 16k
234
+ specaug: specaug
235
+ specaug_conf:
236
+ apply_time_warp: true
237
+ time_warp_window: 5
238
+ time_warp_mode: bicubic
239
+ apply_freq_mask: true
240
+ freq_mask_width_range:
241
+ - 0
242
+ - 27
243
+ num_freq_mask: 2
244
+ apply_time_mask: true
245
+ time_mask_width_ratio_range:
246
+ - 0.0
247
+ - 0.05
248
+ num_time_mask: 5
249
+ normalize: utterance_mvn
250
+ normalize_conf: {}
251
+ model: espnet
252
+ model_conf:
253
+ ctc_weight: 0.3
254
+ lsm_weight: 0.1
255
+ length_normalized_loss: false
256
+ preencoder: null
257
+ preencoder_conf: {}
258
+ encoder: e_branchformer
259
+ encoder_conf:
260
+ output_size: 256
261
+ attention_heads: 4
262
+ attention_layer_type: rel_selfattn
263
+ pos_enc_layer_type: rel_pos
264
+ rel_pos_type: latest
265
+ cgmlp_linear_units: 1024
266
+ cgmlp_conv_kernel: 31
267
+ use_linear_after_conv: false
268
+ gate_activation: identity
269
+ num_blocks: 8
270
+ dropout_rate: 0.1
271
+ positional_dropout_rate: 0.1
272
+ attention_dropout_rate: 0.1
273
+ input_layer: conv2d2
274
+ layer_drop_rate: 0.0
275
+ linear_units: 1024
276
+ positionwise_layer_type: linear
277
+ use_ffn: true
278
+ macaron_ffn: true
279
+ merge_conv_kernel: 31
280
+ postencoder: null
281
+ postencoder_conf: {}
282
+ decoder: transformer
283
+ decoder_conf:
284
+ attention_heads: 4
285
+ linear_units: 2048
286
+ num_blocks: 6
287
+ dropout_rate: 0.1
288
+ positional_dropout_rate: 0.1
289
+ self_attention_dropout_rate: 0.1
290
+ src_attention_dropout_rate: 0.1
291
+ layer_drop_rate: 0.0
292
+ preprocessor: default
293
+ preprocessor_conf: {}
294
+ required:
295
+ - output_dir
296
+ - token_list
297
+ version: '202409'
298
+ distributed: false
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png ADDED
exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png ADDED