diff --git a/data_respin/bh/nlsyms.txt b/data_respin/bh/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..01e79c32a8c99c557f0757da7cb6d65b3414466d --- /dev/null +++ b/data_respin/bh/nlsyms.txt @@ -0,0 +1,3 @@ +1 +2 +3 diff --git a/data_respin/bn/nlsyms.txt b/data_respin/bn/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a1218a1024a212bb3db30becd860315f9f3ac52 --- /dev/null +++ b/data_respin/bn/nlsyms.txt @@ -0,0 +1,5 @@ +1 +2 +3 +4 +5 diff --git a/data_respin/ch/nlsyms.txt b/data_respin/ch/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ebaf900161394059478fd88aec30e59092a1d7 --- /dev/null +++ b/data_respin/ch/nlsyms.txt @@ -0,0 +1,4 @@ +1 +2 +3 +4 diff --git a/data_respin/hi/nlsyms.txt b/data_respin/hi/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a1218a1024a212bb3db30becd860315f9f3ac52 --- /dev/null +++ b/data_respin/hi/nlsyms.txt @@ -0,0 +1,5 @@ +1 +2 +3 +4 +5 diff --git a/data_respin/kn/nlsyms.txt b/data_respin/kn/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a1218a1024a212bb3db30becd860315f9f3ac52 --- /dev/null +++ b/data_respin/kn/nlsyms.txt @@ -0,0 +1,5 @@ +1 +2 +3 +4 +5 diff --git a/data_respin/mg/nlsyms.txt b/data_respin/mg/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ebaf900161394059478fd88aec30e59092a1d7 --- /dev/null +++ b/data_respin/mg/nlsyms.txt @@ -0,0 +1,4 @@ +1 +2 +3 +4 diff --git a/data_respin/mr/nlsyms.txt b/data_respin/mr/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ebaf900161394059478fd88aec30e59092a1d7 --- /dev/null +++ b/data_respin/mr/nlsyms.txt @@ -0,0 +1,4 @@ +1 +2 +3 +4 diff --git a/data_respin/mt/nlsyms.txt b/data_respin/mt/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ebaf900161394059478fd88aec30e59092a1d7 --- /dev/null +++ b/data_respin/mt/nlsyms.txt @@ -0,0 +1,4 @@ +1 +2 +3 +4 diff --git a/data_respin/te/nlsyms.txt b/data_respin/te/nlsyms.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ebaf900161394059478fd88aec30e59092a1d7 --- /dev/null +++ b/data_respin/te/nlsyms.txt @@ -0,0 +1,4 @@ +1 +2 +3 +4 diff --git a/exp_small/exp_bh/README.md b/exp_small/exp_bh/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4a7f1ba5d38b7bed4e13f2e678052afe235051bc --- /dev/null +++ b/exp_small/exp_bh/README.md @@ -0,0 +1,405 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: bh +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:31:02 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bh|2220|22453|85.9|13.4|0.7|1.1|15.2|73.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bh|2220|104745|96.8|2.1|1.1|1.2|4.4|73.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/speech_shape +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/speech_shape +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/bh/raw/train_bh_sp/wav.scp + - speech + - sound +- - dump/bh/raw/train_bh_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/bh/raw/dev_bh/wav.scp + - speech + - sound +- - dump/bh/raw/dev_bh/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- े +- क +- र +- ल +- स +- न +- म +- त +- ी +- ि +- ह +- ब +- ् +- प +- व +- ज +- ं +- ो +- द +- ख +- य +- ग +- ट +- ु +- अ +- ई +- इ +- च +- भ +- आ +- ू +- उ +- ए +- श +- ै +- ध +- ड +- फ +- ड़ +- ौ +- . +- छ +- ण +- ष +- थ +- ओ +- ढ़ +- घ +- ठ +- ॉ +- ृ +- ढ +- ऑ +- ँ +- ऊ +- ऋ +- औ +- झ +- ज़ +- फ़ +- ऐ +- ञ +- ऽ +- ख़ +- क़ +- ़ +- ः +- ॅ +- ऱ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..29432e31344892186a8e3fe1dead866bf574b4bb --- /dev/null +++ b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:31:02 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bh|2220|22453|85.9|13.4|0.7|1.1|15.2|73.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bh|2220|104745|96.8|2.1|1.1|1.2|4.4|73.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c42ee80bdb7b7f9f8b1e310d612c1c4bff7c3ed1 --- /dev/null +++ b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,304 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/speech_shape +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/speech_shape +- exp_small/exp_bh/asr_stats_raw_bh_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/bh/raw/train_bh_sp/wav.scp + - speech + - sound +- - dump/bh/raw/train_bh_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/bh/raw/dev_bh/wav.scp + - speech + - sound +- - dump/bh/raw/dev_bh/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- े +- क +- र +- ल +- स +- न +- म +- त +- ी +- ि +- ह +- ब +- ् +- प +- व +- ज +- ं +- ो +- द +- ख +- य +- ग +- ट +- ु +- अ +- ई +- इ +- च +- भ +- आ +- ू +- उ +- ए +- श +- ै +- ध +- ड +- फ +- ड़ +- ौ +- . +- छ +- ण +- ष +- थ +- ओ +- ढ़ +- घ +- ठ +- ॉ +- ृ +- ढ +- ऑ +- ँ +- ऊ +- ऋ +- औ +- झ +- ज़ +- फ़ +- ऐ +- ञ +- ऽ +- ख़ +- क़ +- ़ +- ः +- ॅ +- ऱ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..dc22c60ab4cbf3935986a5c6a147917f4e8cb5cc Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..40fd1461dcc4be23aa2044f2765381bd0babeb63 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..dc59263d59a6701554d4a5ce6ee97237eae220b0 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..33d9ebfa5934023b842edf43f3b4ffb2eb4ed1b2 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..b81631543c9df8fe6cb495a10c59e30554838552 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..a5e18032c65dd1d6fb4f824d537bfc53b41a05ff Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..312541f6d1ab2648d316caa3d54d4cd5c3bb6540 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..7ec0636e69cdb9c21e172502aeb675a16eea29fe Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..c152b6e98daeef099ebbe5696e9ba367e9f705a1 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a7af94bc4b5beb29b3ea04870bf32902759e9d61 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..6f0fac44a8f46b0364360513015e6df8db00f9d7 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..fbc890f02e1f270d6708a171480231d515167e95 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..3a39b83490bd817b24edfca5aa4beff696d0912d Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..6a2382085022615669e2661b8a686fbaa7f4878a Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..0a710a8f7f3e91a8be08adbf358270fc236e6a90 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..c99ba5d9d1a0a392074b5f22b2e14dd2ea2f45b9 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..d1c5e28d40cdad9bf2d270f1a9a2143b02882475 Binary files /dev/null and b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..6da7f4ce30fdae34445cb63c812d7c0284276e9c --- /dev/null +++ b/exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd89d459111cb4a487d4809ee4278936f8a8eeec09385890a3a4ac45df5eabe +size 112628010 diff --git a/exp_small/exp_bh/meta.yaml b/exp_small/exp_bh/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..288a5f5427a15e62a608635e3ff2e16e743bd325 --- /dev/null +++ b/exp_small/exp_bh/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120464.687666 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_bh/asr_bh_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_bn/README.md b/exp_small/exp_bn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..adabbdbc89b38ddc395a8d4317081cf5a4da2c6c --- /dev/null +++ b/exp_small/exp_bn/README.md @@ -0,0 +1,399 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: bn +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:31:23 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bn|2174|20534|86.3|12.6|1.1|1.2|15.0|65.7| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bn|2174|114101|97.1|1.6|1.2|1.2|4.1|65.7| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/speech_shape +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/speech_shape +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/bn/raw/train_bn_sp/wav.scp + - speech + - sound +- - dump/bn/raw/train_bn_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/bn/raw/dev_bn/wav.scp + - speech + - sound +- - dump/bn/raw/dev_bn/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- া +- ে +- র +- ক +- ্ +- ি +- ন +- ব +- ল +- য +- ম +- স +- ত +- প +- ট +- য় +- হ +- ু +- দ +- ো +- জ +- ই +- গ +- চ +- ছ +- শ +- আ +- থ +- ভ +- এ +- ষ +- ধ +- ী +- উ +- ফ +- খ +- ড +- অ +- ং +- ও +- ড় +- ণ +- ঙ +- ঁ +- ৃ +- . +- ঠ +- ৈ +- ূ +- ৎ +- ঞ +- ঘ +- ঋ +- ঝ +- ৌ +- ঢ +- ় +- ঢ় +- ঃ +- ঊ +- ঐ +- ঔ +- ঈ +- ৠ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..5443480e2fc4f35be2ef9d81d111f5e32b2e25eb --- /dev/null +++ b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:31:23 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bn|2174|20534|86.3|12.6|1.1|1.2|15.0|65.7| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_bn|2174|114101|97.1|1.6|1.2|1.2|4.1|65.7| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1419716fd885920738c116fc7c1ee849eb74e46 --- /dev/null +++ b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,298 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/speech_shape +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/speech_shape +- exp_small/exp_bn/asr_stats_raw_bn_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/bn/raw/train_bn_sp/wav.scp + - speech + - sound +- - dump/bn/raw/train_bn_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/bn/raw/dev_bn/wav.scp + - speech + - sound +- - dump/bn/raw/dev_bn/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- া +- ে +- র +- ক +- ্ +- ি +- ন +- ব +- ল +- য +- ম +- স +- ত +- প +- ট +- য় +- হ +- ু +- দ +- ো +- জ +- ই +- গ +- চ +- ছ +- শ +- আ +- থ +- ভ +- এ +- ষ +- ধ +- ী +- উ +- ফ +- খ +- ড +- অ +- ং +- ও +- ড় +- ণ +- ঙ +- ঁ +- ৃ +- . +- ঠ +- ৈ +- ূ +- ৎ +- ঞ +- ঘ +- ঋ +- ঝ +- ৌ +- ঢ +- ় +- ঢ় +- ঃ +- ঊ +- ঐ +- ঔ +- ঈ +- ৠ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..c761fbe1a4309c233b24d8cd38d16543fba7fbd5 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..94c807bae89326eaf580a4eac61973e5fe23032d Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..9aafd9120f877993d5f5ff68d611c9e735fb838f Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..699b37544ffd0dd1fe482621b16ce0c7bfc07359 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..d2d0e4b9388766a7180a30f0a3081b00b9a4911d Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..d791e037051df025937d1d85f2af2f7aa0ef9509 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..09524877336acf572b7df4c5baea5cdd617e8f1c Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..59010b2dc657cae10c879645d6f7d0558a0adf7d Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..cadabe11c0f7b67bcd1039b59b7dd1b8857f6ca1 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..0e12f4361516bbf9f589bfc99f3efd5cbe59e064 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..dc3acc2d53d8e956d9a16fae0c64d3eaaa347570 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..a5cbd3dcf98f9513d77f01d6049e33c34fd6281a Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..b029de85095e0969451e2904f8b15fe26c3dc86e Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..496ffa5921abf7e0f8c9087267850423f3ef10d8 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..21eaaf53111319a33c77fc648f9ba7b82a7ef183 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..cf37d9cd34f1e8dcb42f50ea99bbdf0f3d6886da Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..b5eb8ccd9e15d50ddb672c31a066a77e0b76e9b5 Binary files /dev/null and b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..610f235d38ff660978ffcdf57e9b51bc8707bb45 --- /dev/null +++ b/exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467a3d2b8252e0530c63dbc92ce05e8d96a104907690b7f43b618d82d0806530 +size 112609578 diff --git a/exp_small/exp_bn/meta.yaml b/exp_small/exp_bn/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90ecdedf038e0a51058b482a5885978271ccdb51 --- /dev/null +++ b/exp_small/exp_bn/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120485.2726 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_bn/asr_bn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_ch/README.md b/exp_small/exp_ch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bceb479e831397cf9188a3c7373890a3e402bc9a --- /dev/null +++ b/exp_small/exp_ch/README.md @@ -0,0 +1,403 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: ch +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:31:44 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_ch|2234|27969|89.9|9.6|0.5|0.5|10.6|67.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_ch|2234|120476|97.7|1.4|0.9|0.8|3.1|67.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/train/speech_shape +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/valid/speech_shape +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/ch/raw/train_ch_sp/wav.scp + - speech + - sound +- - dump/ch/raw/train_ch_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/ch/raw/dev_ch/wav.scp + - speech + - sound +- - dump/ch/raw/dev_ch/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- े +- ा +- क +- र +- न +- ल +- म +- ह +- स +- ब +- ो +- ी +- ि +- त +- ज +- प +- ं +- थ +- य +- ग +- द +- व +- ख +- इ +- ् +- ु +- अ +- ट +- च +- ू +- ए +- उ +- भ +- घ +- फ +- आ +- ड़ +- ध +- ओ +- ई +- ड +- छ +- . +- ँ +- ै +- ठ +- ौ +- झ +- ढ़ +- श +- ढ +- ण +- ऊ +- ॉ +- ऑ +- ष +- ऋ +- ृ +- ऐ +- औ +- फ़ +- ज़ +- ॅ +- ः +- क़ +- ख़ +- ञ +- ़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..97ebfed36379cbd0192a32b216b297c5dd66c61e --- /dev/null +++ b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:31:44 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_ch|2234|27969|89.9|9.6|0.5|0.5|10.6|67.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_ch|2234|120476|97.7|1.4|0.9|0.8|3.1|67.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..164c54c514159cf6f19a2f04bdd962f51681b6e2 --- /dev/null +++ b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,302 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/train/speech_shape +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/valid/speech_shape +- exp_small/exp_ch/asr_stats_raw_ch_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/ch/raw/train_ch_sp/wav.scp + - speech + - sound +- - dump/ch/raw/train_ch_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/ch/raw/dev_ch/wav.scp + - speech + - sound +- - dump/ch/raw/dev_ch/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- े +- ा +- क +- र +- न +- ल +- म +- ह +- स +- ब +- ो +- ी +- ि +- त +- ज +- प +- ं +- थ +- य +- ग +- द +- व +- ख +- इ +- ् +- ु +- अ +- ट +- च +- ू +- ए +- उ +- भ +- घ +- फ +- आ +- ड़ +- ध +- ओ +- ई +- ड +- छ +- . +- ँ +- ै +- ठ +- ौ +- झ +- ढ़ +- श +- ढ +- ण +- ऊ +- ॉ +- ऑ +- ष +- ऋ +- ृ +- ऐ +- औ +- फ़ +- ज़ +- ॅ +- ः +- क़ +- ख़ +- ञ +- ़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..bfbf084b5c303f273307444359cf0dd20168d664 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..1b094e48f1a29ad262dac6aa5a89ab614c73d042 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..848ffbb2d15326a984c3dbd07c3d82bdba0789f2 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..a9abec601b57e3dafaa2c96414aea294da05cd40 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..8a093ef42413b9d7a2e4650edf2c4c75f1fda416 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..31b49bf044b74f35166e3af0e654457417b56f52 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..7c60ccde4d6b2d28ac7d96c6b06df7a0c335206a Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..4c0dfde15aed9ac245bc7f83ba81e9ee7c1c0d55 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..cf6014d74ff10b27a435333ea008ca92d248f41f Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1c0e2dbd7a4d9d976ec4726169ac37167fe337b7 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..44996e5ca0f8c68524bdf1aed0837e6294f31831 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..712645eff6127fd9482bd78e5c1c17fcbd5171c6 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..1744bd0c2e066fbb9629485e030f736bb8a84728 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..027caa52a656255e509e1cc7513840c7b21b69ab Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..9161b26b2378a41744d3802d6285c0ac42455cc0 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..e6d25269beae8ee3ca94a4e0a1b06087fa3d2358 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..7b2b74114e9174431c33ecb0b392d7b4a672ff38 Binary files /dev/null and b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9ef22231446dec3bee3a1a764042e1ac46e8c90 --- /dev/null +++ b/exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d390cb7554a32f765a545a74b322ba57f3f419944ba4e2e673fbe9a2252df35 +size 112621866 diff --git a/exp_small/exp_ch/meta.yaml b/exp_small/exp_ch/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79f85afef90066535b36341cd3ed87957441d4f2 --- /dev/null +++ b/exp_small/exp_ch/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120505.893883 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_ch/asr_ch_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_hi/README.md b/exp_small/exp_hi/README.md new file mode 100644 index 0000000000000000000000000000000000000000..98ecfb80925b74d98cdd4edc7bdd456077505da6 --- /dev/null +++ b/exp_small/exp_hi/README.md @@ -0,0 +1,407 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: hi +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:32:04 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_hi|2288|24958|90.9|8.6|0.5|0.9|9.9|55.8| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_hi|2288|121598|97.6|1.6|0.8|0.8|3.1|55.8| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/train/speech_shape +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/valid/speech_shape +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/hi/raw/train_hi_sp/wav.scp + - speech + - sound +- - dump/hi/raw/train_hi_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/hi/raw/dev_hi/wav.scp + - speech + - sound +- - dump/hi/raw/dev_hi/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- े +- र +- ् +- त +- स +- ी +- न +- ह +- ं +- ि +- म +- ो +- प +- ै +- ल +- य +- ज +- ब +- व +- द +- ग +- ु +- ट +- ए +- ू +- श +- च +- भ +- अ +- ख +- आ +- ध +- ड +- फ +- उ +- ण +- ई +- ष +- इ +- थ +- ौ +- ड़ +- . +- छ +- औ +- ॉ +- ृ +- ँ +- झ +- ऋ +- घ +- ओ +- ढ़ +- ठ +- ज़ +- ऑ +- ऊ +- ऐ +- ञ +- ़ +- फ़ +- ढ +- ः +- ख़ +- क़ +- ग़ +- ङ +- ॅ +- ऍ +- ॠ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..d9b1ec533a7c45e95b9cbf25af798acede7d73c5 --- /dev/null +++ b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:32:04 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_hi|2288|24958|90.9|8.6|0.5|0.9|9.9|55.8| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_hi|2288|121598|97.6|1.6|0.8|0.8|3.1|55.8| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa7c07027e7334c2fb8744d3a49eda525dae5a56 --- /dev/null +++ b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,306 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/train/speech_shape +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/valid/speech_shape +- exp_small/exp_hi/asr_stats_raw_hi_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/hi/raw/train_hi_sp/wav.scp + - speech + - sound +- - dump/hi/raw/train_hi_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/hi/raw/dev_hi/wav.scp + - speech + - sound +- - dump/hi/raw/dev_hi/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- े +- र +- ् +- त +- स +- ी +- न +- ह +- ं +- ि +- म +- ो +- प +- ै +- ल +- य +- ज +- ब +- व +- द +- ग +- ु +- ट +- ए +- ू +- श +- च +- भ +- अ +- ख +- आ +- ध +- ड +- फ +- उ +- ण +- ई +- ष +- इ +- थ +- ौ +- ड़ +- . +- छ +- औ +- ॉ +- ृ +- ँ +- झ +- ऋ +- घ +- ओ +- ढ़ +- ठ +- ज़ +- ऑ +- ऊ +- ऐ +- ञ +- ़ +- फ़ +- ढ +- ः +- ख़ +- क़ +- ग़ +- ङ +- ॅ +- ऍ +- ॠ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..5c0ef73d558be2399067bf5a4f2ea62302da5392 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..4c8036b12af6348cc35ffef657b3f24a9218090b Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..fbf788d83c6481ea93ee70d379678b8e2e0939f3 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..3e7c2b4dc07ae034b22acfdc01252285fdb731dd Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..169ae08d82771c6d63353abf3c38f4cac6681668 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..89526f703af4b81ad523c1bf38ebc8e50173fd64 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..e6b4017a8f72723c39880dd9ee8e5136586ac751 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..8e7e2cc9ea859d67480cc1a200c4f703cf334083 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..746aa2f896c0103b75daa8992166cbfa39f2ce2e Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..694494cd3f31e8240ad979d7e23bb83bbfae5645 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..d0ac1b132ca6a08b38aecfde073281de16143304 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..570a93a70151a6f4b5fbfdd8161d1ee2fcba9955 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..c9f642024c8051477bee8c112b214e76b25d3469 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..a880d202e71140f5c856f7fb37a0545528ed0a5f Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..f9ab20dac983b9a9f67f413907c759c12023d356 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..9c13238cfc4c34d6e59a552030f7d57030220a11 Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..6963553e15222cf2b01dd5cf6e00e355f5561b3a Binary files /dev/null and b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e8d30b66164d54693abc3909820c6005aefc66a --- /dev/null +++ b/exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3a266a4fc66a62b01b7dbf24fa03a91928caa1da09db3ad226509befe7a106 +size 112634154 diff --git a/exp_small/exp_hi/meta.yaml b/exp_small/exp_hi/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdf9ef1318f7bc80c34d07d1e43c36451f447d23 --- /dev/null +++ b/exp_small/exp_hi/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120526.738138 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_hi/asr_hi_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_kn/README.md b/exp_small/exp_kn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..50b449d5ea3a82b71bd5988f3d7a5d6b5a522042 --- /dev/null +++ b/exp_small/exp_kn/README.md @@ -0,0 +1,400 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: kn +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:32:26 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_kn|2161|17676|77.6|20.8|1.6|2.1|24.5|73.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_kn|2161|126552|97.0|1.7|1.4|1.6|4.6|73.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/train/speech_shape +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/valid/speech_shape +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/kn/raw/train_kn_sp/wav.scp + - speech + - sound +- - dump/kn/raw/train_kn_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/kn/raw/dev_kn/wav.scp + - speech + - sound +- - dump/kn/raw/dev_kn/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ್ +- ಿ +- ಾ +- ರ +- ು +- ನ +- ಕ +- ತ +- ದ +- ೆ +- ಗ +- ಸ +- ಲ +- ವ +- ಯ +- ಂ +- ಮ +- ಬ +- ಳ +- ಡ +- ಟ +- ಹ +- ಪ +- ೇ +- ಅ +- ೊ +- ಣ +- ೋ +- ಜ +- ಇ +- ೂ +- ಷ +- ಚ +- ೀ +- ಎ +- ಆ +- ಶ +- ೈ +- ಧ +- ಒ +- ಭ +- . +- ಉ +- ಫ +- ಥ +- ಖ +- ೃ +- ೌ +- ಏ +- ಐ +- ಈ +- ಠ +- ಘ +- ಛ +- ಓ +- ಔ +- ಞ +- ಊ +- ಋ +- ೕ +- ಢ +- ಃ +- ಝ +- ೖ +- ೯ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..af7951cb73872344f7ddd53242c0adc91cb7015e --- /dev/null +++ b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:32:26 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_kn|2161|17676|77.6|20.8|1.6|2.1|24.5|73.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_kn|2161|126552|97.0|1.7|1.4|1.6|4.6|73.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdebe2ab6834c3f0cc595c3ade7b75eb02c5e650 --- /dev/null +++ b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,299 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/train/speech_shape +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/valid/speech_shape +- exp_small/exp_kn/asr_stats_raw_kn_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/kn/raw/train_kn_sp/wav.scp + - speech + - sound +- - dump/kn/raw/train_kn_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/kn/raw/dev_kn/wav.scp + - speech + - sound +- - dump/kn/raw/dev_kn/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ್ +- ಿ +- ಾ +- ರ +- ು +- ನ +- ಕ +- ತ +- ದ +- ೆ +- ಗ +- ಸ +- ಲ +- ವ +- ಯ +- ಂ +- ಮ +- ಬ +- ಳ +- ಡ +- ಟ +- ಹ +- ಪ +- ೇ +- ಅ +- ೊ +- ಣ +- ೋ +- ಜ +- ಇ +- ೂ +- ಷ +- ಚ +- ೀ +- ಎ +- ಆ +- ಶ +- ೈ +- ಧ +- ಒ +- ಭ +- . +- ಉ +- ಫ +- ಥ +- ಖ +- ೃ +- ೌ +- ಏ +- ಐ +- ಈ +- ಠ +- ಘ +- ಛ +- ಓ +- ಔ +- ಞ +- ಊ +- ಋ +- ೕ +- ಢ +- ಃ +- ಝ +- ೖ +- ೯ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..404ab7423b5f6ba9145b7f648908eff36c46ecdd Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..380c5e3d75a8568bc1cc5ff422d9506fc17c9280 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..e364b9e2e5e120f9a1fe2932d458d2e8e0855211 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..1c5440092d9d068cd8d86f9af644b2ee97488874 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..b17832fb8a9cb57375cb58db2cab7b0a5fdbc6ff Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..f7dec6912950fc4812a9933ece7756226be2ab7a Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..5d50993d6634745f3644404154baf49814456b4e Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..3e66b751a8b717cf81b70eab5da1f7dfa28bd439 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..4cbe3226b95822cf699cf98ed50e0965d84636d3 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..97e54ed5f68b3d91289d2260b2728be2aff653b3 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..79653393d7eb0dd369163bd6f8a40940bbeb002d Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..58d6fbbd506c37e07579b6655c12e53b8f92d7a7 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..4c95273c182c35c95fc33db28759ea315f3e06d3 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..1686d4073754dda7739b49fc4e5bc4b26469e000 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..b62670f84553dbf3ce9937c22f61257aa6534415 Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..2715f1b09549822dad5fca85ea5382c41bbd7d1f Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..fe4e66889f6fb6ea6afd35248ca3d2b072170a8e Binary files /dev/null and b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..63876ae7ede45a0d2595193062b461d4c77b0819 --- /dev/null +++ b/exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b582860d8b57d7b3c87ecabdde6d7ade849c21f5bca2a9c6e31f9c2fe0f35994 +size 112612650 diff --git a/exp_small/exp_kn/meta.yaml b/exp_small/exp_kn/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ced4626c0e42f79fdd6a1f3477521f2bb2babbfa --- /dev/null +++ b/exp_small/exp_kn/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748124996.523497 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_kn/asr_kn_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_mg/README.md b/exp_small/exp_mg/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b08e3a56b2a16a1c06e6ed97677fdd8be2d6133e --- /dev/null +++ b/exp_small/exp_mg/README.md @@ -0,0 +1,406 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: mg +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:32:46 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mg|2193|22217|81.5|17.6|0.9|1.8|20.4|82.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mg|2193|105161|95.6|3.1|1.3|1.6|6.0|82.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/train/speech_shape +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/valid/speech_shape +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mg/raw/train_mg_sp/wav.scp + - speech + - sound +- - dump/mg/raw/train_mg_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mg/raw/dev_mg/wav.scp + - speech + - sound +- - dump/mg/raw/dev_mg/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- े +- र +- ् +- स +- ल +- न +- ह +- त +- म +- ि +- ी +- प +- ो +- ब +- य +- ं +- व +- ज +- द +- ग +- इ +- ट +- ु +- ई +- ै +- ख +- च +- छ +- ू +- श +- भ +- अ +- आ +- ध +- ए +- ड +- उ +- फ +- ष +- ण +- थ +- ड़ +- ौ +- . +- ऽ +- ृ +- ॉ +- औ +- ढ़ +- घ +- ठ +- ँ +- ओ +- ऋ +- ऑ +- ऊ +- झ +- ज़ +- ढ +- ऐ +- फ़ +- ञ +- ः +- ख़ +- क़ +- ङ +- ग़ +- ऍ +- ॅ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..7155c46bb2735ac572ae7cda24505b6a88067f73 --- /dev/null +++ b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:32:46 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mg|2193|22217|81.5|17.6|0.9|1.8|20.4|82.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mg|2193|105161|95.6|3.1|1.3|1.6|6.0|82.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d76be829e5acdd299fa4870cd35bf4bd1964f03e --- /dev/null +++ b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,305 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/train/speech_shape +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/valid/speech_shape +- exp_small/exp_mg/asr_stats_raw_mg_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mg/raw/train_mg_sp/wav.scp + - speech + - sound +- - dump/mg/raw/train_mg_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mg/raw/dev_mg/wav.scp + - speech + - sound +- - dump/mg/raw/dev_mg/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- े +- र +- ् +- स +- ल +- न +- ह +- त +- म +- ि +- ी +- प +- ो +- ब +- य +- ं +- व +- ज +- द +- ग +- इ +- ट +- ु +- ई +- ै +- ख +- च +- छ +- ू +- श +- भ +- अ +- आ +- ध +- ए +- ड +- उ +- फ +- ष +- ण +- थ +- ड़ +- ौ +- . +- ऽ +- ृ +- ॉ +- औ +- ढ़ +- घ +- ठ +- ँ +- ओ +- ऋ +- ऑ +- ऊ +- झ +- ज़ +- ढ +- ऐ +- फ़ +- ञ +- ः +- ख़ +- क़ +- ङ +- ग़ +- ऍ +- ॅ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..a636cb7beeb95eaa1131f8c7bf2fa89076997153 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..4553580d9a207542eb8045db21712ed976ee6a88 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..63c11621f6658b26ef0ecb54c9712026d04291ae Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..1fb4126990ab5432b39e04d169f7a0d57883ba98 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..b17832fb8a9cb57375cb58db2cab7b0a5fdbc6ff Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..36af10f66129f167bb2bc623cde101dee6f9872e Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..05b8d531220bbf8e2d90f3803b482b9265be8fcc Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..d25cf17651c8070879fc0cafa0b80f0cba72763f Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..49f200cdc05d8dab4efa9968298e5bbfd1ab5324 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6e80d0336f4fdc3f4f99135ed7702fb4af7b1b22 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..26b26416c7f8d880d7b990ef4cb6e837ce7917eb Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..80ec6708e26177481a9478731eabcb33fc0e4c96 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..b6a82d2cb5e76448e7cc9245bbeba7f352f6ce5d Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..6c710a2f158b9a717c784df90ee3c3df4fd3a33b Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..d8bb04294da9e38d85db76de2e7ee94b7fd1d196 Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..6e7c58d345e72767884db3a104c603a18cf030bb Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..1e1f03905865d525918f57e57e464dc969e7c8ab Binary files /dev/null and b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd91ab36763ec3f1f3855d117357552c3ba7c2e4 --- /dev/null +++ b/exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d0342d8e5a783c57d583c3e3737dc9711225d1aba758db3cac9698c81d2645 +size 112631082 diff --git a/exp_small/exp_mg/meta.yaml b/exp_small/exp_mg/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b97ba0229dbb4cac4ecbf0c373dfcb159ce61dd7 --- /dev/null +++ b/exp_small/exp_mg/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120568.092736 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_mg/asr_mg_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_mr/README.md b/exp_small/exp_mr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a9a2947ee8f203951a8ee411b930e6e28995cd87 --- /dev/null +++ b/exp_small/exp_mr/README.md @@ -0,0 +1,404 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: mr +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:33:06 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mr|2170|17526|86.7|12.2|1.1|1.1|14.5|57.9| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mr|2170|108873|97.8|1.4|0.8|0.9|3.1|57.9| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/train/speech_shape +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/valid/speech_shape +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mr/raw/train_mr_sp/wav.scp + - speech + - sound +- - dump/mr/raw/train_mr_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mr/raw/dev_mr/wav.scp + - speech + - sound +- - dump/mr/raw/dev_mr/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- ् +- र +- त +- क +- े +- य +- ी +- स +- न +- व +- ल +- म +- ि +- प +- च +- ं +- ह +- ो +- ज +- ण +- द +- आ +- ग +- श +- ब +- ु +- ट +- ू +- ड +- ध +- अ +- ख +- ठ +- ळ +- भ +- ष +- फ +- उ +- ए +- थ +- . +- घ +- झ +- ँ +- ै +- ई +- ढ +- इ +- ॉ +- ऊ +- ॅ +- ृ +- ऑ +- ऱ +- ओ +- ौ +- छ +- ञ +- औ +- ॲ +- ः +- ऐ +- ऍ +- ऋ +- ़ +- ':' +- ड़ +- फ़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..fc92c324c8070c96df9b05f3e74904985d76b43b --- /dev/null +++ b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:33:06 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mr|2170|17526|86.7|12.2|1.1|1.1|14.5|57.9| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mr|2170|108873|97.8|1.4|0.8|0.9|3.1|57.9| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3bf4668186b3c43b9ecff1432f3249f2dfc9620 --- /dev/null +++ b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,303 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/train/speech_shape +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/valid/speech_shape +- exp_small/exp_mr/asr_stats_raw_mr_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mr/raw/train_mr_sp/wav.scp + - speech + - sound +- - dump/mr/raw/train_mr_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mr/raw/dev_mr/wav.scp + - speech + - sound +- - dump/mr/raw/dev_mr/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- ् +- र +- त +- क +- े +- य +- ी +- स +- न +- व +- ल +- म +- ि +- प +- च +- ं +- ह +- ो +- ज +- ण +- द +- आ +- ग +- श +- ब +- ु +- ट +- ू +- ड +- ध +- अ +- ख +- ठ +- ळ +- भ +- ष +- फ +- उ +- ए +- थ +- . +- घ +- झ +- ँ +- ै +- ई +- ढ +- इ +- ॉ +- ऊ +- ॅ +- ृ +- ऑ +- ऱ +- ओ +- ौ +- छ +- ञ +- औ +- ॲ +- ः +- ऐ +- ऍ +- ऋ +- ़ +- ':' +- ड़ +- फ़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..cc5b2ce43052b59baf99f2484f3aac853bf9908a Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..f8db215f4c8d465ca7afdf6bf2ed710360961ad6 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..c4f2797022bf774763c4ee97a9c87dcc6cf28786 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..0acb6cbc7799996cf994fb18be4a0833bd6b195c Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..b17832fb8a9cb57375cb58db2cab7b0a5fdbc6ff Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..7578383cfb7879d6feae0507e7c89a4cf4e4f1b7 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..9da5227d2cbff13011688245058cc25fb71794b3 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..3c3f72e7deee0bb4eb4d18d8eabf9127b2d13671 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..f0a0fe70a42531c96082bb8ef674472db9f1038e Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1651a35f36659f404abfe7860e5e859d72854ab8 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..72b7a93e26b77ddd3225483fd847b664f3db3bf2 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..998722098dab63a884f4838453422a381017a746 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..31ae7789ef22db0a099e21a8dadad9fb265b86d7 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..1b3f78f633044decf897e7f3354a26ff8be51926 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..47d6dae6c135dca568ce77a15f00b983d34d5d18 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..6ded9f41cde4517616e2d906323bd017c1e6f530 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..531a2c555e64a3cd88943d8aabf40281b624fde1 Binary files /dev/null and b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..316d2e4d9ed27283b793467aadedc55111f3c0ea --- /dev/null +++ b/exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:142cdd748248259de4addba7ff2b17b2b576a3cc7aed1ea6f399bf0cc6a560eb +size 112624938 diff --git a/exp_small/exp_mr/meta.yaml b/exp_small/exp_mr/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a343d6f6cd70353b600ba98492da0ca70940f2b2 --- /dev/null +++ b/exp_small/exp_mr/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120588.150078 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_mr/asr_mr_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_mt/README.md b/exp_small/exp_mt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f7d10a0173642c0edbd08ba072f6a384987ef004 --- /dev/null +++ b/exp_small/exp_mt/README.md @@ -0,0 +1,405 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: mt +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:33:26 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mt|2172|22835|83.1|16.2|0.7|1.0|17.9|78.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mt|2172|107571|96.2|2.6|1.2|1.2|5.0|78.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 4 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 10 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 12000000 +valid_batch_bins: null +train_shape_file: +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/train/speech_shape +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/valid/speech_shape +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mt/raw/train_mt_sp/wav.scp + - speech + - sound +- - dump/mt/raw/train_mt_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mt/raw/dev_mt/wav.scp + - speech + - sound +- - dump/mt/raw/dev_mt/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- र +- े +- ् +- ल +- स +- न +- ि +- त +- म +- ै +- ी +- य +- प +- ो +- ब +- छ +- ह +- ज +- ं +- व +- द +- ग +- ु +- ट +- इ +- अ +- भ +- ख +- आ +- श +- च +- ए +- ू +- ध +- उ +- ण +- ँ +- ष +- फ +- ड +- थ +- ड़ +- . +- ई +- ृ +- ौ +- ॅ +- ओ +- ऋ +- घ +- ढ़ +- ठ +- ॉ +- ऽ +- ऑ +- झ +- ऊ +- औ +- ञ +- ढ +- ः +- ऐ +- फ़ +- ज़ +- ॠ +- ख़ +- क़ +- ङ +- ग़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202402' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..5614c6164c85cf60ece3994b94da219fe6862e06 --- /dev/null +++ b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:33:26 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mt|2172|22835|83.1|16.2|0.7|1.0|17.9|78.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_mt|2172|107571|96.2|2.6|1.2|1.2|5.0|78.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..133b5df8c68031a0aa8584bc2fd151cf97f49b0f --- /dev/null +++ b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,304 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 4 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 10 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 12000000 +valid_batch_bins: null +train_shape_file: +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/train/speech_shape +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/valid/speech_shape +- exp_small/exp_mt/asr_stats_raw_mt_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/mt/raw/train_mt_sp/wav.scp + - speech + - sound +- - dump/mt/raw/train_mt_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/mt/raw/dev_mt/wav.scp + - speech + - sound +- - dump/mt/raw/dev_mt/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ा +- क +- र +- े +- ् +- ल +- स +- न +- ि +- त +- म +- ै +- ी +- य +- प +- ो +- ब +- छ +- ह +- ज +- ं +- व +- द +- ग +- ु +- ट +- इ +- अ +- भ +- ख +- आ +- श +- च +- ए +- ू +- ध +- उ +- ण +- ँ +- ष +- फ +- ड +- थ +- ड़ +- . +- ई +- ृ +- ौ +- ॅ +- ओ +- ऋ +- घ +- ढ़ +- ठ +- ॉ +- ऽ +- ऑ +- झ +- ऊ +- औ +- ञ +- ढ +- ः +- ऐ +- फ़ +- ज़ +- ॠ +- ख़ +- क़ +- ङ +- ग़ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202402' +distributed: false diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..460c129ca5b750ffd7c92e6f0a17639bc278dcc3 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..b0bacf4137d8261a805cbdb16665bc3d42d16af7 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..0b2f641a2b5dc9d93a476f2763139d122a604bfd Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..6e0ab7725bee495a752761ffdc435eec3fcbb2ea Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..d401ae9382d0d4d9bd4df70ff89e4031088552da Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..c13d2e4b0300fc1edc6d58b5e45a1b824ff38af2 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..ef8b2dfb2439662b08aca95573e331b11e7bcfbc Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..e9c4988b8a5bdb240f32c7a912f4f054a50c420f Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..add98ff1836b8222f70398ce8316b54d8e7f3fe4 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..fa04675541cd6bf6af3fab670abb78ab05cf3686 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..aa4e2cbc9a41a9176296acd6894894f33872f4a6 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..006414d983fb264a9cc34ab9893a5578287a8d24 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..90a7484646c772feedfa74d8692f0306e056d481 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..9edeaad2d808bbd5ba5ed4100ea8b3a228129ceb Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..f0ab54d792e5a369b15804dcaafeac0f35ff8ad6 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..139426d9fb6d2825d7fc41c7605f422988a91347 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..083223a47969bf2d021bd1c4de421fd8e07069a8 Binary files /dev/null and b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_10best.pth b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_10best.pth new file mode 100644 index 0000000000000000000000000000000000000000..e758ef684d45b6a8493ba36239d2b4564e23ab98 --- /dev/null +++ b/exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_10best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aa3e35f6fe4c1981fc160bd3d8a1ff0c12fbe64794fca806c5a597f9565be1 +size 139056430 diff --git a/exp_small/exp_mt/meta.yaml b/exp_small/exp_mt/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c70b4a0d65850f93eea3ba9a8842d87de374c14 --- /dev/null +++ b/exp_small/exp_mt/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_10best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120608.234658 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_mt/asr_mt_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml diff --git a/exp_small/exp_te/README.md b/exp_small/exp_te/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8ccbe91407709b5ca89a18a20b2eca286eb02ef --- /dev/null +++ b/exp_small/exp_te/README.md @@ -0,0 +1,399 @@ +--- +tags: +- espnet +- audio +- automatic-speech-recognition +language: te +datasets: +- respin_small +license: cc-by-4.0 +--- + +## ESPnet2 ASR model + +### `SpireLab/spire_respin_baselines_espnet` + +This model was trained by wtc7 using respin_small recipe in [espnet](https://github.com/espnet/espnet/). + +### Demo: How to use in ESPnet2 + +Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) +if you haven't done that already. + +```bash +cd espnet + +pip install -e . +cd egs2/respin_small/asr1 +./run.sh --skip_data_prep false --skip_train true --download_model SpireLab/spire_respin_baselines_espnet +``` + + +# RESULTS +## Environments +- date: `Sun May 25 02:33:47 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_te|2226|17825|80.6|17.2|2.2|2.2|21.6|72.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_te|2226|125985|97.1|1.8|1.2|1.2|4.1|72.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| + +## ASR config + +
expand + +``` +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_te/asr_stats_raw_te_char_sp/train/speech_shape +- exp_small/exp_te/asr_stats_raw_te_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_te/asr_stats_raw_te_char_sp/valid/speech_shape +- exp_small/exp_te/asr_stats_raw_te_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/te/raw/train_te_sp/wav.scp + - speech + - sound +- - dump/te/raw/train_te_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/te/raw/dev_te/wav.scp + - speech + - sound +- - dump/te/raw/dev_te/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ్ +- ు +- ా +- ి +- న +- ల +- ం +- క +- ర +- ప +- త +- వ +- ట +- స +- ే +- య +- డ +- ద +- మ +- చ +- ో +- గ +- ె +- బ +- ీ +- అ +- ొ +- ఎ +- ూ +- జ +- ై +- ఉ +- ధ +- ఇ +- ఆ +- ష +- భ +- శ +- ఏ +- ళ +- ఫ +- ణ +- . +- హ +- థ +- ఒ +- ఖ +- ఈ +- ౌ +- ఐ +- ృ +- ఓ +- ఊ +- ఋ +- ఛ +- ఘ +- ఠ +- ఔ +- ఱ +- ఢ +- ఞ +- ః +- ౖ +- ౦ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false +``` + +
+ + + +### Citing ESPnet + +```BibTex +@inproceedings{watanabe2018espnet, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + title={{ESPnet}: End-to-End Speech Processing Toolkit}, + year={2018}, + booktitle={Proceedings of Interspeech}, + pages={2207--2211}, + doi={10.21437/Interspeech.2018-1456}, + url={http://dx.doi.org/10.21437/Interspeech.2018-1456} +} + + + + + + +``` + +or arXiv: + +```bibtex +@misc{watanabe2018espnet, + title={ESPnet: End-to-End Speech Processing Toolkit}, + author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, + year={2018}, + eprint={1804.00015}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..5dc08967c4c4b45e5a445f2595c5e4586d9d92d9 --- /dev/null +++ b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/RESULTS.md @@ -0,0 +1,27 @@ + +# RESULTS +## Environments +- date: `Sun May 25 02:33:47 IST 2025` +- python version: `3.8.10 (default, Mar 18 2025, 20:04:55) [GCC 9.4.0]` +- espnet version: `espnet 202412` +- pytorch version: `pytorch 2.3.0+cu121` +- Git hash: `0fe7b8581fbc68841eb48776f052aa9a5989108c` + - Commit date: `Tue Jan 14 20:06:15 2025 -0500` + +## exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_te|2226|17825|80.6|17.2|2.2|2.2|21.6|72.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_lid_asr_model_valid.acc.ave/test_te|2226|125985|97.1|1.8|1.2|1.2|4.1|72.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43b85ee202b6e8510de04dccb89dcb4a6e2fece6 --- /dev/null +++ b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml @@ -0,0 +1,298 @@ +config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e8_mactrue_bs6M_gacc1.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1 +ngpu: 1 +seed: 2022 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: 0 +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +use_deepspeed: false +deepspeed_config: null +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +use_tf32: false +collect_stats: false +write_collected_feats: false +max_epoch: 70 +patience: 5 +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: true +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +use_adapter: false +adapter: lora +save_strategy: all +adapter_conf: {} +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +category_sample_size: 10 +train_shape_file: +- exp_small/exp_te/asr_stats_raw_te_char_sp/train/speech_shape +- exp_small/exp_te/asr_stats_raw_te_char_sp/train/text_shape.char +valid_shape_file: +- exp_small/exp_te/asr_stats_raw_te_char_sp/valid/speech_shape +- exp_small/exp_te/asr_stats_raw_te_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +chunk_default_fs: null +chunk_max_abs_length: null +chunk_discard_short_samples: true +train_data_path_and_name_and_type: +- - dump/te/raw/train_te_sp/wav.scp + - speech + - sound +- - dump/te/raw/train_te_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/te/raw/dev_te/wav.scp + - speech + - sound +- - dump/te/raw/dev_te/text + - text + - text +multi_task_dataset: false +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +allow_multi_rates: false +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 +token_list: +- +- +- +- ్ +- ు +- ా +- ి +- న +- ల +- ం +- క +- ర +- ప +- త +- వ +- ట +- స +- ే +- య +- డ +- ద +- మ +- చ +- ో +- గ +- ె +- బ +- ీ +- అ +- ొ +- ఎ +- ూ +- జ +- ై +- ఉ +- ధ +- ఇ +- ఆ +- ష +- భ +- శ +- ఏ +- ళ +- ఫ +- ణ +- . +- హ +- థ +- ఒ +- ఖ +- ఈ +- ౌ +- ఐ +- ృ +- ఓ +- ఊ +- ఋ +- ఛ +- ఘ +- ఠ +- ఔ +- ఱ +- ఢ +- ఞ +- ః +- ౖ +- ౦ +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true + brctc_risk_strategy: exp + brctc_group_strategy: end + brctc_risk_factor: 0.0 +joint_net_conf: null +use_preprocessor: true +use_lang_prompt: false +use_nlp_prompt: false +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + n_fft: 512 + win_length: 400 + hop_length: 160 + fs: 16k +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0.0 + - 0.05 + num_time_mask: 5 +normalize: utterance_mvn +normalize_conf: {} +model: espnet +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: e_branchformer +encoder_conf: + output_size: 256 + attention_heads: 4 + attention_layer_type: rel_selfattn + pos_enc_layer_type: rel_pos + rel_pos_type: latest + cgmlp_linear_units: 1024 + cgmlp_conv_kernel: 31 + use_linear_after_conv: false + gate_activation: identity + num_blocks: 8 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d2 + layer_drop_rate: 0.0 + linear_units: 1024 + positionwise_layer_type: linear + use_ffn: true + macaron_ffn: true + merge_conv_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + layer_drop_rate: 0.0 +preprocessor: default +preprocessor_conf: {} +required: +- output_dir +- token_list +version: '202409' +distributed: false diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..668c2f7bf46bcfae5c59bb0f80f51be82a5adb56 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/acc.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..a7cf18f0c6cb7c4725ee7ae31d0114ca3919a058 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/backward_time.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..a731d3f1c176f8fdd42c4a0a67d063e0bb82fcdc Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..cc7e5ea1e8e26b463e23c53111116f10cb3f38aa Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/cer_ctc.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..8828a83c8359b9f04dff2b42974592b988de5f7c Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/clip.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..6c6dd4fe482e155656c5a81d1776bfc3e97948a2 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/forward_time.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f201b13b70d6f39b710d14d448af7267eb30e8 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/gpu_max_cached_mem_GB.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..d3ab65a0aa6d19b0052c94788a88afe8ad3c7bdf Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/grad_norm.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..b149260ad21f773b0bd3f5bef9e43dfd6c8d4622 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/iter_time.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..4168ac17894040ab3110dd16dece0d853745b5e1 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..07cc79c141db9e70427604feed85fb21d2b0ab39 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_att.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..8ab909cfa53dc6b7a3bcab7eabb46dece0e647cf Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_ctc.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..4fae61d05b4f94a722f33f61fe8bb8033ff69758 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/loss_scale.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..074469f1174e617af3996152763be6b1a485a6c1 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim0_lr0.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..898b02185c1603f0aeaffd0b0ebe35d54d504489 Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/optim_step_time.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..b40e50d4c4081e5f18666546c5ceab770830b81b Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/train_time.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..79e723aa7fde327f9db87d9138677d4bae755aaa Binary files /dev/null and b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/images/wer.png differ diff --git a/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b86175b63a0271c9f9cb0257ccefa2d323d224c --- /dev/null +++ b/exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06215983b817c2cf5e803b343f335d8368701638c469cd86a8488af053c44b15 +size 112609578 diff --git a/exp_small/exp_te/meta.yaml b/exp_small/exp_te/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4092428d6ef5c97b8906df5243a6574ed5bbcd1d --- /dev/null +++ b/exp_small/exp_te/meta.yaml @@ -0,0 +1,8 @@ +espnet: '202412' +files: + asr_model_file: exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/valid.acc.ave_5best.pth +python: "3.8.10 (default, Mar 18 2025, 20:04:55) \n[GCC 9.4.0]" +timestamp: 1748120629.364632 +torch: 2.3.0+cu121 +yaml_files: + asr_train_config: exp_small/exp_te/asr_te_ebf_size256_mlp1024_lin1024_e8_mactrue_bs6M_gacc1/config.yaml