| | --- |
| | tags: |
| | - espnet |
| | - audio |
| | - automatic-speech-recognition |
| | language: te |
| | datasets: |
| | - microsoft_indian_languages_interspeech2018 |
| |
|
| | license: cc-by-4.0 |
| | --- |
| | |
| | ## ESPnet2 model |
| |
|
| | ### `espnet/chai_microsoft_indian_langs_te` |
| |
|
| | This model was trained by Chaitanya Narisetty using the `ms_indic_is18` recipe in [espnet](https://github.com/espnet/espnet/). |
| |
|
| | ### Demo: How to use in ESPnet2 |
| |
|
| | ```bash |
| | cd espnet |
| | git checkout f91410f712d1287cd6809c5bf26b54c5a40fe314 |
| | pip install -e . |
| | cd egs2/ms_indic_is18/asr1 |
| | ./run.sh --skip_data_prep false --skip_train true --download_model espnet/chai_microsoft_indian_langs_te |
| | ``` |
| |
|
| | <!-- Generated by scripts/utils/show_asr_result.sh --> |
| | # RESULTS |
| | ## Environments |
| | - date: `Tue Mar 22 13:38:24 EDT 2022` |
| | - python version: `3.9.5 (default, Jun 4 2021, 12:28:51) [GCC 7.5.0]` |
| | - espnet version: `espnet 0.10.7a1` |
| | - pytorch version: `pytorch 1.8.1+cu111` |
| | - Git hash: `f91410f712d1287cd6809c5bf26b54c5a40fe314` |
| | - Commit date: `Mon Mar 14 22:32:17 2022 -0400` |
| |
|
| | ## asr_train_asr_xlsr53_conformer_raw_te_bpe150_sp |
| | ### WER |
| |
|
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|28413|78.0|19.5|2.5|2.4|24.4|80.1| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.best_asr_model_valid.acc.ave/test_te|3040|28413|78.0|19.4|2.6|2.4|24.4|79.7| |
| | |decode_transformer5_lm_lm_train_lm_transformer_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|28413|78.0|19.5|2.6|2.5|24.5|79.9| |
| | |
| | ### CER |
| | |
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|229419|95.6|2.2|2.2|1.6|6.1|80.1| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.best_asr_model_valid.acc.ave/test_te|3040|229419|95.6|2.2|2.2|1.6|6.0|79.7| |
| | |decode_transformer5_lm_lm_train_lm_transformer_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|229419|95.6|2.1|2.2|1.6|6.0|79.9| |
| |
|
| | ### TER |
| |
|
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|146657|92.7|4.7|2.6|1.6|8.9|80.1| |
| | |decode_transformer5_lm_lm_train_lm_rnn_te_bpe150_valid.loss.best_asr_model_valid.acc.ave/test_te|3040|146657|92.8|4.7|2.6|1.6|8.9|79.7| |
| | |decode_transformer5_lm_lm_train_lm_transformer_te_bpe150_valid.loss.ave_asr_model_valid.acc.ave/test_te|3040|146657|92.8|4.6|2.6|1.6|8.9|79.9| |
| | |
| | |
| | ## config |
| | |
| | <details><summary>expand</summary> |
| | |
| | ``` |
| | config: conf/tuning/train_asr_xlsr53_conformer.yaml |
| | print_config: false |
| | log_level: INFO |
| | dry_run: false |
| | iterator_type: sequence |
| | output_dir: exp/asr_train_asr_xlsr53_conformer_raw_te_bpe150_sp |
| | ngpu: 1 |
| | seed: 0 |
| | num_workers: 1 |
| | num_att_plot: 3 |
| | dist_backend: nccl |
| | dist_init_method: env:// |
| | dist_world_size: null |
| | dist_rank: null |
| | local_rank: 0 |
| | dist_master_addr: null |
| | dist_master_port: null |
| | dist_launcher: null |
| | multiprocessing_distributed: false |
| | unused_parameters: false |
| | sharded_ddp: false |
| | cudnn_enabled: true |
| | cudnn_benchmark: false |
| | cudnn_deterministic: true |
| | collect_stats: false |
| | write_collected_feats: false |
| | max_epoch: 50 |
| | patience: 15 |
| | val_scheduler_criterion: |
| | - valid |
| | - loss |
| | early_stopping_criterion: |
| | - valid |
| | - loss |
| | - min |
| | best_model_criterion: |
| | - - valid |
| | - acc |
| | - max |
| | keep_nbest_models: 5 |
| | nbest_averaging_interval: 0 |
| | grad_clip: 5 |
| | grad_clip_type: 2.0 |
| | grad_noise: false |
| | accum_grad: 1 |
| | no_forward_run: false |
| | resume: true |
| | train_dtype: float32 |
| | use_amp: false |
| | log_interval: null |
| | use_matplotlib: true |
| | use_tensorboard: true |
| | use_wandb: false |
| | wandb_project: null |
| | wandb_id: null |
| | wandb_entity: null |
| | wandb_name: null |
| | wandb_model_log_interval: -1 |
| | detect_anomaly: false |
| | pretrain_path: null |
| | init_param: [] |
| | ignore_init_mismatch: false |
| | freeze_param: |
| | - frontend.upstream |
| | num_iters_per_epoch: null |
| | batch_size: 64 |
| | valid_batch_size: null |
| | batch_bins: 1000000 |
| | valid_batch_bins: null |
| | train_shape_file: |
| | - exp/asr_stats_raw_te_bpe150_sp_ssl/train/speech_shape |
| | - exp/asr_stats_raw_te_bpe150_sp_ssl/train/text_shape.bpe |
| | valid_shape_file: |
| | - exp/asr_stats_raw_te_bpe150_sp_ssl/valid/speech_shape |
| | - exp/asr_stats_raw_te_bpe150_sp_ssl/valid/text_shape.bpe |
| | batch_type: folded |
| | valid_batch_type: null |
| | fold_length: |
| | - 80000 |
| | - 150 |
| | sort_in_batch: descending |
| | sort_batch: descending |
| | multiple_iterator: false |
| | chunk_length: 500 |
| | chunk_shift_ratio: 0.5 |
| | num_cache_chunks: 1024 |
| | train_data_path_and_name_and_type: |
| | - - dump/raw/train_te_sp/wav.scp |
| | - speech |
| | - sound |
| | - - dump/raw/train_te_sp/text |
| | - text |
| | - text |
| | valid_data_path_and_name_and_type: |
| | - - dump/raw/dev_te/wav.scp |
| | - speech |
| | - sound |
| | - - dump/raw/dev_te/text |
| | - text |
| | - text |
| | allow_variable_data_keys: false |
| | max_cache_size: 0.0 |
| | max_cache_fd: 32 |
| | valid_max_cache_size: null |
| | optim: adam |
| | optim_conf: |
| | lr: 0.0005 |
| | scheduler: warmuplr |
| | scheduler_conf: |
| | warmup_steps: 30000 |
| | token_list: |
| | - <blank> |
| | - <unk> |
| | - ా |
| | - ు |
| | - ి |
| | - ం |
| | - ే |
| | - వ |
| | - న |
| | - ల |
| | - ▁అ |
| | - క |
| | - ్ |
| | - ో |
| | - మ |
| | - ▁ |
| | - త |
| | - ర |
| | - ప |
| | - ీ |
| | - ▁మ |
| | - య |
| | - డ |
| | - ▁ప |
| | - ద |
| | - ని |
| | - గ |
| | - ▁వ |
| | - స |
| | - కు |
| | - ె |
| | - ర్ |
| | - ▁స |
| | - ▁క |
| | - ్య |
| | - న్న |
| | - ట |
| | - ▁చ |
| | - ▁త |
| | - ాల |
| | - ంట |
| | - ూ |
| | - శ |
| | - ంద |
| | - ార |
| | - ▁న |
| | - ారు |
| | - ▁ఉ |
| | - లు |
| | - ▁ఆ |
| | - ను |
| | - జ |
| | - రి |
| | - ▁ప్ర |
| | - ించ |
| | - ధ |
| | - ై |
| | - హ |
| | - ంది |
| | - ్ర |
| | - ▁ఇ |
| | - చ |
| | - రు |
| | - స్త |
| | - లో |
| | - ▁ద |
| | - డు |
| | - ▁ఎ |
| | - ▁వి |
| | - ల్ల |
| | - ణ |
| | - గా |
| | - ది |
| | - డి |
| | - న్నారు |
| | - దు |
| | - ిన |
| | - ▁ర |
| | - త్ |
| | - ొ |
| | - ▁గ |
| | - ంత |
| | - ంగా |
| | - ▁కా |
| | - బ |
| | - ▁జ |
| | - ష |
| | - ▁తెల |
| | - ులు |
| | - ▁ఏ |
| | - ట్ట |
| | - చ్చ |
| | - తి |
| | - నే |
| | - కి |
| | - ంలో |
| | - ▁అవును |
| | - ▁చెప్ప |
| | - భ |
| | - ▁ఈ |
| | - ప్ప |
| | - ▁ని |
| | - ▁రా |
| | - క్క |
| | - ▁బ |
| | - ట్ల |
| | - ▁భ |
| | - తో |
| | - ▁కూడా |
| | - ▁బా |
| | - ద్ద |
| | - ▁చేస |
| | - ▁లే |
| | - ాయి |
| | - ానికి |
| | - త్ర |
| | - ▁కొ |
| | - ఖ |
| | - ▁ఒక |
| | - ▁చాలా |
| | - క్ష |
| | - ళ |
| | - ▁చేస్త |
| | - ృ |
| | - థ |
| | - ఘ |
| | - ఫ |
| | - ఓ |
| | - ౌ |
| | - ఒ |
| | - ఐ |
| | - ఠ |
| | - ఢ |
| | - అ |
| | - ఉ |
| | - ఏ |
| | - ఈ |
| | - ౦ |
| | - ఇ |
| | - ః |
| | - ఋ |
| | - ఝ |
| | - ఔ |
| | - ఛ |
| | - ఞ |
| | - ఊ |
| | - ఎ |
| | - ఆ |
| | - ఙ |
| | - <sos/eos> |
| | init: xavier_uniform |
| | input_size: null |
| | ctc_conf: |
| | dropout_rate: 0.0 |
| | ctc_type: builtin |
| | reduce: true |
| | ignore_nan_grad: true |
| | joint_net_conf: null |
| | model_conf: |
| | ctc_weight: 0.3 |
| | lsm_weight: 0.1 |
| | length_normalized_loss: false |
| | extract_feats_in_collect_stats: false |
| | use_preprocessor: true |
| | token_type: bpe |
| | bpemodel: data/te_token_list/bpe_unigram150/bpe.model |
| | non_linguistic_symbols: null |
| | cleaner: null |
| | g2p: null |
| | speech_volume_normalize: null |
| | rir_scp: null |
| | rir_apply_prob: 1.0 |
| | noise_scp: null |
| | noise_apply_prob: 1.0 |
| | noise_db_range: '13_15' |
| | frontend: fused |
| | frontend_conf: |
| | frontends: |
| | - frontend_type: default |
| | n_fft: 512 |
| | win_length: 400 |
| | hop_length: 160 |
| | - frontend_type: s3prl |
| | frontend_conf: |
| | upstream: wav2vec2_xlsr |
| | download_dir: ./hub |
| | multilayer_feature: true |
| | align_method: linear_projection |
| | proj_dim: 200 |
| | fs: 16k |
| | specaug: specaug |
| | specaug_conf: |
| | apply_time_warp: true |
| | time_warp_window: 5 |
| | time_warp_mode: bicubic |
| | apply_freq_mask: true |
| | freq_mask_width_range: |
| | - 0 |
| | - 30 |
| | num_freq_mask: 2 |
| | apply_time_mask: true |
| | time_mask_width_range: |
| | - 0 |
| | - 40 |
| | num_time_mask: 2 |
| | normalize: utterance_mvn |
| | normalize_conf: {} |
| | preencoder: linear |
| | preencoder_conf: |
| | input_size: 400 |
| | output_size: 100 |
| | encoder: conformer |
| | encoder_conf: |
| | output_size: 256 |
| | attention_heads: 4 |
| | linear_units: 2048 |
| | num_blocks: 12 |
| | dropout_rate: 0.1 |
| | positional_dropout_rate: 0.1 |
| | attention_dropout_rate: 0.1 |
| | input_layer: conv2d |
| | normalize_before: true |
| | macaron_style: true |
| | pos_enc_layer_type: rel_pos |
| | selfattention_layer_type: rel_selfattn |
| | activation_type: swish |
| | use_cnn_module: true |
| | cnn_module_kernel: 15 |
| | postencoder: null |
| | postencoder_conf: {} |
| | decoder: transformer |
| | decoder_conf: |
| | input_layer: embed |
| | num_blocks: 6 |
| | linear_units: 2048 |
| | dropout_rate: 0.1 |
| | positional_dropout_rate: 0.1 |
| | self_attention_dropout_rate: 0.1 |
| | src_attention_dropout_rate: 0.1 |
| | required: |
| | - output_dir |
| | - token_list |
| | version: 0.10.7a1 |
| | distributed: false |
| | ``` |
| | |
| | </details> |
| |
|
| |
|
| |
|
| | ### Citing ESPnet |
| |
|
| | ```bibtex |
| | @inproceedings{watanabe2018espnet, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
| | year={2018}, |
| | booktitle={Proceedings of Interspeech}, |
| | pages={2207--2211}, |
| | doi={10.21437/Interspeech.2018-1456}, |
| | url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
| | } |
| | ``` |
| |
|
| | or arXiv: |
| |
|
| | ```bibtex |
| | @misc{watanabe2018espnet, |
| | title={ESPnet: End-to-End Speech Processing Toolkit}, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | year={2018}, |
| | eprint={1804.00015}, |
| | archivePrefix={arXiv}, |
| | primaryClass={cs.CL} |
| | } |
| | ``` |