| | --- |
| | tags: |
| | - espnet |
| | - audio |
| | - automatic-speech-recognition |
| | language: en |
| | datasets: |
| | - tedlium2 |
| | license: cc-by-4.0 |
| | --- |
| | |
| | ## ESPnet2 ASR model |
| |
|
| | ### `pyf98/tedlium2_conformer_e15` |
| |
|
| | This model was trained by Yifan Peng using the tedlium2 recipe in [espnet](https://github.com/espnet/espnet/). |
| |
|
| | ### Demo: How to use in ESPnet2 |
| |
|
| | Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
| | if you haven't done that already. |
| |
|
| | ```bash |
| | cd espnet |
| | git checkout 8ee35df7260008e9a8a20d9a9b64773a02f706ef |
| | pip install -e . |
| | cd egs2/tedlium2/asr1 |
| | ./run.sh --skip_data_prep false --skip_train true --download_model pyf98/tedlium2_conformer_e15 |
| | ``` |
| |
|
| | <!-- Generated by scripts/utils/show_asr_result.sh --> |
| | # RESULTS |
| | ## Environments |
| | - date: `Sat Dec 17 04:27:41 CST 2022` |
| | - python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]` |
| | - espnet version: `espnet 202209` |
| | - pytorch version: `pytorch 1.12.1` |
| | - Git hash: `26f432bc859e5e40cac1a86042d498ba7baffbb0` |
| | - Commit date: `Fri Dec 9 02:16:01 2022 +0000` |
| |
|
| | ## asr_train_asr_conformer_e15_raw_en_bpe500_sp |
| | ### WER |
| |
|
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_asr_asr_model_valid.acc.ave/dev|466|14671|93.5|4.1|2.5|1.0|7.5|70.0| |
| | |decode_asr_asr_model_valid.acc.ave/test|1155|27500|93.4|4.0|2.6|1.0|7.6|64.2| |
| |
|
| | ### CER |
| |
|
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_asr_asr_model_valid.acc.ave/dev|466|78259|97.0|0.8|2.1|0.8|3.8|70.0| |
| | |decode_asr_asr_model_valid.acc.ave/test|1155|145066|97.0|0.9|2.2|0.9|4.0|64.2| |
| |
|
| | ### TER |
| |
|
| | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
| | |---|---|---|---|---|---|---|---|---| |
| | |decode_asr_asr_model_valid.acc.ave/dev|466|28296|95.0|2.8|2.2|0.8|5.9|70.0| |
| | |decode_asr_asr_model_valid.acc.ave/test|1155|52113|95.1|2.5|2.4|0.9|5.8|64.2| |
| |
|
| | ## ASR config |
| |
|
| | <details><summary>expand</summary> |
| |
|
| | ``` |
| | config: conf/tuning/train_asr_conformer_e15.yaml |
| | print_config: false |
| | log_level: INFO |
| | dry_run: false |
| | iterator_type: sequence |
| | output_dir: exp/asr_train_asr_conformer_e15_raw_en_bpe500_sp |
| | ngpu: 1 |
| | seed: 2022 |
| | num_workers: 6 |
| | num_att_plot: 3 |
| | dist_backend: nccl |
| | dist_init_method: env:// |
| | dist_world_size: 2 |
| | dist_rank: 0 |
| | local_rank: 0 |
| | dist_master_addr: localhost |
| | dist_master_port: 59747 |
| | dist_launcher: null |
| | multiprocessing_distributed: true |
| | unused_parameters: false |
| | sharded_ddp: false |
| | cudnn_enabled: true |
| | cudnn_benchmark: false |
| | cudnn_deterministic: true |
| | collect_stats: false |
| | write_collected_feats: false |
| | max_epoch: 50 |
| | patience: null |
| | val_scheduler_criterion: |
| | - valid |
| | - loss |
| | early_stopping_criterion: |
| | - valid |
| | - loss |
| | - min |
| | best_model_criterion: |
| | -   - valid |
| |     - acc |
| |     - max |
| | keep_nbest_models: 10 |
| | nbest_averaging_interval: 0 |
| | grad_clip: 5.0 |
| | grad_clip_type: 2.0 |
| | grad_noise: false |
| | accum_grad: 1 |
| | no_forward_run: false |
| | resume: true |
| | train_dtype: float32 |
| | use_amp: true |
| | log_interval: null |
| | use_matplotlib: true |
| | use_tensorboard: true |
| | create_graph_in_tensorboard: false |
| | use_wandb: false |
| | wandb_project: null |
| | wandb_id: null |
| | wandb_entity: null |
| | wandb_name: null |
| | wandb_model_log_interval: -1 |
| | detect_anomaly: false |
| | pretrain_path: null |
| | init_param: [] |
| | ignore_init_mismatch: false |
| | freeze_param: [] |
| | num_iters_per_epoch: null |
| | batch_size: 20 |
| | valid_batch_size: null |
| | batch_bins: 50000000 |
| | valid_batch_bins: null |
| | train_shape_file: |
| | - exp/asr_stats_raw_en_bpe500_sp/train/speech_shape |
| | - exp/asr_stats_raw_en_bpe500_sp/train/text_shape.bpe |
| | valid_shape_file: |
| | - exp/asr_stats_raw_en_bpe500_sp/valid/speech_shape |
| | - exp/asr_stats_raw_en_bpe500_sp/valid/text_shape.bpe |
| | batch_type: numel |
| | valid_batch_type: null |
| | fold_length: |
| | - 80000 |
| | - 150 |
| | sort_in_batch: descending |
| | sort_batch: descending |
| | multiple_iterator: false |
| | chunk_length: 500 |
| | chunk_shift_ratio: 0.5 |
| | num_cache_chunks: 1024 |
| | train_data_path_and_name_and_type: |
| | -   - dump/raw/train_sp/wav.scp |
| |     - speech |
| |     - kaldi_ark |
| | -   - dump/raw/train_sp/text |
| |     - text |
| |     - text |
| | valid_data_path_and_name_and_type: |
| | -   - dump/raw/dev/wav.scp |
| |     - speech |
| |     - kaldi_ark |
| | -   - dump/raw/dev/text |
| |     - text |
| |     - text |
| | allow_variable_data_keys: false |
| | max_cache_size: 0.0 |
| | max_cache_fd: 32 |
| | valid_max_cache_size: null |
| | optim: adam |
| | optim_conf: |
| |     lr: 0.002 |
| |     weight_decay: 1.0e-06 |
| | scheduler: warmuplr |
| | scheduler_conf: |
| |     warmup_steps: 15000 |
| | token_list: |
| | - <blank> |
| | - <unk> |
| | - s |
| | - ▁the |
| | - t |
| | - ▁a |
| | - ▁and |
| | - ▁to |
| | - d |
| | - e |
| | - ▁of |
| | - '''' |
| | - n |
| | - ing |
| | - ▁in |
| | - ▁i |
| | - ▁that |
| | - i |
| | - a |
| | - l |
| | - p |
| | - m |
| | - y |
| | - o |
| | - ▁it |
| | - ▁we |
| | - c |
| | - u |
| | - ▁you |
| | - ed |
| | - ▁ |
| | - r |
| | - ▁is |
| | - re |
| | - ▁this |
| | - ar |
| | - g |
| | - ▁so |
| | - al |
| | - b |
| | - ▁s |
| | - or |
| | - ▁f |
| | - ▁c |
| | - in |
| | - k |
| | - f |
| | - ▁for |
| | - ic |
| | - er |
| | - le |
| | - ▁be |
| | - ▁do |
| | - ▁re |
| | - ve |
| | - ▁e |
| | - ▁w |
| | - ▁was |
| | - es |
| | - ▁they |
| | - ly |
| | - h |
| | - ▁on |
| | - v |
| | - ▁are |
| | - ri |
| | - ▁have |
| | - an |
| | - ▁what |
| | - ▁with |
| | - ▁t |
| | - w |
| | - ur |
| | - it |
| | - ent |
| | - ▁can |
| | - ▁he |
| | - ▁but |
| | - ra |
| | - ce |
| | - ▁me |
| | - ▁b |
| | - ▁ma |
| | - ▁p |
| | - ll |
| | - ▁st |
| | - ▁one |
| | - 'on' |
| | - ▁about |
| | - th |
| | - ▁de |
| | - en |
| | - ▁all |
| | - ▁not |
| | - il |
| | - ▁g |
| | - ch |
| | - at |
| | - ▁there |
| | - ▁mo |
| | - ter |
| | - ation |
| | - tion |
| | - ▁at |
| | - ▁my |
| | - ro |
| | - ▁as |
| | - te |
| | - ▁le |
| | - ▁con |
| | - ▁like |
| | - ▁people |
| | - ▁or |
| | - ▁an |
| | - el |
| | - ▁if |
| | - ▁from |
| | - ver |
| | - ▁su |
| | - ▁co |
| | - ate |
| | - ▁these |
| | - ol |
| | - ci |
| | - ▁now |
| | - ▁see |
| | - ▁out |
| | - ▁our |
| | - ion |
| | - ▁know |
| | - ect |
| | - ▁just |
| | - as |
| | - ▁ex |
| | - ▁ch |
| | - ▁d |
| | - ▁when |
| | - ▁very |
| | - ▁think |
| | - ▁who |
| | - ▁because |
| | - ▁go |
| | - ▁up |
| | - ▁us |
| | - ▁pa |
| | - ▁no |
| | - ies |
| | - ▁di |
| | - ▁ho |
| | - om |
| | - ive |
| | - ▁get |
| | - id |
| | - ▁o |
| | - ▁hi |
| | - un |
| | - ▁how |
| | - ▁by |
| | - ir |
| | - et |
| | - ck |
| | - ity |
| | - ▁po |
| | - ul |
| | - ▁which |
| | - ▁mi |
| | - ▁some |
| | - z |
| | - ▁sp |
| | - ▁un |
| | - ▁going |
| | - ▁pro |
| | - ist |
| | - ▁se |
| | - ▁look |
| | - ▁time |
| | - ment |
| | - de |
| | - ▁more |
| | - ▁had |
| | - ng |
| | - ▁would |
| | - ge |
| | - la |
| | - ▁here |
| | - ▁really |
| | - x |
| | - ▁your |
| | - ▁them |
| | - us |
| | - me |
| | - ▁en |
| | - ▁two |
| | - ▁k |
| | - ▁li |
| | - ▁world |
| | - ne |
| | - ow |
| | - ▁way |
| | - ▁want |
| | - ▁work |
| | - ▁don |
| | - ▁lo |
| | - ▁fa |
| | - ▁were |
| | - ▁their |
| | - age |
| | - vi |
| | - ▁ha |
| | - ac |
| | - der |
| | - est |
| | - ▁bo |
| | - am |
| | - ▁other |
| | - able |
| | - ▁actually |
| | - ▁sh |
| | - ▁make |
| | - ▁ba |
| | - ▁la |
| | - ine |
| | - ▁into |
| | - ▁where |
| | - ▁could |
| | - ▁comp |
| | - ting |
| | - ▁has |
| | - ▁will |
| | - ▁ne |
| | - j |
| | - ical |
| | - ally |
| | - ▁vi |
| | - ▁things |
| | - ▁te |
| | - igh |
| | - ▁say |
| | - ▁years |
| | - ers |
| | - ▁ra |
| | - ther |
| | - ▁than |
| | - ru |
| | - ▁ro |
| | - op |
| | - ▁did |
| | - ▁any |
| | - ▁new |
| | - ound |
| | - ig |
| | - ▁well |
| | - mo |
| | - ▁she |
| | - ▁na |
| | - ▁been |
| | - he |
| | - ▁thousand |
| | - ▁car |
| | - ▁take |
| | - ▁right |
| | - ▁then |
| | - ▁need |
| | - ▁start |
| | - ▁hundred |
| | - ▁something |
| | - ▁over |
| | - ▁com |
| | - ia |
| | - ▁kind |
| | - um |
| | - if |
| | - ▁those |
| | - ▁first |
| | - ▁pre |
| | - ta |
| | - ▁said |
| | - ize |
| | - end |
| | - ▁even |
| | - ▁thing |
| | - one |
| | - ▁back |
| | - ite |
| | - ▁every |
| | - ▁little |
| | - ry |
| | - ▁life |
| | - ▁much |
| | - ke |
| | - ▁also |
| | - ▁most |
| | - ant |
| | - per |
| | - ▁three |
| | - ▁come |
| | - ▁lot |
| | - ance |
| | - ▁got |
| | - ▁talk |
| | - ▁per |
| | - ▁inter |
| | - ▁sa |
| | - ▁use |
| | - ▁mu |
| | - ▁part |
| | - ish |
| | - ence |
| | - ▁happen |
| | - ▁bi |
| | - ▁mean |
| | - ough |
| | - ▁qu |
| | - ▁bu |
| | - ▁day |
| | - ▁ga |
| | - ▁only |
| | - ▁many |
| | - ▁different |
| | - ▁dr |
| | - ▁th |
| | - ▁show |
| | - ful |
| | - ▁down |
| | - ated |
| | - ▁good |
| | - ▁tra |
| | - ▁around |
| | - ▁idea |
| | - ▁human |
| | - ous |
| | - ▁put |
| | - ▁through |
| | - ▁five |
| | - ▁why |
| | - ▁change |
| | - ▁real |
| | - ff |
| | - ible |
| | - ▁fact |
| | - ▁same |
| | - ▁jo |
| | - ▁live |
| | - ▁year |
| | - ▁problem |
| | - ▁ph |
| | - ▁four |
| | - ▁give |
| | - ▁big |
| | - ▁tell |
| | - ▁great |
| | - ▁try |
| | - ▁va |
| | - ▁ru |
| | - ▁system |
| | - ▁six |
| | - ▁plan |
| | - ▁place |
| | - ▁build |
| | - ▁called |
| | - ▁again |
| | - ▁point |
| | - ▁twenty |
| | - ▁percent |
| | - ▁nine |
| | - ▁find |
| | - ▁app |
| | - ▁after |
| | - ▁long |
| | - ▁eight |
| | - ▁imp |
| | - ▁gene |
| | - ▁design |
| | - ▁today |
| | - ▁should |
| | - ▁made |
| | - ious |
| | - ▁came |
| | - ▁learn |
| | - ▁last |
| | - ▁own |
| | - way |
| | - ▁turn |
| | - ▁seven |
| | - ▁high |
| | - ▁question |
| | - ▁person |
| | - ▁brain |
| | - ▁important |
| | - ▁another |
| | - ▁thought |
| | - ▁trans |
| | - ▁create |
| | - ness |
| | - ▁hu |
| | - ▁power |
| | - ▁act |
| | - land |
| | - ▁play |
| | - ▁sort |
| | - ▁old |
| | - ▁before |
| | - ▁course |
| | - ▁understand |
| | - ▁feel |
| | - ▁might |
| | - ▁each |
| | - ▁million |
| | - ▁better |
| | - ▁together |
| | - ▁ago |
| | - ▁example |
| | - ▁help |
| | - ▁story |
| | - ▁next |
| | - ▁hand |
| | - ▁school |
| | - ▁water |
| | - ▁develop |
| | - ▁technology |
| | - que |
| | - ▁second |
| | - ▁grow |
| | - ▁still |
| | - ▁cell |
| | - ▁believe |
| | - ▁number |
| | - ▁small |
| | - ▁between |
| | - qui |
| | - ▁data |
| | - ▁become |
| | - ▁america |
| | - ▁maybe |
| | - ▁space |
| | - ▁project |
| | - ▁organ |
| | - ▁vo |
| | - ▁children |
| | - ▁book |
| | - graph |
| | - ▁open |
| | - ▁fifty |
| | - ▁picture |
| | - ▁health |
| | - ▁thirty |
| | - ▁africa |
| | - ▁reason |
| | - ▁large |
| | - ▁hard |
| | - ▁computer |
| | - ▁always |
| | - ▁sense |
| | - ▁money |
| | - ▁women |
| | - ▁everything |
| | - ▁information |
| | - ▁country |
| | - ▁teach |
| | - ▁energy |
| | - ▁experience |
| | - ▁food |
| | - ▁process |
| | - qua |
| | - ▁interesting |
| | - ▁future |
| | - ▁science |
| | - q |
| | - '0' |
| | - '5' |
| | - '6' |
| | - '9' |
| | - '3' |
| | - '8' |
| | - '4' |
| | - N |
| | - A |
| | - '7' |
| | - S |
| | - G |
| | - F |
| | - R |
| | - L |
| | - U |
| | - E |
| | - T |
| | - H |
| | - _ |
| | - B |
| | - D |
| | - J |
| | - M |
| | - ă |
| | - ō |
| | - ť |
| | - '2' |
| | - '-' |
| | - '1' |
| | - C |
| | - <sos/eos> |
| | init: null |
| | input_size: null |
| | ctc_conf: |
| |     dropout_rate: 0.0 |
| |     ctc_type: builtin |
| |     reduce: true |
| |     ignore_nan_grad: null |
| |     zero_infinity: true |
| | joint_net_conf: null |
| | use_preprocessor: true |
| | token_type: bpe |
| | bpemodel: data/en_token_list/bpe_unigram500/bpe.model |
| | non_linguistic_symbols: null |
| | cleaner: null |
| | g2p: null |
| | speech_volume_normalize: null |
| | rir_scp: null |
| | rir_apply_prob: 1.0 |
| | noise_scp: null |
| | noise_apply_prob: 1.0 |
| | noise_db_range: '13_15' |
| | short_noise_thres: 0.5 |
| | frontend: default |
| | frontend_conf: |
| |     n_fft: 512 |
| |     win_length: 400 |
| |     hop_length: 160 |
| |     fs: 16k |
| | specaug: specaug |
| | specaug_conf: |
| |     apply_time_warp: true |
| |     time_warp_window: 5 |
| |     time_warp_mode: bicubic |
| |     apply_freq_mask: true |
| |     freq_mask_width_range: |
| |     - 0 |
| |     - 27 |
| |     num_freq_mask: 2 |
| |     apply_time_mask: true |
| |     time_mask_width_ratio_range: |
| |     - 0.0 |
| |     - 0.05 |
| |     num_time_mask: 5 |
| | normalize: global_mvn |
| | normalize_conf: |
| |     stats_file: exp/asr_stats_raw_en_bpe500_sp/train/feats_stats.npz |
| | model: espnet |
| | model_conf: |
| |     ctc_weight: 0.3 |
| |     lsm_weight: 0.1 |
| |     length_normalized_loss: false |
| | preencoder: null |
| | preencoder_conf: {} |
| | encoder: conformer |
| | encoder_conf: |
| |     output_size: 256 |
| |     attention_heads: 4 |
| |     linear_units: 1024 |
| |     num_blocks: 15 |
| |     dropout_rate: 0.1 |
| |     positional_dropout_rate: 0.1 |
| |     attention_dropout_rate: 0.1 |
| |     input_layer: conv2d |
| |     normalize_before: true |
| |     macaron_style: true |
| |     rel_pos_type: latest |
| |     pos_enc_layer_type: rel_pos |
| |     selfattention_layer_type: rel_selfattn |
| |     activation_type: swish |
| |     use_cnn_module: true |
| |     cnn_module_kernel: 31 |
| | postencoder: null |
| | postencoder_conf: {} |
| | decoder: transformer |
| | decoder_conf: |
| |     attention_heads: 4 |
| |     linear_units: 2048 |
| |     num_blocks: 6 |
| |     dropout_rate: 0.1 |
| |     positional_dropout_rate: 0.1 |
| |     self_attention_dropout_rate: 0.1 |
| |     src_attention_dropout_rate: 0.1 |
| | preprocessor: default |
| | preprocessor_conf: {} |
| | required: |
| | - output_dir |
| | - token_list |
| | version: '202209' |
| | distributed: true |
| | ``` |
| |
|
| | </details> |
| |
|
| |
|
| |
|
| | ### Citing ESPnet |
| |
|
| | ```bibtex |
| | @inproceedings{watanabe2018espnet, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
| | year={2018}, |
| | booktitle={Proceedings of Interspeech}, |
| | pages={2207--2211}, |
| | doi={10.21437/Interspeech.2018-1456}, |
| | url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
| | } |
| | |
| | |
| | |
| | |
| | ``` |
| |
|
| | or arXiv: |
| |
|
| | ```bibtex |
| | @misc{watanabe2018espnet, |
| | title={ESPnet: End-to-End Speech Processing Toolkit}, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | year={2018}, |
| | eprint={1804.00015}, |
| | archivePrefix={arXiv}, |
| | primaryClass={cs.CL} |
| | } |
| | ``` |
| |
|