Upload checkpoint, sanitized config, and transcripts for ctc-baseline_mms_set_4

Files changed (5) hide show

README.md +41 -0
config.yaml +343 -0
hyp.trn +0 -0
ref.trn +0 -0
valid.loss.best.pth +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+---
+title: "CTC-Baseline MMS-based ASR model - set 4"
+language: multilingual
+tags:
+  - asr
+  - ctc-dro
+  - MMS
+license: cc-by-nc-4.0
+---
+# CTC-Baseline MMS-based ASR model - set 4
+This repository contains a CTC-Baseline MMS-based automatic speech recognition (ASR) model trained with ESPnet.
+The model was trained on balanced training data from set 4.
+## Intended Use
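+This model is intended for ASR. Users can run inference using the provided checkpoint (`valid.loss.best.pth`) and configuration file (`config.yaml`). Input audio should be sampled at 16 kHz, matching the `fs: 16k` frontend setting in `config.yaml`; resample beforehand if necessary: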
+```python
+import soundfile as sf
+from espnet2.bin.asr_inference import Speech2Text
+asr_train_config = "ctc-baseline_mms_set_4/config.yaml"
+asr_model_file = "ctc-baseline_mms_set_4/valid.loss.best.pth"
+model = Speech2Text.from_pretrained(
+    asr_train_config=asr_train_config,
+    asr_model_file=asr_model_file
+)
+speech, _ = sf.read("input.wav")
+text, *_ = model(speech)[0]
+print("Recognized text:", text)
+```
+## How to Use
+1. Clone this repository.
+2. Use ESPnet’s inference scripts with the provided `config.yaml` and checkpoint file.
+3. Ensure any external resources referenced in `config.yaml` are available at the indicated relative paths.

config.yaml ADDED Viewed

	@@ -0,0 +1,343 @@

+accum_grad: 16
+adapter: lora
+adapter_conf: {}
+allow_multi_rates: false
+allow_variable_data_keys: false
+aux_ctc_tasks: []
+batch_bins: 1000000
+batch_size: 4
+batch_type: duration_language
+best_model_criterion:
+- - valid
+  - loss
+  - min
+bpemodel: null
+chunk_default_fs: null
+chunk_excluded_key_prefixes: []
+chunk_length: 500
+chunk_shift_ratio: 0.5
+cleaner: null
+collect_stats: false
+create_graph_in_tensorboard: false
+ctc_conf:
+  ctc_type: builtin
+cudnn_benchmark: false
+cudnn_deterministic: true
+cudnn_enabled: true
+decoder: null
+decoder_conf: {}
+detect_anomaly: false
+distributed: false
+drop_last_iter: false
+dry_run: false
+duration_batch_length: -1
+early_stopping_criterion:
+- valid
+- loss
+- min
+encoder: transformer
+encoder_conf:
+  attention_dropout_rate: 0.1
+  attention_heads: 8
+  dropout_rate: 0.1
+  input_layer: conv2d2
+  linear_units: 1024
+  normalize_before: true
+  num_blocks: 2
+  output_size: 256
+  positional_dropout_rate: 0.1
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+fold_length:
+- 80000
+- 150
+freeze_param: []
+frontend: s3prl
+frontend_conf:
+  download_dir: ./hub
+  frontend_conf:
+    path_or_url: facebook/mms-300m
+    upstream: hf_wav2vec2_custom
+  fs: 16k
+  multilayer_feature: true
+g2p: null
+grad_clip: 5.0
+grad_clip_type: 2.0
+grad_noise: false
+ignore_init_mismatch: false
+init: xavier_uniform
+init_param: []
+input_size: null
+iterator_type: sequence
+joint_net_conf: null
+keep_nbest_models: 2
+log_interval: null
+log_level: INFO
+max_cache_fd: 32
+max_cache_size: 0.0
+max_epoch: 40
+model: espnet
+model_conf:
+  ctc_weight: 1.0
+multiple_iterator: false
+multiprocessing_distributed: false
+nbest_averaging_interval: 0
+ngpu: 1
+no_forward_run: false
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+noise_scp: null
+non_linguistic_symbols: ./nlsyms.txt
+normalize: utterance_mvn
+normalize_conf: {}
+num_att_plot: 3
+num_cache_chunks: 1024
+num_iters_per_epoch: 140
+num_workers: 4
+optim: adam
+optim_conf:
+  lr: 0.0001
+  weight_decay: 1.0e-06
+output_dir: ./inference_results
+patience: null
+postencoder: null
+postencoder_conf: {}
+preencoder: linear
+preencoder_conf:
+  input_size: 1024
+  output_size: 80
+preprocessor: default
+preprocessor_conf: {}
+pretrain_path: null
+print_config: false
+required:
+- output_dir
+- token_list
+resume: true
+rir_apply_prob: 1.0
+rir_scp: null
+save_strategy: all
+scheduler: null
+scheduler_conf: {}
+seed: 0
+sharded_ddp: false
+short_noise_thres: 0.5
+shuffle_within_batch: false
+sort_batch: descending
+sort_in_batch: descending
+specaug: specaug
+specaug_conf:
+  apply_freq_mask: true
+  apply_time_mask: true
+  apply_time_warp: true
+  freq_mask_width_range:
+  - 0
+  - 27
+  num_freq_mask: 2
+  num_time_mask: 10
+  time_mask_width_ratio_range:
+  - 0.0
+  - 0.05
+  time_warp_mode: bicubic
+  time_warp_window: 5
+speech_volume_normalize: null
+token_list:
+- <blank>
+- <unk>
+- <space>
+- E
+- A
+- O
+- N
+- S
+- I
+- ا
+- L
+- T
+- R
+- و
+- D
+- ن
+- ر
+- ی
+- ي
+- M
+- U
+- H
+- P
+- ک
+- م
+- C
+- А
+- Ӹ
+- Н
+- B
+- ت
+- س
+- ل
+- J
+- K
+- ہ
+- Т
+- ے
+- G
+- Ш
+- К
+- Е
+- Л
+- Ы
+- V
+- М
+- ج
+- Ӓ
+- ه
+- ب
+- د
+- О
+- Y
+- '[slv]'
+- Р
+- ڪ
+- پ
+- Z
+- '[mrj]'
+- F
+- گ
+- И
+- В
+- ئ
+- Д
+- '[sot]'
+- ں
+- '[spa]'
+- W
+- Q
+- П
+- Г
+- ف
+- ق
+- С
+- ع
+- ش
+- Ж
+- ز
+- ھ
+- آ
+- Č
+- Í
+- У
+- ح
+- '[urd]'
+- Š
+- ٹ
+- چ
+- Ь
+- ٽ
+- '[snd]'
+- ڻ
+- Й
+- ط
+- ص
+- ٿ
+- Ц
+- خ
+- Ó
+- Я
+- Á
+- É
+- Ч
+- ۾
+- '0'
+- Ž
+- З
+- '1'
+- ۽
+- –
+- ڏ
+- Э
+- ڊ
+- —
+- ڈ
+- ء
+- Ñ
+- ڙ
+- ِ
+- '2'
+- ٻ
+- Х
+- Ӱ
+- ظ
+- ض
+- ث
+- ڳ
+- ،
+- X
+- ¡
+- غ
+- ڑ
+- Ӧ
+- ذ
+- ¿
+- '5'
+- ڌ
+- '3'
+- ڀ
+- ُ
+- '9'
+- Ú
+- '4'
+- '8'
+- ۔
+- '6'
+- ٺ
+- Ю
+- »
+- Б
+- «
+- ڇ
+- ً
+- ڃ
+- '7'
+- ڄ
+- ؤ
+- ڍ
+- Ф
+- َ
+- ٰ
+- ّ
+- ڱ
+- ”
+- ژ
+- ڦ
+- Ё
+- ؛
+- ٍ
+- Щ
+- ؟
+- ’
+- ‘
+- °
+- ۃ
+- إ
+- Ć
+- <sos/eos>
+token_type: char
+train_dtype: float32
+unused_parameters: true
+use_adapter: false
+use_amp: false
+use_lang_prompt: false
+use_matplotlib: true
+use_nlp_prompt: false
+use_preprocessor: true
+use_tensorboard: true
+val_scheduler_criterion:
+- valid
+- loss
+valid_batch_bins: null
+valid_batch_size: null
+valid_batch_type: null
+valid_iterator_type: null
+valid_max_cache_size: null
+version: '202402'
+write_collected_feats: false

hyp.trn ADDED Viewed

The diff for this file is too large to render. See raw diff

ref.trn ADDED Viewed

The diff for this file is too large to render. See raw diff

valid.loss.best.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d643d36c1cacfdbe4162ffe5755c790e7a195544ddd4d7b23b320475dd852c83
+size 1280866892