niobures committed (verified)
Commit 766cd56 · Parent(s): 7107d58

wav2vec2 (ar, de, es, fr, fa, he, ro, ru, tr, uk, multi)

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full set.

Files changed (50):
  1. .gitattributes +7 -0
  2. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/.gitattributes +17 -0
  3. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/README.md +206 -0
  4. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/config.json +69 -0
  5. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/flax_model.msgpack +3 -0
  6. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/preprocessor_config.json +8 -0
  7. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/pytorch_model.bin +3 -0
  8. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/source.txt +1 -0
  9. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/special_tokens_map.json +1 -0
  10. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/tokenizer_config.json +1 -0
  11. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/validation_wer.png +3 -0
  12. ar/wav2vec2-large-xlsr-53-arabic (elgeish)/vocab.json +1 -0
  13. ar/wav2vec2-large-xlsr-53-arabic/.gitattributes +17 -0
  14. ar/wav2vec2-large-xlsr-53-arabic/README.md +200 -0
  15. ar/wav2vec2-large-xlsr-53-arabic/config.json +76 -0
  16. ar/wav2vec2-large-xlsr-53-arabic/flax_model.msgpack +3 -0
  17. ar/wav2vec2-large-xlsr-53-arabic/preprocessor_config.json +8 -0
  18. ar/wav2vec2-large-xlsr-53-arabic/pytorch_model.bin +3 -0
  19. ar/wav2vec2-large-xlsr-53-arabic/source.txt +1 -0
  20. ar/wav2vec2-large-xlsr-53-arabic/special_tokens_map.json +1 -0
  21. ar/wav2vec2-large-xlsr-53-arabic/vocab.json +1 -0
  22. de/wav2vec2-base-10k-voxpopuli-ft-de/.gitattributes +17 -0
  23. de/wav2vec2-base-10k-voxpopuli-ft-de/README.md +69 -0
  24. de/wav2vec2-base-10k-voxpopuli-ft-de/config.json +68 -0
  25. de/wav2vec2-base-10k-voxpopuli-ft-de/preprocessor_config.json +9 -0
  26. de/wav2vec2-base-10k-voxpopuli-ft-de/pytorch_model.bin +3 -0
  27. de/wav2vec2-base-10k-voxpopuli-ft-de/source.txt +1 -0
  28. de/wav2vec2-base-10k-voxpopuli-ft-de/special_tokens_map.json +1 -0
  29. de/wav2vec2-base-10k-voxpopuli-ft-de/tokenizer_config.json +1 -0
  30. de/wav2vec2-base-10k-voxpopuli-ft-de/vocab.json +1 -0
  31. es/wav2vec2-large-es-voxpopuli/.gitattributes +17 -0
  32. es/wav2vec2-large-es-voxpopuli/README.md +24 -0
  33. es/wav2vec2-large-es-voxpopuli/config.json +83 -0
  34. es/wav2vec2-large-es-voxpopuli/flax_model.msgpack +3 -0
  35. es/wav2vec2-large-es-voxpopuli/preprocessor_config.json +9 -0
  36. es/wav2vec2-large-es-voxpopuli/pytorch_model.bin +3 -0
  37. es/wav2vec2-large-es-voxpopuli/source.txt +1 -0
  38. fa/wav2vec2-large-xlsr-53-persian/.gitattributes +17 -0
  39. fa/wav2vec2-large-xlsr-53-persian/README.md +195 -0
  40. fa/wav2vec2-large-xlsr-53-persian/config.json +76 -0
  41. fa/wav2vec2-large-xlsr-53-persian/flax_model.msgpack +3 -0
  42. fa/wav2vec2-large-xlsr-53-persian/issues.txt +9 -0
  43. fa/wav2vec2-large-xlsr-53-persian/preprocessor_config.json +8 -0
  44. fa/wav2vec2-large-xlsr-53-persian/pytorch_model.bin +3 -0
  45. fa/wav2vec2-large-xlsr-53-persian/source.txt +1 -0
  46. fa/wav2vec2-large-xlsr-53-persian/special_tokens_map.json +1 -0
  47. fa/wav2vec2-large-xlsr-53-persian/vocab.json +1 -0
  48. fa/wav2vec2-large-xlsr-persian-v3/.gitattributes +17 -0
  49. fa/wav2vec2-large-xlsr-persian-v3/README.md +236 -0
  50. fa/wav2vec2-large-xlsr-persian-v3/config.json +76 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ar/wav2vec2-large-xlsr-53-arabic[[:space:]](elgeish)/validation_wer.png filter=lfs diff=lfs merge=lfs -text
+ fa/wav2vec2-large-xlsr-persian-v3/sample2978.flac filter=lfs diff=lfs merge=lfs -text
+ he/wav2vec2-xls-r-300m-lm-hebrew/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
+ multi/wav2vec2-xlsr-53-espeak-cv-ft/img[[:space:]](issue[[:space:]]10).jpeg filter=lfs diff=lfs merge=lfs -text
+ ru/wav2vec2-large-xlsr-53-russian/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
+ ru/wav2vec2-large-xlsr-53-russian/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-xls-r-uk/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/.gitattributes ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/README.md ADDED
@@ -0,0 +1,206 @@
---
language: ar
datasets:
- arabic_speech_corpus
- mozilla-foundation/common_voice_6_1
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
- hf-asr-leaderboard
license: apache-2.0
model-index:
- name: elgeish-wav2vec2-large-xlsr-53-arabic
  results:
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice 6.1 (Arabic)
      type: mozilla-foundation/common_voice_6_1
      config: ar
      split: test
      args:
        language: ar
    metrics:
    - name: Test WER
      type: wer
      value: 26.55
    - name: Validation WER
      type: wer
      value: 23.39
---

# Wav2Vec2-Large-XLSR-53-Arabic

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on Arabic using the `train` splits of [Common Voice](https://huggingface.co/datasets/common_voice)
and [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus).
When using this model, make sure that your speech input is sampled at 16 kHz.

## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

dataset = load_dataset("common_voice", "ar", split="test[:10]")
resamplers = {  # all three sampling rates exist in the test split
    48000: torchaudio.transforms.Resample(48000, 16000),
    44100: torchaudio.transforms.Resample(44100, 16000),
    32000: torchaudio.transforms.Resample(32000, 16000),
}

def prepare_example(example):
    speech, sampling_rate = torchaudio.load(example["path"])
    example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
    return example

dataset = dataset.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic").eval()

def predict(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        predicted = torch.argmax(model(inputs.input_values).logits, dim=-1)
    predicted[predicted == -100] = processor.tokenizer.pad_token_id  # see fine-tuning script
    batch["predicted"] = processor.tokenizer.batch_decode(predicted)
    return batch

dataset = dataset.map(predict, batched=True, batch_size=1, remove_columns=["speech"])

for reference, predicted in zip(dataset["sentence"], dataset["predicted"]):
    print("reference:", reference)
    print("predicted:", buckwalter.untrans(predicted))
    print("--")
```

Here's the output:

```
reference: ألديك قلم ؟
predicted: هلديك قالر
--
reference: ليست هناك مسافة على هذه الأرض أبعد من يوم أمس.
predicted: ليست نالك مسافة على هذه الأرض أبعد من يوم أمس
--
reference: إنك تكبر المشكلة.
predicted: إنك تكبر المشكلة
--
reference: يرغب أن يلتقي بك.
predicted: يرغب أن يلتقي بك
--
reference: إنهم لا يعرفون لماذا حتى.
predicted: إنهم لا يعرفون لماذا حتى
--
reference: سيسعدني مساعدتك أي وقت تحب.
predicted: سيسئدني مساعد سكرأي وقت تحب
--
reference: أَحَبُّ نظريّة علمية إليّ هي أن حلقات زحل مكونة بالكامل من الأمتعة المفقودة.
predicted: أحب ناضريةً علمية إلي هي أنحل قتزح المكونا بالكامل من الأمت عن المفقودة
--
reference: سأشتري له قلماً.
predicted: سأشتري له قلما
--
reference: أين المشكلة ؟
predicted: أين المشكل
--
reference: وَلِلَّهِ يَسْجُدُ مَا فِي السَّمَاوَاتِ وَمَا فِي الْأَرْضِ مِنْ دَابَّةٍ وَالْمَلَائِكَةُ وَهُمْ لَا يَسْتَكْبِرُونَ
predicted: ولله يسجد ما في السماوات وما في الأرض من دابة والملائكة وهم لا يستكبرون
--
```

## Evaluation

The model can be evaluated as follows on the Arabic test data of Common Voice:

```python
import jiwer
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import set_seed, Wav2Vec2ForCTC, Wav2Vec2Processor

set_seed(42)
test_split = load_dataset("common_voice", "ar", split="test")
resamplers = {  # all three sampling rates exist in the test split
    48000: torchaudio.transforms.Resample(48000, 16000),
    44100: torchaudio.transforms.Resample(44100, 16000),
    32000: torchaudio.transforms.Resample(32000, 16000),
}

def prepare_example(example):
    speech, sampling_rate = torchaudio.load(example["path"])
    example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
    return example

test_split = test_split.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic").to("cuda").eval()

def predict(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        predicted = torch.argmax(model(inputs.input_values.to("cuda")).logits, dim=-1)
    predicted[predicted == -100] = processor.tokenizer.pad_token_id  # see fine-tuning script
    batch["predicted"] = processor.batch_decode(predicted)
    return batch

test_split = test_split.map(predict, batched=True, batch_size=16, remove_columns=["speech"])
transformation = jiwer.Compose([
    # normalize some diacritics, remove punctuation, and replace Persian letters with Arabic ones
    jiwer.SubstituteRegexes({
        r'[auiFNKo\~_،؟»\?;:\-,\.؛«!"]': "", "\u06D6": "",
        r"[\|\{]": "A", "p": "h", "ک": "k", "ی": "y"}),
    # default transformation below
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])
metrics = jiwer.compute_measures(
    truth=[buckwalter.trans(s) for s in test_split["sentence"]],  # Buckwalter transliteration
    hypothesis=test_split["predicted"],
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(f"WER: {metrics['wer']:.2%}")
```

**Test Result**: 26.55%

## Training

For more details, see [Fine-Tuning with Arabic Speech Corpus](https://github.com/huggingface/transformers/tree/1c06240e1b3477728129bb58e7b6c7734bb5074e/examples/research_projects/wav2vec2#fine-tuning-with-arabic-speech-corpus).

This model represents Arabic in a format called [Buckwalter transliteration](https://en.wikipedia.org/wiki/Buckwalter_transliteration).
The Buckwalter format only includes ASCII characters, some of which are non-alpha (e.g., `">"` maps to `"أ"`).
The [lang-trans](https://github.com/kariminf/lang-trans) package is used to convert (transliterate) the Arabic abjad.
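As a quick illustration of the round trip (a minimal sketch of my own, assuming `lang-trans` is installed; the example string is arbitrary):

```python
# Sketch: Buckwalter round trip with lang-trans (not part of the original card).
from lang_trans.arabic import buckwalter

sentence = "أين المشكلة"
ascii_form = buckwalter.trans(sentence)    # Arabic script -> Buckwalter ASCII
restored = buckwalter.untrans(ascii_form)  # Buckwalter ASCII -> Arabic script
print(ascii_form)            # e.g. ">yn Alm$klp"
print(restored == sentence)  # True
```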
[This script](https://github.com/huggingface/transformers/blob/1c06240e1b3477728129bb58e7b6c7734bb5074e/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh)
was used to first fine-tune [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on the `train` split of the [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus) dataset;
the `test` split was used for model selection; the resulting model at this point is saved as [elgeish/wav2vec2-large-xlsr-53-levantine-arabic](https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-levantine-arabic).

Training was then resumed using the `train` split of the [Common Voice](https://huggingface.co/datasets/common_voice) dataset;
the `validation` split was used for model selection;
training was stopped to meet the deadline of [Fine-Tune-XLSR Week](https://github.com/huggingface/transformers/blob/700229f8a4003c4f71f29275e0874b5ba58cd39d/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md):
this model is the checkpoint at 100k steps, with a validation WER of **23.39%**.

<img src="https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic/raw/main/validation_wer.png" alt="Validation WER" width="100%" />

It's worth noting that validation WER is still trending down, which indicates the potential of further training (resuming the decaying learning rate at 7e-6).

## Future Work

One area to explore is passing `attention_mask` in the model input, as recommended [here](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2); a sketch follows below.
Another is data augmentation using the datasets used to train the models listed [here](https://paperswithcode.com/sota/speech-recognition-on-common-voice-arabic).
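A minimal sketch of that `attention_mask` change, adapting the `predict` function from the usage snippet above (my untested adaptation, not the author's recipe):

```python
# Hypothetical variant of predict() that also passes attention_mask,
# as the XLSR fine-tuning blog recommends. Untested sketch.
def predict_with_mask(batch):
    inputs = processor(
        batch["speech"],
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,  # this checkpoint's default is False
    )
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    predicted = torch.argmax(logits, dim=-1)
    predicted[predicted == -100] = processor.tokenizer.pad_token_id
    batch["predicted"] = processor.tokenizer.batch_decode(predicted)
    return batch
```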
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/config.json ADDED
@@ -0,0 +1,69 @@
{
  "_name_or_path": "elgeish/wav2vec2-large-xlsr-53-arabic",
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "final_dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.4.0.dev0",
  "vocab_size": 56
}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:50f263c94e388555ccc40d13e8086de5222f7ca3a4aa20c4cad8232076e92fd4
size 1261999872
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
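For reference, these fields are what `Wav2Vec2Processor.from_pretrained` hands to the feature extractor; a minimal sketch of my own (not part of the repo files):

```python
# Sketch: the preprocessor_config.json fields above surface as
# feature-extractor attributes after loading.
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
print(fe.sampling_rate)          # 16000
print(fe.return_attention_mask)  # False for this checkpoint
```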
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fe4a13489e50820edbd709e85699dd87beeff5387e8c40d16860b10efb964547
size 1262163415
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/source.txt ADDED
@@ -0,0 +1 @@
https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "/", "return_attention_mask": false, "do_normalize": true, "special_tokens_map_file": "special_tokens_map.json", "name_or_path": "elgeish/wav2vec2-large-xlsr-53-arabic"}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/validation_wer.png ADDED

Git LFS Details

  • SHA256: 0fbdcedb6b204cc77c6728b76d7fa1e7f5c8b75748fe06df6bb03933f46fb5c4
  • Pointer size: 131 Bytes
  • Size of remote file: 109 kB
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/vocab.json ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "/": 4, "'": 5, "|": 6, ">": 7, "&": 8, "<": 9, "}": 10, "A": 11, "b": 12, "p": 13, "t": 14, "v": 15, "j": 16, "H": 17, "x": 18, "d": 19, "*": 20, "r": 21, "z": 22, "s": 23, "$": 24, "S": 25, "D": 26, "T": 27, "Z": 28, "E": 29, "g": 30, "_": 31, "f": 32, "q": 33, "k": 34, "l": 35, "m": 36, "n": 37, "h": 38, "w": 39, "Y": 40, "y": 41, "F": 42, "N": 43, "K": 44, "a": 45, "u": 46, "i": 47, "~": 48, "o": 49, "`": 50, "{": 51, "P": 52, "J": 53, "V": 54, "G": 55}
ar/wav2vec2-large-xlsr-53-arabic/.gitattributes ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic/README.md ADDED
@@ -0,0 +1,200 @@
---
language: ar
datasets:
- common_voice
- arabic_speech_corpus
metrics:
- wer
- cer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Arabic by Jonatas Grosman
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice ar
      type: common_voice
      args: ar
    metrics:
    - name: Test WER
      type: wer
      value: 39.59
    - name: Test CER
      type: cer
      value: 18.18
---

# Fine-tuned XLSR-53 large model for speech recognition in Arabic

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Arabic using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice) and [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus).
When using this model, make sure that your speech input is sampled at 16 kHz.

This model has been fine-tuned thanks to the GPU credits generously given by [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)

The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint

## Usage

The model can be used directly (without a language model) as follows...

Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:

```python
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

transcriptions = model.transcribe(audio_paths)
```

Writing your own inference script:

```python
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "ar"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
SAMPLES = 10

test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)
```

| Reference | Prediction |
| ------------- | ------------- |
| ألديك قلم ؟ | ألديك قلم |
| ليست هناك مسافة على هذه الأرض أبعد من يوم أمس. | ليست نالك مسافة على هذه الأرض أبعد من يوم الأمس م |
| إنك تكبر المشكلة. | إنك تكبر المشكلة |
| يرغب أن يلتقي بك. | يرغب أن يلتقي بك |
| إنهم لا يعرفون لماذا حتى. | إنهم لا يعرفون لماذا حتى |
| سيسعدني مساعدتك أي وقت تحب. | سيسئدنيمساعدتك أي وقد تحب |
| أَحَبُّ نظريّة علمية إليّ هي أن حلقات زحل مكونة بالكامل من الأمتعة المفقودة. | أحب نظرية علمية إلي هي أن حل قتزح المكوينا بالكامل من الأمت عن المفقودة |
| سأشتري له قلماً. | سأشتري له قلما |
| أين المشكلة ؟ | أين المشكل |
| وَلِلَّهِ يَسْجُدُ مَا فِي السَّمَاوَاتِ وَمَا فِي الْأَرْضِ مِنْ دَابَّةٍ وَالْمَلَائِكَةُ وَهُمْ لَا يَسْتَكْبِرُونَ | ولله يسجد ما في السماوات وما في الأرض من دابة والملائكة وهم لا يستكبرون |

## Evaluation

The model can be evaluated as follows on the Arabic test data of Common Voice.

```python
import torch
import re
import warnings
import librosa
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "ar"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
DEVICE = "cuda"

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]

test_dataset = load_dataset("common_voice", LANG_ID, split="test")

wer = load_metric("wer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
cer = load_metric("cer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to(DEVICE)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference over the whole test set.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

predictions = [x.upper() for x in result["pred_strings"]]
references = [x.upper() for x in result["sentence"]]

print(f"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
```

**Test Result**:

In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-05-14). Note that the table below may show results that differ from those already reported; this may be due to specifics of the other evaluation scripts used.

| Model | WER | CER |
| ------------- | ------------- | ------------- |
| jonatasgrosman/wav2vec2-large-xlsr-53-arabic | **39.59%** | **18.18%** |
| bakrianoo/sinai-voice-ar-stt | 45.30% | 21.84% |
| othrif/wav2vec2-large-xlsr-arabic | 45.93% | 20.51% |
| kmfoda/wav2vec2-large-xlsr-arabic | 54.14% | 26.07% |
| mohammed/wav2vec2-large-xlsr-arabic | 56.11% | 26.79% |
| anas/wav2vec2-large-xlsr-arabic | 62.02% | 27.09% |
| elgeish/wav2vec2-large-xlsr-53-arabic | 100.00% | 100.56% |

## Citation
If you want to cite this model you can use this:

```bibtex
@misc{grosman2021xlsr53-large-arabic,
  title={Fine-tuned {XLSR}-53 large model for speech recognition in {A}rabic},
  author={Grosman, Jonatas},
  howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-arabic}},
  year={2021}
}
```
ar/wav2vec2-large-xlsr-53-arabic/config.json ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.05,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.05,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.5.0.dev0",
  "vocab_size": 51
}
ar/wav2vec2-large-xlsr-53-arabic/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b44a67c277854fbcd96179ee8bfedb9f03f3826efc2af35f8eb9b964fd0df2b1
size 1261979372
ar/wav2vec2-large-xlsr-53-arabic/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
ar/wav2vec2-large-xlsr-53-arabic/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0b26f6d9d3edfde1784aef863c192a8cc1e438a23b45910ab648531ebe1857b
size 1262142936
ar/wav2vec2-large-xlsr-53-arabic/source.txt ADDED
@@ -0,0 +1 @@
https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-arabic
ar/wav2vec2-large-xlsr-53-arabic/special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
ar/wav2vec2-large-xlsr-53-arabic/vocab.json ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "-": 5, "ء": 6, "آ": 7, "أ": 8, "ؤ": 9, "إ": 10, "ئ": 11, "ا": 12, "ب": 13, "ة": 14, "ت": 15, "ث": 16, "ج": 17, "ح": 18, "خ": 19, "د": 20, "ذ": 21, "ر": 22, "ز": 23, "س": 24, "ش": 25, "ص": 26, "ض": 27, "ط": 28, "ظ": 29, "ع": 30, "غ": 31, "ـ": 32, "ف": 33, "ق": 34, "ك": 35, "ل": 36, "م": 37, "ن": 38, "ه": 39, "و": 40, "ى": 41, "ي": 42, "ً": 43, "ٌ": 44, "ٍ": 45, "َ": 46, "ُ": 47, "ِ": 48, "ّ": 49, "ْ": 50}
de/wav2vec2-base-10k-voxpopuli-ft-de/.gitattributes ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
de/wav2vec2-base-10k-voxpopuli-ft-de/README.md ADDED
@@ -0,0 +1,69 @@
---
language: de
tags:
- audio
- automatic-speech-recognition
- voxpopuli
license: cc-by-nc-4.0
---

# Wav2Vec2-Base-VoxPopuli-Finetuned

[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model pretrained on the 10K-hour unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed German (de) data (refer to Table 1 of the paper for more information).

**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*

**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*

See the [official website](https://github.com/facebookresearch/voxpopuli/) for more information.

# Usage for inference

The following shows how the model can be used for inference on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets):

```python
#!/usr/bin/env python3
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torchaudio
import torch

# load model & processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-de")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-de")

# load dataset
ds = load_dataset("common_voice", "de", split="validation[:1%]")

# Common Voice audio does not match the target sampling rate
common_voice_sample_rate = 48000
target_sample_rate = 16000

resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate)


# define mapping fn to read in the sound file and resample it
def map_to_array(batch):
    speech, _ = torchaudio.load(batch["path"])
    speech = resampler(speech)
    batch["speech"] = speech[0]
    return batch


# load all audio files
ds = ds.map(map_to_array)

# run inference on the first 5 data samples
inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True)

# inference
logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)

print(processor.batch_decode(predicted_ids))
```
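To sanity-check the transcriptions above, one could score them against the Common Voice references; a minimal sketch of my own, assuming the `jiwer` package is installed (the references keep their punctuation here, so the number is only indicative):

```python
# Sketch (not from the model card): word error rate of the 5 decoded samples.
import jiwer

references = [s.lower() for s in ds[:5]["sentence"]]
hypotheses = processor.batch_decode(predicted_ids)  # already lowercase for this vocab
print("WER:", jiwer.wer(references, hypotheses))
```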
de/wav2vec2-base-10k-voxpopuli-ft-de/config.json ADDED
@@ -0,0 +1,68 @@
{
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "conv_bias": false,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "final_dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_attention_heads": 12,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "transformers_version": "4.6.0.dev0",
  "vocab_size": 36
}
de/wav2vec2-base-10k-voxpopuli-ft-de/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
de/wav2vec2-base-10k-voxpopuli-ft-de/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6744100ba037593b22fac95738074e7cf6c1d9c94f0bfa0e76c3cf863b25741f
size 377684844
de/wav2vec2-base-10k-voxpopuli-ft-de/source.txt ADDED
@@ -0,0 +1 @@
https://huggingface.co/facebook/wav2vec2-base-10k-voxpopuli-ft-de
de/wav2vec2-base-10k-voxpopuli-ft-de/special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
de/wav2vec2-base-10k-voxpopuli-ft-de/tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
de/wav2vec2-base-10k-voxpopuli-ft-de/vocab.json ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "e": 5, "n": 6, "i": 7, "r": 8, "s": 9, "t": 10, "a": 11, "d": 12, "h": 13, "u": 14, "l": 15, "g": 16, "c": 17, "m": 18, "o": 19, "b": 20, "w": 21, "f": 22, "k": 23, "z": 24, "p": 25, "v": 26, "ü": 27, "ä": 28, "ö": 29, "j": 30, "ß": 31, "y": 32, "x": 33, "q": 34, "1": 35}
es/wav2vec2-large-es-voxpopuli/.gitattributes ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
es/wav2vec2-large-es-voxpopuli/README.md ADDED
@@ -0,0 +1,24 @@
---
language: es
tags:
- audio
- automatic-speech-recognition
- voxpopuli
license: cc-by-nc-4.0
---

# Wav2Vec2-Large-VoxPopuli

[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) large model pretrained on the Spanish (es) unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390).

**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*

**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*

See the [official website](https://github.com/facebookresearch/voxpopuli/) for more information.

# Fine-Tuning

Please refer to [this blog](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) on how to fine-tune this model on a specific language. Note that you should replace `"facebook/wav2vec2-large-xlsr-53"` with this checkpoint for fine-tuning; a sketch of that substitution follows below.
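A minimal sketch of the substitution (my illustration of the blog's recipe, not an official script; `processor` is assumed to have been built for the target language exactly as in the blog):

```python
# Sketch: start CTC fine-tuning from this VoxPopuli checkpoint
# instead of facebook/wav2vec2-large-xlsr-53.
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-es-voxpopuli",  # <- the only line that changes
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.05,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)
model.freeze_feature_extractor()  # the blog freezes the CNN feature encoder first
```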
es/wav2vec2-large-es-voxpopuli/config.json ADDED
@@ -0,0 +1,83 @@
{
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForPreTraining"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.075,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "pad_token_id": 0,
  "proj_codevector_dim": 768,
  "transformers_version": "4.7.0.dev0",
  "vocab_size": 32
}
es/wav2vec2-large-es-voxpopuli/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad63febc6c296b8b3de75e39e739606d6bd239ab12c923b1dcdc176095dae2fd
size 1269577963
es/wav2vec2-large-es-voxpopuli/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
es/wav2vec2-large-es-voxpopuli/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92aa5fa25e1dd02a474cea12b2d958eacd128126f9ee8e8ce89249986e5da762
size 1269737156
es/wav2vec2-large-es-voxpopuli/source.txt ADDED
@@ -0,0 +1 @@
https://huggingface.co/facebook/wav2vec2-large-es-voxpopuli
fa/wav2vec2-large-xlsr-53-persian/.gitattributes ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
fa/wav2vec2-large-xlsr-53-persian/README.md ADDED
@@ -0,0 +1,195 @@
---
language: fa
datasets:
- common_voice
metrics:
- wer
- cer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Persian by Jonatas Grosman
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice fa
      type: common_voice
      args: fa
    metrics:
    - name: Test WER
      type: wer
      value: 30.12
    - name: Test CER
      type: cer
      value: 7.37
---

# Fine-tuned XLSR-53 large model for speech recognition in Persian

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Persian using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice).
When using this model, make sure that your speech input is sampled at 16 kHz.

This model has been fine-tuned thanks to the GPU credits generously given by [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)

The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint

## Usage

The model can be used directly (without a language model) as follows...

Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:

```python
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-persian")
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

transcriptions = model.transcribe(audio_paths)
```

Writing your own inference script:

```python
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "fa"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
SAMPLES = 5

test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)
```

| Reference | Prediction |
| ------------- | ------------- |
| از مهمونداری کنار بکشم | از مهمانداری کنار بکشم |
| برو از مهرداد بپرس. | برو از ماقدعاد به پرس |
| خب ، تو چیكار می كنی؟ | خوب تو چیکار می کنی |
| مسقط پایتخت عمان در عربی به معنای محل سقوط است | مسقط پایتخت عمان در عربی به بعنای محل سقوط است |
| آه، نه اصلاُ! | اهنه اصلا |
| توانست | توانست |
| قصیده فن شعر میگوید ای دوستان | قصیده فن شعر میگوید ایدوستون |
| دو استایل متفاوت دارین | دوبوست داریل و متفاوت بری |
| دو روز قبل از کریسمس ؟ | اون مفتود پش پشش |
| ساعت های کاری چیست؟ | این توری که موشیکل خب |

## Evaluation

The model can be evaluated as follows on the Persian test data of Common Voice.

```python
import torch
import re
import warnings
import librosa
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "fa"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
DEVICE = "cuda"

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]

test_dataset = load_dataset("common_voice", LANG_ID, split="test")

wer = load_metric("wer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
cer = load_metric("cer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to(DEVICE)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference over the whole test set.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

predictions = [x.upper() for x in result["pred_strings"]]
references = [x.upper() for x in result["sentence"]]

print(f"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
```

**Test Result**:

In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-04-22). Note that the table below may show results that differ from those already reported; this may be due to specifics of the other evaluation scripts used.

| Model | WER | CER |
| ------------- | ------------- | ------------- |
| jonatasgrosman/wav2vec2-large-xlsr-53-persian | **30.12%** | **7.37%** |
| m3hrdadfi/wav2vec2-large-xlsr-persian-v2 | 33.85% | 8.79% |
| m3hrdadfi/wav2vec2-large-xlsr-persian | 34.37% | 8.98% |

## Citation
If you want to cite this model you can use this:

```bibtex
@misc{grosman2021xlsr53-large-persian,
  title={Fine-tuned {XLSR}-53 large model for speech recognition in {P}ersian},
  author={Grosman, Jonatas},
  howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian}},
  year={2021}
}
```
fa/wav2vec2-large-xlsr-53-persian/config.json ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.05,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.05,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.5.0.dev0",
  "vocab_size": 67
}
fa/wav2vec2-large-xlsr-53-persian/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f1c25fc4a3db03fb9610f8d954703e5b1497168dac2d4a5e67eaf1e400badb1f
size 1262044974
fa/wav2vec2-large-xlsr-53-persian/issues.txt ADDED
@@ -0,0 +1,9 @@
=============================================================================
#3 WER 30% is wrong in real world/test
=============================================================================

[mosipvp] Feb 9, 2025

Dear Creator, I think you are sharing an unreasonable result. I was test this in "Farsi" and the result is absolutely f***ing.
I think you don't speak Farsi/Persian, so please use a native Farsi as your assistant/advisor.
But thank you for sharing this model.
fa/wav2vec2-large-xlsr-53-persian/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
fa/wav2vec2-large-xlsr-53-persian/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b859c7f562a2cc3c6002c2eb5178b66777406c4fccf53f196ead46a4f6c4796
+ size 1262208535
fa/wav2vec2-large-xlsr-53-persian/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian
fa/wav2vec2-large-xlsr-53-persian/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
fa/wav2vec2-large-xlsr-53-persian/vocab.json ADDED
@@ -0,0 +1 @@
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "٬": 5, "و": 6, "ـ": 7, "ئ": 8, "ل": 9, "ج": 10, "ک": 11, "R": 12, "ِ": 13, "ع": 14, "َ": 15, "م": 16, "ض": 17, "-": 18, "I": 19, "F": 20, "ذ": 21, "ن": 22, "ژ": 23, "A": 24, "ش": 25, "ث": 26, "Y": 27, "د": 28, "ر": 29, "ّ": 30, "أ": 31, "ق": 32, "ب": 33, "ح": 34, "ظ": 35, "پ": 36, "ت": 37, "خ": 38, "غ": 39, "ط": 40, "ك": 41, "ي": 42, "E": 43, "Ā": 44, "؛": 45, "ی": 46, "چ": 47, "ه": 48, "M": 49, "ف": 50, "آ": 51, "ز": 52, "ص": 53, "س": 54, "گ": 55, "N": 56, "ُ": 57, "T": 58, "S": 59, "Š": 60, "ٔ": 61, "B": 62, "ء": 63, "ً": 64, "ا": 65, "ى": 66}
fa/wav2vec2-large-xlsr-persian-v3/.gitattributes ADDED
@@ -0,0 +1,17 @@
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
fa/wav2vec2-large-xlsr-persian-v3/README.md ADDED
@@ -0,0 +1,236 @@
+ ---
+ language: fa
+ datasets:
+ - common_voice
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - speech
+ - xlsr-fine-tuning-week
+ widget:
+ - example_title: Common Voice sample 1
+   src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample1.flac
+ - example_title: Common Voice sample 2978
+   src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample2978.flac
+ - example_title: Common Voice sample 5168
+   src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample5168.flac
+ model-index:
+ - name: XLSR Wav2Vec2 Persian (Farsi) V3 by Mehrdad Farahani
+   results:
+   - task:
+       name: Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice fa
+       type: common_voice
+       args: fa
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 10.36
+ ---
+ 
+ # Wav2Vec2-Large-XLSR-53-Persian V3
+ 
+ ## Usage
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Persian (Farsi) using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16 kHz.
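+ 
+ As a minimal sketch of that resampling requirement (torchaudio-based; `sample1.flac` is only an example path, not a file this repo guarantees locally), arbitrary audio can be brought to 16 kHz mono before it reaches the processor:
+ 
+ ```python
+ import torchaudio
+ 
+ TARGET_SR = 16_000  # the model expects 16 kHz input
+ 
+ speech, sr = torchaudio.load("sample1.flac")  # shape: (channels, samples)
+ speech = speech.mean(dim=0)                   # downmix to mono if needed
+ if sr != TARGET_SR:
+     speech = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)(speech)
+ speech = speech.numpy()                       # ready for Wav2Vec2Processor
+ ```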
+ 
+ **Requirements**
+ ```bash
+ # requirement packages
+ !pip install git+https://github.com/huggingface/datasets.git
+ !pip install git+https://github.com/huggingface/transformers.git
+ !pip install torchaudio
+ !pip install librosa
+ !pip install jiwer
+ !pip install parsivar
+ !pip install num2fawords
+ ```
+ 
+ **Normalizer**
+ ```bash
+ # Normalizer
+ !wget -O dictionary.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/raw/main/dictionary.py
+ !wget -O normalizer.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/raw/main/normalizer.py
+ ```
+ 
+ **Downloading data**
+ ```bash
+ wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz
+ 
+ tar -xzf fa.tar.gz
+ rm -rf fa.tar.gz
+ ```
+ 
+ **Cleaning**
+ ```python
+ import os
+ 
+ import pandas as pd
+ 
+ from normalizer import normalizer
+ 
+ 
+ def cleaning(text):
+     if not isinstance(text, str):
+         return None
+ 
+     return normalizer({"sentence": text}, return_dict=False)
+ 
+ 
+ data_dir = "/content/cv-corpus-6.1-2020-12-11/fa"
+ 
+ # Common Voice ships tab-separated files
+ test = pd.read_csv(f"{data_dir}/test.tsv", sep="\t")
+ test["path"] = data_dir + "/clips/" + test["path"]
+ print(f"Step 0: {len(test)}")
+ 
+ # drop rows whose audio file is missing on disk
+ test["status"] = test["path"].apply(lambda path: True if os.path.exists(path) else None)
+ test = test.dropna(subset=["status"])
+ test = test.drop(columns=["status"])
+ print(f"Step 1: {len(test)}")
+ 
+ test["sentence"] = test["sentence"].apply(lambda t: cleaning(t))
+ test = test.dropna(subset=["sentence"])
+ print(f"Step 2: {len(test)}")
+ 
+ test = test.reset_index(drop=True)
+ print(test.head())
+ 
+ test = test[["path", "sentence"]]
+ test.to_csv("/content/test.csv", sep="\t", encoding="utf-8", index=False)
+ ```
+ 
+ **Prediction**
+ ```python
+ import numpy as np
+ import pandas as pd
+ 
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset, load_metric
+ 
+ import IPython.display as ipd
+ 
+ model_name_or_path = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(model_name_or_path, device)
+ 
+ processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
+ model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path).to(device)
+ 
+ 
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array.squeeze().numpy()
+     # keyword arguments keep this working on librosa >= 0.10,
+     # where orig_sr/target_sr became keyword-only
+     speech_array = librosa.resample(
+         np.asarray(speech_array),
+         orig_sr=sampling_rate,
+         target_sr=processor.feature_extractor.sampling_rate,
+     )
+ 
+     batch["speech"] = speech_array
+     return batch
+ 
+ 
+ def predict(batch):
+     features = processor(
+         batch["speech"],
+         sampling_rate=processor.feature_extractor.sampling_rate,
+         return_tensors="pt",
+         padding=True
+     )
+ 
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+ 
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+ 
+     pred_ids = torch.argmax(logits, dim=-1)
+ 
+     batch["predicted"] = processor.batch_decode(pred_ids)
+     return batch
+ 
+ 
+ dataset = load_dataset("csv", data_files={"test": "/content/test.csv"}, delimiter="\t")["test"]
+ dataset = dataset.map(speech_file_to_array_fn)
+ result = dataset.map(predict, batched=True, batch_size=4)
+ ```
+ 
+ **WER Score**
+ ```python
+ wer = load_metric("wer")
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
+ ```
+ 
+ **Output**
+ ```python
+ max_items = np.random.randint(0, len(result), 20).tolist()
+ for i in max_items:
+     reference, predicted = result["sentence"][i], result["predicted"][i]
+     print("reference:", reference)
+     print("predicted:", predicted)
+     print('---')
+ ```
+ 
+ ```text
+ reference: ماجرا رو براش تعریف کردم اون گفت مریم اگه میدونی پسر خوبیه خب چه اشکالی داره با‌هاش بیش‌تر اشنا بشو
+ predicted: ماجرا رو براش تعریف کردم اون گفت مریم اگه میدونی پسر خوبیه خب چه اشکالی داره با‌هاش بیش‌تر اشنا بشو
+ ---
+ reference: بیا پایین تو اجازه نداری بری اون بالا
+ predicted: بیا پایین تو اجازه نداری بری اون بالا
+ ---
+ reference: هر روز یک دو مداد کش می رفتتم تااین که تا پایان ترم از تمامی دوستانم مداد برداشته بودم
+ predicted: هر روز یک دو مداد کش می رفتم تااین که تا پایین ترم از تمامی دوستان و مداد برداشته بودم
+ ---
+ reference: فکر میکنی آروم میشینه
+ predicted: فکر میکنی آروم میشینه
+ ---
+ reference: هرکسی با گوشی هوشمند خود میتواند با کایلا متصل گردد در یک محدوده مکانی
+ predicted: هرکسی با گوشی هوشمند خود میتواند با کایلا متصل گردد در یک محدوده مکانی
+ ---
+ reference: برو از مهرداد بپرس
+ predicted: برو از مهرداد بپرس
+ ---
+ reference: می خواهم شما را با این قدم‌ها آشنا کنم
+ predicted: می خواهم شما را با این قدم‌ها آشنا کنم
+ ---
+ reference: میدونم یه روز دوباره می تونم تو رو ببینم
+ predicted: میدونم یه روز دوباره می تونم تو رو ببینم
+ ---
+ reference: بسیار خوب خواهد بود دعوت او را بپذیری
+ predicted: بسیار خوب خواهد بود دعوت او را بپذیری
+ ---
+ reference: بهت بگن آشغالی خوبه
+ predicted: بهت بگن آشغالی خوبه
+ ---
+ reference: چرا معاشرت با هم ایمانان ما را محفوظ نگه میدارد
+ predicted: چرا معاشرت با هم ایمانان آ را م حفوظ نگه میدارد
+ ---
+ reference: بولیوی پس از گویان فقیر‌ترین کشور آمریکای جنوبی است
+ predicted: بولیوی پس از گویان فقیر‌ترین کشور آمریکای جنوبی است
+ ---
+ reference: بعد از مدتی اینکار برایم عادی شد
+ predicted: بعد از مدتی اینکار برایم عادو شد
+ ---
+ reference: به نظر اون هم همینطوره
+ predicted: به نظر اون هم همینطوره
+ ---
+ reference: هیچ مایونز ی دارید
+ predicted: هیچ مایونز ی دارید
+ ---
+ reference: هیچ یک از انان کاری به سنگ نداشتند
+ predicted: هیچ شک از انان کاری به سنگ نداشتند
+ ---
+ reference: می خواهم کمی کتاب شعر ببینم
+ predicted: می خواهم کتاب شعر ببینم
+ ---
+ reference: همین شوهر فهیمه مگه نمی گفتی فرمانده بوده کو
+ predicted: همین شوهر فهیمه بینامی گفتی فهمانده بود کو
+ ---
+ reference: اون جا‌ها کسی رو نمیبینی که تو دستش کتاب نباشه
+ predicted: اون جا‌ها کسی رو نمیبینی که تو دستش کتاب نباشه
+ ---
+ reference: زندان رفتن من در این سال‌های اخیر برام شانس بزرگی بود که معما و مشکل چندین سال‌هام را حل کرد
+ predicted: زندان رفتن من در این سال‌ها اخی براب شانس بزرگی بود که معما و مشکل چندین سال‌هام را حل کرد
+ ---
+ ```
+ 
+ ## Evaluation
+ 
+ **Test Result:**
+ - WER: 10.36%
fa/wav2vec2-large-xlsr-persian-v3/config.json ADDED
@@ -0,0 +1,76 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.09216,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.05316,
+   "bos_token_id": 1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": true,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.01249,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.01941,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.01377,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.04529,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "transformers_version": "4.6.0.dev0",
+   "vocab_size": 40
+ }