wav2vec2 (ar, de, es, fr, fa, he, ro, ru, tr, uk, multi)
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +7 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/.gitattributes +17 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/README.md +206 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/config.json +69 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/flax_model.msgpack +3 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/preprocessor_config.json +8 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/pytorch_model.bin +3 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/source.txt +1 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/special_tokens_map.json +1 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/tokenizer_config.json +1 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/validation_wer.png +3 -0
- ar/wav2vec2-large-xlsr-53-arabic (elgeish)/vocab.json +1 -0
- ar/wav2vec2-large-xlsr-53-arabic/.gitattributes +17 -0
- ar/wav2vec2-large-xlsr-53-arabic/README.md +200 -0
- ar/wav2vec2-large-xlsr-53-arabic/config.json +76 -0
- ar/wav2vec2-large-xlsr-53-arabic/flax_model.msgpack +3 -0
- ar/wav2vec2-large-xlsr-53-arabic/preprocessor_config.json +8 -0
- ar/wav2vec2-large-xlsr-53-arabic/pytorch_model.bin +3 -0
- ar/wav2vec2-large-xlsr-53-arabic/source.txt +1 -0
- ar/wav2vec2-large-xlsr-53-arabic/special_tokens_map.json +1 -0
- ar/wav2vec2-large-xlsr-53-arabic/vocab.json +1 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/.gitattributes +17 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/README.md +69 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/config.json +68 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/preprocessor_config.json +9 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/pytorch_model.bin +3 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/source.txt +1 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/special_tokens_map.json +1 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/tokenizer_config.json +1 -0
- de/wav2vec2-base-10k-voxpopuli-ft-de/vocab.json +1 -0
- es/wav2vec2-large-es-voxpopuli/.gitattributes +17 -0
- es/wav2vec2-large-es-voxpopuli/README.md +24 -0
- es/wav2vec2-large-es-voxpopuli/config.json +83 -0
- es/wav2vec2-large-es-voxpopuli/flax_model.msgpack +3 -0
- es/wav2vec2-large-es-voxpopuli/preprocessor_config.json +9 -0
- es/wav2vec2-large-es-voxpopuli/pytorch_model.bin +3 -0
- es/wav2vec2-large-es-voxpopuli/source.txt +1 -0
- fa/wav2vec2-large-xlsr-53-persian/.gitattributes +17 -0
- fa/wav2vec2-large-xlsr-53-persian/README.md +195 -0
- fa/wav2vec2-large-xlsr-53-persian/config.json +76 -0
- fa/wav2vec2-large-xlsr-53-persian/flax_model.msgpack +3 -0
- fa/wav2vec2-large-xlsr-53-persian/issues.txt +9 -0
- fa/wav2vec2-large-xlsr-53-persian/preprocessor_config.json +8 -0
- fa/wav2vec2-large-xlsr-53-persian/pytorch_model.bin +3 -0
- fa/wav2vec2-large-xlsr-53-persian/source.txt +1 -0
- fa/wav2vec2-large-xlsr-53-persian/special_tokens_map.json +1 -0
- fa/wav2vec2-large-xlsr-53-persian/vocab.json +1 -0
- fa/wav2vec2-large-xlsr-persian-v3/.gitattributes +17 -0
- fa/wav2vec2-large-xlsr-persian-v3/README.md +236 -0
- fa/wav2vec2-large-xlsr-persian-v3/config.json +76 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ar/wav2vec2-large-xlsr-53-arabic[[:space:]](elgeish)/validation_wer.png filter=lfs diff=lfs merge=lfs -text
+fa/wav2vec2-large-xlsr-persian-v3/sample2978.flac filter=lfs diff=lfs merge=lfs -text
+he/wav2vec2-xls-r-300m-lm-hebrew/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
+multi/wav2vec2-xlsr-53-espeak-cv-ft/img[[:space:]](issue[[:space:]]10).jpeg filter=lfs diff=lfs merge=lfs -text
+ru/wav2vec2-large-xlsr-53-russian/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
+ru/wav2vec2-large-xlsr-53-russian/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
+uk/w2v-xls-r-uk/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/README.md
ADDED
@@ -0,0 +1,206 @@
---
language: ar
datasets:
- arabic_speech_corpus
- mozilla-foundation/common_voice_6_1
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
- hf-asr-leaderboard
license: apache-2.0
model-index:
- name: elgeish-wav2vec2-large-xlsr-53-arabic
  results:
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice 6.1 (Arabic)
      type: mozilla-foundation/common_voice_6_1
      config: ar
      split: test
      args:
        language: ar
    metrics:
    - name: Test WER
      type: wer
      value: 26.55
    - name: Validation WER
      type: wer
      value: 23.39
---

# Wav2Vec2-Large-XLSR-53-Arabic

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on Arabic using the `train` splits of [Common Voice](https://huggingface.co/datasets/common_voice)
and [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus).
When using this model, make sure that your speech input is sampled at 16kHz.

## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

dataset = load_dataset("common_voice", "ar", split="test[:10]")
resamplers = {  # all three sampling rates exist in test split
    48000: torchaudio.transforms.Resample(48000, 16000),
    44100: torchaudio.transforms.Resample(44100, 16000),
    32000: torchaudio.transforms.Resample(32000, 16000),
}

def prepare_example(example):
    speech, sampling_rate = torchaudio.load(example["path"])
    example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
    return example

dataset = dataset.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic").eval()

def predict(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        predicted = torch.argmax(model(inputs.input_values).logits, dim=-1)
    predicted[predicted == -100] = processor.tokenizer.pad_token_id  # see fine-tuning script
    batch["predicted"] = processor.tokenizer.batch_decode(predicted)
    return batch

dataset = dataset.map(predict, batched=True, batch_size=1, remove_columns=["speech"])

for reference, predicted in zip(dataset["sentence"], dataset["predicted"]):
    print("reference:", reference)
    print("predicted:", buckwalter.untrans(predicted))
    print("--")
```

Here's the output:

```
reference: ألديك قلم ؟
predicted: هلديك قالر
--
reference: ليست هناك مسافة على هذه الأرض أبعد من يوم أمس.
predicted: ليست نالك مسافة على هذه الأرض أبعد من يوم أمس
--
reference: إنك تكبر المشكلة.
predicted: إنك تكبر المشكلة
--
reference: يرغب أن يلتقي بك.
predicted: يرغب أن يلتقي بك
--
reference: إنهم لا يعرفون لماذا حتى.
predicted: إنهم لا يعرفون لماذا حتى
--
reference: سيسعدني مساعدتك أي وقت تحب.
predicted: سيسئدني مساعد سكرأي وقت تحب
--
reference: أَحَبُّ نظريّة علمية إليّ هي أن حلقات زحل مكونة بالكامل من الأمتعة المفقودة.
predicted: أحب ناضريةً علمية إلي هي أنحل قتزح المكونا بالكامل من الأمت عن المفقودة
--
reference: سأشتري له قلماً.
predicted: سأشتري له قلما
--
reference: أين المشكلة ؟
predicted: أين المشكل
--
reference: وَلِلَّهِ يَسْجُدُ مَا فِي السَّمَاوَاتِ وَمَا فِي الْأَرْضِ مِنْ دَابَّةٍ وَالْمَلَائِكَةُ وَهُمْ لَا يَسْتَكْبِرُونَ
predicted: ولله يسجد ما في السماوات وما في الأرض من دابة والملائكة وهم لا يستكبرون
--
```

## Evaluation

The model can be evaluated as follows on the Arabic test data of Common Voice:

```python
import jiwer
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import set_seed, Wav2Vec2ForCTC, Wav2Vec2Processor

set_seed(42)
test_split = load_dataset("common_voice", "ar", split="test")
resamplers = {  # all three sampling rates exist in test split
    48000: torchaudio.transforms.Resample(48000, 16000),
    44100: torchaudio.transforms.Resample(44100, 16000),
    32000: torchaudio.transforms.Resample(32000, 16000),
}

def prepare_example(example):
    speech, sampling_rate = torchaudio.load(example["path"])
    example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
    return example

test_split = test_split.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic").to("cuda").eval()

def predict(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        predicted = torch.argmax(model(inputs.input_values.to("cuda")).logits, dim=-1)
    predicted[predicted == -100] = processor.tokenizer.pad_token_id  # see fine-tuning script
    batch["predicted"] = processor.batch_decode(predicted)
    return batch

test_split = test_split.map(predict, batched=True, batch_size=16, remove_columns=["speech"])
transformation = jiwer.Compose([
    # normalize some diacritics, remove punctuation, and replace Persian letters with Arabic ones
    jiwer.SubstituteRegexes({
        r'[auiFNKo\~_،؟»\?;:\-,\.؛«!"]': "", "\u06D6": "",
        r"[\|\{]": "A", "p": "h", "ک": "k", "ی": "y"}),
    # default transformation below
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])
metrics = jiwer.compute_measures(
    truth=[buckwalter.trans(s) for s in test_split["sentence"]],  # Buckwalter transliteration
    hypothesis=test_split["predicted"],
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(f"WER: {metrics['wer']:.2%}")
```

**Test Result**: 26.55%

## Training

For more details, see [Fine-Tuning with Arabic Speech Corpus](https://github.com/huggingface/transformers/tree/1c06240e1b3477728129bb58e7b6c7734bb5074e/examples/research_projects/wav2vec2#fine-tuning-with-arabic-speech-corpus).

This model represents Arabic in a format called [Buckwalter transliteration](https://en.wikipedia.org/wiki/Buckwalter_transliteration).
The Buckwalter format only includes ASCII characters, some of which are non-alpha (e.g., `">"` maps to `"أ"`).
The [lang-trans](https://github.com/kariminf/lang-trans) package is used to transliterate the Arabic abjad.
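For example (a minimal sketch, using the same `lang_trans` helpers as the scripts above), converting between the two representations:

```python
from lang_trans.arabic import buckwalter

# Arabic script -> Buckwalter ASCII, and back
ascii_text = buckwalter.trans("أين المشكلة")   # ">yn Alm$klp"
print(buckwalter.untrans(ascii_text))          # أين المشكلة
```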
[This script](https://github.com/huggingface/transformers/blob/1c06240e1b3477728129bb58e7b6c7734bb5074e/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh)
was used to first fine-tune [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on the `train` split of the [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus) dataset;
the `test` split was used for model selection. The resulting model at this point is saved as [elgeish/wav2vec2-large-xlsr-53-levantine-arabic](https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-levantine-arabic).

Training was then resumed using the `train` split of the [Common Voice](https://huggingface.co/datasets/common_voice) dataset;
the `validation` split was used for model selection;
training was stopped to meet the deadline of [Fine-Tune-XLSR Week](https://github.com/huggingface/transformers/blob/700229f8a4003c4f71f29275e0874b5ba58cd39d/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md):
this model is the checkpoint at 100k steps, with a validation WER of **23.39%**.

<img src="https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic/raw/main/validation_wer.png" alt="Validation WER" width="100%" />

It's worth noting that validation WER is trending down, indicating the potential of further training (resuming the decaying learning rate at 7e-6).

## Future Work

One area to explore is using `attention_mask` in model input, as recommended [here](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2); a sketch follows below.
Another is data augmentation using the datasets used to train the models listed [here](https://paperswithcode.com/sota/speech-recognition-on-common-voice-arabic).
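As a rough illustration of that first item (a sketch, not the author's setup): this checkpoint's preprocessor was exported with `return_attention_mask: false`, so the mask would be re-enabled on the feature extractor and passed through explicitly:

```python
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
processor.feature_extractor.return_attention_mask = True  # off by default for this checkpoint
model = Wav2Vec2ForCTC.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic").eval()

# `clips` is a hypothetical list of 16 kHz float arrays of different lengths
inputs = processor(clips, sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
```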
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/config.json
ADDED
@@ -0,0 +1,69 @@
{
  "_name_or_path": "elgeish/wav2vec2-large-xlsr-53-arabic",
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "final_dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.4.0.dev0",
  "vocab_size": 56
}
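Such a config can also be inspected programmatically (a quick sketch, assuming `transformers` is installed):

```python
from transformers import Wav2Vec2Config

# Pull the config shown above straight from the Hub and check a few fields.
config = Wav2Vec2Config.from_pretrained("elgeish/wav2vec2-large-xlsr-53-arabic")
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 24 1024 56
```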
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:50f263c94e388555ccc40d13e8086de5222f7ca3a4aa20c4cad8232076e92fd4
size 1261999872
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fe4a13489e50820edbd709e85699dd87beeff5387e8c40d16860b10efb964547
size 1262163415
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "/", "return_attention_mask": false, "do_normalize": true, "special_tokens_map_file": "special_tokens_map.json", "name_or_path": "elgeish/wav2vec2-large-xlsr-53-arabic"}
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/validation_wer.png
ADDED
(binary image tracked with Git LFS)
ar/wav2vec2-large-xlsr-53-arabic (elgeish)/vocab.json
ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "/": 4, "'": 5, "|": 6, ">": 7, "&": 8, "<": 9, "}": 10, "A": 11, "b": 12, "p": 13, "t": 14, "v": 15, "j": 16, "H": 17, "x": 18, "d": 19, "*": 20, "r": 21, "z": 22, "s": 23, "$": 24, "S": 25, "D": 26, "T": 27, "Z": 28, "E": 29, "g": 30, "_": 31, "f": 32, "q": 33, "k": 34, "l": 35, "m": 36, "n": 37, "h": 38, "w": 39, "Y": 40, "y": 41, "F": 42, "N": 43, "K": 44, "a": 45, "u": 46, "i": 47, "~": 48, "o": 49, "`": 50, "{": 51, "P": 52, "J": 53, "V": 54, "G": 55}
ar/wav2vec2-large-xlsr-53-arabic/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
ar/wav2vec2-large-xlsr-53-arabic/README.md
ADDED
@@ -0,0 +1,200 @@
---
language: ar
datasets:
- common_voice
- arabic_speech_corpus
metrics:
- wer
- cer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Arabic by Jonatas Grosman
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice ar
      type: common_voice
      args: ar
    metrics:
    - name: Test WER
      type: wer
      value: 39.59
    - name: Test CER
      type: cer
      value: 18.18
---

# Fine-tuned XLSR-53 large model for speech recognition in Arabic

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Arabic using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice) and [Arabic Speech Corpus](https://huggingface.co/datasets/arabic_speech_corpus).
When using this model, make sure that your speech input is sampled at 16kHz.

This model has been fine-tuned thanks to the GPU credits generously provided by [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)

The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint

## Usage

The model can be used directly (without a language model) as follows...

Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:

```python
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

transcriptions = model.transcribe(audio_paths)
```

Writing your own inference script:

```python
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "ar"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
SAMPLES = 10

test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)
```

| Reference | Prediction |
| ------------- | ------------- |
| ألديك قلم ؟ | ألديك قلم |
| ليست هناك مسافة على هذه الأرض أبعد من يوم أمس. | ليست نالك مسافة على هذه الأرض أبعد من يوم الأمس م |
| إنك تكبر المشكلة. | إنك تكبر المشكلة |
| يرغب أن يلتقي بك. | يرغب أن يلتقي بك |
| إنهم لا يعرفون لماذا حتى. | إنهم لا يعرفون لماذا حتى |
| سيسعدني مساعدتك أي وقت تحب. | سيسئدنيمساعدتك أي وقد تحب |
| أَحَبُّ نظريّة علمية إليّ هي أن حلقات زحل مكونة بالكامل من الأمتعة المفقودة. | أحب نظرية علمية إلي هي أن حل قتزح المكوينا بالكامل من الأمت عن المفقودة |
| سأشتري له قلماً. | سأشتري له قلما |
| أين المشكلة ؟ | أين المشكل |
| وَلِلَّهِ يَسْجُدُ مَا فِي السَّمَاوَاتِ وَمَا فِي الْأَرْضِ مِنْ دَابَّةٍ وَالْمَلَائِكَةُ وَهُمْ لَا يَسْتَكْبِرُونَ | ولله يسجد ما في السماوات وما في الأرض من دابة والملائكة وهم لا يستكبرون |

## Evaluation

The model can be evaluated as follows on the Arabic test data of Common Voice.

```python
import torch
import re
import warnings
import librosa
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "ar"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
DEVICE = "cuda"

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]

test_dataset = load_dataset("common_voice", LANG_ID, split="test")

wer = load_metric("wer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
cer = load_metric("cer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to(DEVICE)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference and collect the predicted strings
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

predictions = [x.upper() for x in result["pred_strings"]]
references = [x.upper() for x in result["sentence"]]

print(f"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
```

**Test Result**:

The table below reports the Word Error Rate (WER) and Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-05-14). Note that it may show different results from those reported elsewhere; this may be due to specifics of the other evaluation scripts used.

| Model | WER | CER |
| ------------- | ------------- | ------------- |
| jonatasgrosman/wav2vec2-large-xlsr-53-arabic | **39.59%** | **18.18%** |
| bakrianoo/sinai-voice-ar-stt | 45.30% | 21.84% |
| othrif/wav2vec2-large-xlsr-arabic | 45.93% | 20.51% |
| kmfoda/wav2vec2-large-xlsr-arabic | 54.14% | 26.07% |
| mohammed/wav2vec2-large-xlsr-arabic | 56.11% | 26.79% |
| anas/wav2vec2-large-xlsr-arabic | 62.02% | 27.09% |
| elgeish/wav2vec2-large-xlsr-53-arabic | 100.00% | 100.56% |

## Citation

If you want to cite this model you can use this:

```bibtex
@misc{grosman2021xlsr53-large-arabic,
  title={Fine-tuned {XLSR}-53 large model for speech recognition in {A}rabic},
  author={Grosman, Jonatas},
  howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-arabic}},
  year={2021}
}
```
ar/wav2vec2-large-xlsr-53-arabic/config.json
ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.05,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.05,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.5.0.dev0",
  "vocab_size": 51
}
ar/wav2vec2-large-xlsr-53-arabic/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b44a67c277854fbcd96179ee8bfedb9f03f3826efc2af35f8eb9b964fd0df2b1
size 1261979372
ar/wav2vec2-large-xlsr-53-arabic/preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
ar/wav2vec2-large-xlsr-53-arabic/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0b26f6d9d3edfde1784aef863c192a8cc1e438a23b45910ab648531ebe1857b
size 1262142936
ar/wav2vec2-large-xlsr-53-arabic/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-arabic
ar/wav2vec2-large-xlsr-53-arabic/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
ar/wav2vec2-large-xlsr-53-arabic/vocab.json
ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "-": 5, "ء": 6, "آ": 7, "أ": 8, "ؤ": 9, "إ": 10, "ئ": 11, "ا": 12, "ب": 13, "ة": 14, "ت": 15, "ث": 16, "ج": 17, "ح": 18, "خ": 19, "د": 20, "ذ": 21, "ر": 22, "ز": 23, "س": 24, "ش": 25, "ص": 26, "ض": 27, "ط": 28, "ظ": 29, "ع": 30, "غ": 31, "ـ": 32, "ف": 33, "ق": 34, "ك": 35, "ل": 36, "م": 37, "ن": 38, "ه": 39, "و": 40, "ى": 41, "ي": 42, "ً": 43, "ٌ": 44, "ٍ": 45, "َ": 46, "ُ": 47, "ِ": 48, "ّ": 49, "ْ": 50}
de/wav2vec2-base-10k-voxpopuli-ft-de/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
de/wav2vec2-base-10k-voxpopuli-ft-de/README.md
ADDED
@@ -0,0 +1,69 @@
---
language: de
tags:
- audio
- automatic-speech-recognition
- voxpopuli
license: cc-by-nc-4.0
---

# Wav2Vec2-Base-VoxPopuli-Finetuned

[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model pretrained on the 10K-hour unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed German (de) data (refer to Table 1 of the paper for more information).

**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*

**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*

See the official [website](https://github.com/facebookresearch/voxpopuli/) for more information.

# Usage for inference

The following shows how to run inference on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets):

```python
#!/usr/bin/env python3
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torchaudio
import torch

# load model & processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-de")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-de")

# load dataset
ds = load_dataset("common_voice", "de", split="validation[:1%]")

# Common Voice audio does not match the target sampling rate
common_voice_sample_rate = 48000
target_sample_rate = 16000

resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate)


# define mapping fn to read in sound file and resample
def map_to_array(batch):
    speech, _ = torchaudio.load(batch["path"])
    speech = resampler(speech)
    batch["speech"] = speech[0]
    return batch


# load all audio files
ds = ds.map(map_to_array)

# run inference on the first 5 data samples
inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True)

# inference
logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)

print(processor.batch_decode(predicted_ids))
```
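To score such transcriptions, one could compare them against the reference sentences with the `wer` metric from `datasets` (a minimal sketch continuing the snippet above; not part of the original card):

```python
from datasets import load_metric

wer_metric = load_metric("wer")
predictions = processor.batch_decode(predicted_ids)
# Note: Common Voice references keep casing and punctuation, so both sides
# should be normalized before this number is meaningful.
references = ds[:5]["sentence"]
print("WER:", wer_metric.compute(predictions=predictions, references=references))
```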
de/wav2vec2-base-10k-voxpopuli-ft-de/config.json
ADDED
@@ -0,0 +1,68 @@
{
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForCTC"],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "conv_bias": false,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "final_dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_attention_heads": 12,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "transformers_version": "4.6.0.dev0",
  "vocab_size": 36
}
de/wav2vec2-base-10k-voxpopuli-ft-de/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
de/wav2vec2-base-10k-voxpopuli-ft-de/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6744100ba037593b22fac95738074e7cf6c1d9c94f0bfa0e76c3cf863b25741f
size 377684844
de/wav2vec2-base-10k-voxpopuli-ft-de/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/facebook/wav2vec2-base-10k-voxpopuli-ft-de
de/wav2vec2-base-10k-voxpopuli-ft-de/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
de/wav2vec2-base-10k-voxpopuli-ft-de/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
de/wav2vec2-base-10k-voxpopuli-ft-de/vocab.json
ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "e": 5, "n": 6, "i": 7, "r": 8, "s": 9, "t": 10, "a": 11, "d": 12, "h": 13, "u": 14, "l": 15, "g": 16, "c": 17, "m": 18, "o": 19, "b": 20, "w": 21, "f": 22, "k": 23, "z": 24, "p": 25, "v": 26, "ü": 27, "ä": 28, "ö": 29, "j": 30, "ß": 31, "y": 32, "x": 33, "q": 34, "1": 35}
es/wav2vec2-large-es-voxpopuli/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
es/wav2vec2-large-es-voxpopuli/README.md
ADDED
@@ -0,0 +1,24 @@
---
language: es
tags:
- audio
- automatic-speech-recognition
- voxpopuli
license: cc-by-nc-4.0
---

# Wav2Vec2-Large-VoxPopuli

[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) large model pretrained on the Spanish (es) unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390).

**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*

**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*

See the official [website](https://github.com/facebookresearch/voxpopuli/) for more information.

# Fine-Tuning

Please refer to [this blog](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) on how to fine-tune this model on a specific language. Note that you should replace `"facebook/wav2vec2-large-xlsr-53"` with this checkpoint for fine-tuning; a sketch follows below.
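Concretely, the swap would look something like this (a minimal sketch following the blog's setup; the dropout values mirror the blog, while the pad token id and vocabulary size are placeholders that come from the tokenizer built for your target data):

```python
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-es-voxpopuli",  # instead of facebook/wav2vec2-large-xlsr-53
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=0,  # placeholder: your tokenizer's pad token id
    vocab_size=40,   # placeholder: len(your_tokenizer)
)
model.freeze_feature_extractor()  # the blog keeps the CNN feature encoder frozen
```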
es/wav2vec2-large-es-voxpopuli/config.json
ADDED
@@ -0,0 +1,83 @@
{
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": ["Wav2Vec2ForPreTraining"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.075,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "pad_token_id": 0,
  "proj_codevector_dim": 768,
  "transformers_version": "4.7.0.dev0",
  "vocab_size": 32
}
es/wav2vec2-large-es-voxpopuli/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad63febc6c296b8b3de75e39e739606d6bd239ab12c923b1dcdc176095dae2fd
size 1269577963
es/wav2vec2-large-es-voxpopuli/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
es/wav2vec2-large-es-voxpopuli/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92aa5fa25e1dd02a474cea12b2d958eacd128126f9ee8e8ce89249986e5da762
size 1269737156
es/wav2vec2-large-es-voxpopuli/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/facebook/wav2vec2-large-es-voxpopuli
fa/wav2vec2-large-xlsr-53-persian/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
fa/wav2vec2-large-xlsr-53-persian/README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
---
language: fa
datasets:
- common_voice
metrics:
- wer
- cer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Persian by Jonatas Grosman
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice fa
      type: common_voice
      args: fa
    metrics:
    - name: Test WER
      type: wer
      value: 30.12
    - name: Test CER
      type: cer
      value: 7.37
---

# Fine-tuned XLSR-53 large model for speech recognition in Persian

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Persian using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice).
When using this model, make sure that your speech input is sampled at 16 kHz.
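
If your recordings are not already at 16 kHz, resample them before inference. A minimal sketch, assuming librosa is available (the scripts below load audio this way already):

```python
import librosa

# librosa.load resamples to the requested rate on the fly.
speech_array, sampling_rate = librosa.load("/path/to/file.mp3", sr=16_000)
assert sampling_rate == 16_000
```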

This model has been fine-tuned thanks to the GPU credits generously given by the [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)

The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint

## Usage

The model can be used directly (without a language model) as follows...

Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:

```python
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-persian")
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

transcriptions = model.transcribe(audio_paths)
```

Writing your own inference script:

```python
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "fa"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
SAMPLES = 5

test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)
```

| Reference | Prediction |
| ------------- | ------------- |
| از مهمونداری کنار بکشم | از مهمانداری کنار بکشم |
| برو از مهرداد بپرس. | برو از ماقدعاد به پرس |
| خب ، تو چیكار می كنی؟ | خوب تو چیکار می کنی |
| مسقط پایتخت عمان در عربی به معنای محل سقوط است | مسقط پایتخت عمان در عربی به بعنای محل سقوط است |
| آه، نه اصلاُ! | اهنه اصلا |
| توانست | توانست |
| قصیده فن شعر میگوید ای دوستان | قصیده فن شعر میگوید ایدوستون |
| دو استایل متفاوت دارین | دوبوست داریل و متفاوت بری |
| دو روز قبل از کریسمس ؟ | اون مفتود پش پشش |
| ساعت های کاری چیست؟ | این توری که موشیکل خب |

## Evaluation

The model can be evaluated as follows on the Persian test data of Common Voice.

```python
import re
import warnings

import librosa
import torch
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "fa"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
DEVICE = "cuda"

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]

test_dataset = load_dataset("common_voice", LANG_ID, split="test")

wer = load_metric("wer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
cer = load_metric("cer.py")  # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to(DEVICE)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Running inference on the preprocessed batches
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

predictions = [x.upper() for x in result["pred_strings"]]
references = [x.upper() for x in result["sentence"]]

print(f"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
```
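
Note that `wer.py` and `cer.py` above are the custom metric scripts from the training repo linked in the comments (hence the non-standard `chunk_size` argument), not the stock `datasets` metrics. As a rough cross-check, corpus-level scores can also be computed with the jiwer package; a minimal sketch (jiwer is an assumption here, not part of the original script):

```python
import jiwer

# Toy example reusing the first row of the sample table above.
references = ["از مهمونداری کنار بکشم"]
predictions = ["از مهمانداری کنار بکشم"]

print("WER:", jiwer.wer(references, predictions) * 100)
print("CER:", jiwer.cer(references, predictions) * 100)
```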

**Test Result**:

In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-04-22). Note that the table below may show results that differ from those already reported elsewhere; this is likely due to differences in the evaluation scripts used there.

| Model | WER | CER |
| ------------- | ------------- | ------------- |
| jonatasgrosman/wav2vec2-large-xlsr-53-persian | **30.12%** | **7.37%** |
| m3hrdadfi/wav2vec2-large-xlsr-persian-v2 | 33.85% | 8.79% |
| m3hrdadfi/wav2vec2-large-xlsr-persian | 34.37% | 8.98% |

## Citation
If you want to cite this model you can use this:

```bibtex
@misc{grosman2021xlsr53-large-persian,
  title={Fine-tuned {XLSR}-53 large model for speech recognition in {P}ersian},
  author={Grosman, Jonatas},
  howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian}},
  year={2021}
}
```
fa/wav2vec2-large-xlsr-53-persian/config.json
ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.05,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.05,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.5.0.dev0",
  "vocab_size": 67
}
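
These hyperparameters can be inspected programmatically; a minimal sketch using the transformers config class, assuming network access to the repo listed in source.txt below:

```python
from transformers import Wav2Vec2Config

config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-persian")
print(config.vocab_size)         # 67, matching the 67-entry vocab.json below
print(config.num_hidden_layers)  # 24 transformer layers (the XLSR-53 "large" size)
```
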
fa/wav2vec2-large-xlsr-53-persian/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f1c25fc4a3db03fb9610f8d954703e5b1497168dac2d4a5e67eaf1e400badb1f
size 1262044974
fa/wav2vec2-large-xlsr-53-persian/issues.txt
ADDED
@@ -0,0 +1,9 @@
=============================================================================
#3 WER 30% is wrong in real world/test
=============================================================================

[mosipvp] Feb 9, 2025

Dear Creator, I think you are sharing an unreasonable result. I tested this on Farsi and the results are absolutely f***ing bad.
I think you don't speak Farsi/Persian, so please use a native Farsi speaker as your assistant/advisor.
But thank you for sharing this model.
fa/wav2vec2-large-xlsr-53-persian/preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
fa/wav2vec2-large-xlsr-53-persian/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b859c7f562a2cc3c6002c2eb5178b66777406c4fccf53f196ead46a4f6c4796
size 1262208535
fa/wav2vec2-large-xlsr-53-persian/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian
fa/wav2vec2-large-xlsr-53-persian/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
fa/wav2vec2-large-xlsr-53-persian/vocab.json
ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "٬": 5, "و": 6, "ـ": 7, "ئ": 8, "ل": 9, "ج": 10, "ک": 11, "R": 12, "ِ": 13, "ع": 14, "َ": 15, "م": 16, "ض": 17, "-": 18, "I": 19, "F": 20, "ذ": 21, "ن": 22, "ژ": 23, "A": 24, "ش": 25, "ث": 26, "Y": 27, "د": 28, "ر": 29, "ّ": 30, "أ": 31, "ق": 32, "ب": 33, "ح": 34, "ظ": 35, "پ": 36, "ت": 37, "خ": 38, "غ": 39, "ط": 40, "ك": 41, "ي": 42, "E": 43, "Ā": 44, "؛": 45, "ی": 46, "چ": 47, "ه": 48, "M": 49, "ف": 50, "آ": 51, "ز": 52, "ص": 53, "س": 54, "گ": 55, "N": 56, "ُ": 57, "T": 58, "S": 59, "Š": 60, "ٔ": 61, "B": 62, "ء": 63, "ً": 64, "ا": 65, "ى": 66}
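
This single-line JSON maps each of the model's 67 output characters (Persian letters, Arabic diacritics, a few Latin letters, and `|` as the word delimiter) to a CTC id, matching `vocab_size` in config.json. A minimal sketch for inspecting it, assuming a local checkout of this folder:

```python
import json

# Invert the vocabulary to map CTC output ids back to characters.
with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

id_to_char = {idx: char for char, idx in vocab.items()}
print(len(vocab))     # 67
print(id_to_char[4])  # '|' -- the word-delimiter token
```
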
fa/wav2vec2-large-xlsr-persian-v3/.gitattributes
ADDED
@@ -0,0 +1,17 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
fa/wav2vec2-large-xlsr-persian-v3/README.md
ADDED
@@ -0,0 +1,236 @@
---
language: fa
datasets:
- common_voice
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
widget:
- example_title: Common Voice sample 1
  src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample1.flac
- example_title: Common Voice sample 2978
  src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample2978.flac
- example_title: Common Voice sample 5168
  src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/resolve/main/sample5168.flac
model-index:
- name: XLSR Wav2Vec2 Persian (Farsi) V3 by Mehrdad Farahani
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice fa
      type: common_voice
      args: fa
    metrics:
    - name: Test WER
      type: wer
      value: 10.36
---

# Wav2Vec2-Large-XLSR-53-Persian V3

## Usage
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Persian (Farsi) using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16 kHz.

**Requirements**
```bash
# requirement packages
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install parsivar
!pip install num2fawords
```

**Normalizer**
```bash
# Normalizer (the first file must be saved as dictionary.py, which normalizer.py imports)
!wget -O dictionary.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/raw/main/dictionary.py
!wget -O normalizer.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3/raw/main/normalizer.py
```

**Downloading data**
```bash
wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz

tar -xzf fa.tar.gz
rm -rf fa.tar.gz
```

**Cleaning**
```python
import os

import pandas as pd

from normalizer import normalizer

def cleaning(text):
    if not isinstance(text, str):
        return None

    return normalizer({"sentence": text}, return_dict=False)

data_dir = "/content/cv-corpus-6.1-2020-12-11/fa"

# Common Voice ships tab-separated metadata files.
test = pd.read_csv(f"{data_dir}/test.tsv", sep="\t")
test["path"] = data_dir + "/clips/" + test["path"]
print(f"Step 0: {len(test)}")

# Drop rows whose audio file is missing on disk.
test["status"] = test["path"].apply(lambda path: True if os.path.exists(path) else None)
test = test.dropna(subset=["status"])
test = test.drop(columns=["status"])
print(f"Step 1: {len(test)}")

test["sentence"] = test["sentence"].apply(lambda t: cleaning(t))
test = test.dropna(subset=["sentence"])
print(f"Step 2: {len(test)}")

test = test.reset_index(drop=True)
print(test.head())

test = test[["path", "sentence"]]
test.to_csv("/content/test.csv", sep="\t", encoding="utf-8", index=False)
```

**Prediction**
```python
import numpy as np
import pandas as pd

import librosa
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset, load_metric

import IPython.display as ipd

model_name_or_path = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(model_name_or_path, device)

processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path).to(device)


def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # Positional librosa.resample signature (librosa < 0.10): (y, orig_sr, target_sr)
    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, processor.feature_extractor.sampling_rate)

    batch["speech"] = speech_array
    return batch


def predict(batch):
    features = processor(
        batch["speech"],
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True
    )

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    pred_ids = torch.argmax(logits, dim=-1)

    batch["predicted"] = processor.batch_decode(pred_ids)
    return batch


dataset = load_dataset("csv", data_files={"test": "/content/test.csv"}, delimiter="\t")["test"]
dataset = dataset.map(speech_file_to_array_fn)
result = dataset.map(predict, batched=True, batch_size=4)
```

**WER Score**
```python
wer = load_metric("wer")
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
```

**Output**
```python
max_items = np.random.randint(0, len(result), 20).tolist()
for i in max_items:
    reference, predicted = result["sentence"][i], result["predicted"][i]
    print("reference:", reference)
    print("predicted:", predicted)
    print('---')
```

```text
reference: ماجرا رو براش تعریف کردم اون گفت مریم اگه میدونی پسر خوبیه خب چه اشکالی داره باهاش بیشتر اشنا بشو
predicted: ماجرا رو براش تعریف کردم اون گفت مریم اگه میدونی پسر خوبیه خب چه اشکالی داره باهاش بیشتر اشنا بشو
---
reference: بیا پایین تو اجازه نداری بری اون بالا
predicted: بیا پایین تو اجازه نداری بری اون بالا
---
reference: هر روز یک دو مداد کش می رفتتم تااین که تا پایان ترم از تمامی دوستانم مداد برداشته بودم
predicted: هر روز یک دو مداد کش می رفتم تااین که تا پایین ترم از تمامی دوستان و مداد برداشته بودم
---
reference: فکر میکنی آروم میشینه
predicted: فکر میکنی آروم میشینه
---
reference: هرکسی با گوشی هوشمند خود میتواند با کایلا متصل گردد در یک محدوده مکانی
predicted: هرکسی با گوشی هوشمند خود میتواند با کایلا متصل گردد در یک محدوده مکانی
---
reference: برو از مهرداد بپرس
predicted: برو از مهرداد بپرس
---
reference: می خواهم شما را با این قدمها آشنا کنم
predicted: می خواهم شما را با این قدمها آشنا کنم
---
reference: میدونم یه روز دوباره می تونم تو رو ببینم
predicted: میدونم یه روز دوباره می تونم تو رو ببینم
---
reference: بسیار خوب خواهد بود دعوت او را بپذیری
predicted: بسیار خوب خواهد بود دعوت او را بپذیری
---
reference: بهت بگن آشغالی خوبه
predicted: بهت بگن آشغالی خوبه
---
reference: چرا معاشرت با هم ایمانان ما را محفوظ نگه میدارد
predicted: چرا معاشرت با هم ایمانان آ را م حفوظ نگه میدارد
---
reference: بولیوی پس از گویان فقیرترین کشور آمریکای جنوبی است
predicted: بولیوی پس از گویان فقیرترین کشور آمریکای جنوبی است
---
reference: بعد از مدتی اینکار برایم عادی شد
predicted: بعد از مدتی اینکار برایم عادو شد
---
reference: به نظر اون هم همینطوره
predicted: به نظر اون هم همینطوره
---
reference: هیچ مایونز ی دارید
predicted: هیچ مایونز ی دارید
---
reference: هیچ یک از انان کاری به سنگ نداشتند
predicted: هیچ شک از انان کاری به سنگ نداشتند
---
reference: می خواهم کمی کتاب شعر ببینم
predicted: می خواهم کتاب شعر ببینم
---
reference: همین شوهر فهیمه مگه نمی گفتی فرمانده بوده کو
predicted: همین شوهر فهیمه بینامی گفتی فهمانده بود کو
---
reference: اون جاها کسی رو نمیبینی که تو دستش کتاب نباشه
predicted: اون جاها کسی رو نمیبینی که تو دستش کتاب نباشه
---
reference: زندان رفتن من در این سالهای اخیر برام شانس بزرگی بود که معما و مشکل چندین سالهام را حل کرد
predicted: زندان رفتن من در این سالها اخی براب شانس بزرگی بود که معما و مشکل چندین سالهام را حل کرد
---
```

## Evaluation

**Test Result:**
- WER: 10.36%
fa/wav2vec2-large-xlsr-persian-v3/config.json
ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.09216,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.05316,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.01249,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.01941,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.01377,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.04529,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.6.0.dev0",
  "vocab_size": 40
}