wav2vec2 (az, fa, te, uk)
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +12 -0
- az/wav2vec2-large-mms-1b-azerbaijani/.gitattributes +35 -0
- az/wav2vec2-large-mms-1b-azerbaijani/README.md +79 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter.az.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_102937.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_112030.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_143407.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_181040.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260106_074657.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260121_215230.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_003358.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_031538.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_055920.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_084159.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260210_233725.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_025645.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_061212.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_092612.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_124514.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_200504.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_232514.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_024541.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_060440.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_090420.pt +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/added_tokens.json +4 -0
- az/wav2vec2-large-mms-1b-azerbaijani/config.json +108 -0
- az/wav2vec2-large-mms-1b-azerbaijani/model.safetensors +3 -0
- az/wav2vec2-large-mms-1b-azerbaijani/preprocessor_config.json +9 -0
- az/wav2vec2-large-mms-1b-azerbaijani/source.txt +1 -0
- az/wav2vec2-large-mms-1b-azerbaijani/special_tokens_map.json +6 -0
- az/wav2vec2-large-mms-1b-azerbaijani/tokenizer_config.json +47 -0
- az/wav2vec2-large-mms-1b-azerbaijani/vocab.json +40 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/.gitattributes +35 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/README.md +7 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/all_results.json +15 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/config.json +116 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/eval_results.json +9 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/model.safetensors +3 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/preprocessor_config.json +10 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/pytorch_model.bin +3 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/source.txt +1 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/special_tokens_map.json +6 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/tokenizer_config.json +15 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/train_results.json +9 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/trainer_state.json +58 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/training_args.bin +3 -0
- fa/Persian-Speech-Transcription-Wav2Vec2-V1/vocab.json +42 -0
- fa/Sharif-wav2vec2/.gitattributes +31 -0
- fa/Sharif-wav2vec2/README.md +156 -0
- fa/Sharif-wav2vec2/alphabet.json +45 -0
.gitattributes
CHANGED
|
@@ -43,3 +43,15 @@ uk/w2v-xls-r-uk/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
|
|
| 43 |
ar/Arabic_speech_Syllables_recognition_Using_Wav2vec2/Final[[:space:]]Paper[[:space:]]Syllable-Based[[:space:]]Arabic[[:space:]]Speech[[:space:]]Recognition[[:space:]]Using[[:space:]]Wav2Vec.pdf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
ar/asr-wav2vec2-commonvoice-14-ar/example-ar.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
ar/wav2vec2_ar_anz2/language_model/voctext.arpa filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
ar/Arabic_speech_Syllables_recognition_Using_Wav2vec2/Final[[:space:]]Paper[[:space:]]Syllable-Based[[:space:]]Arabic[[:space:]]Speech[[:space:]]Recognition[[:space:]]Using[[:space:]]Wav2Vec.pdf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
ar/asr-wav2vec2-commonvoice-14-ar/example-ar.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
ar/wav2vec2_ar_anz2/language_model/voctext.arpa filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
fa/Sharif-wav2vec2/language_model/5gram.arpa filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/M16A01.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/wandb-loss.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/wandb-wer.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
uk/w2v-bert-uk-v2.1-iree-cpu/w2v2-cpu.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
uk/w2v-bert-uk-v2.1-iree-cpu/w2v2-fp16-cpu.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-cuda-optimized.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-cuda.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-fp16-cuda-optimized.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-fp16-cuda.vmfb filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
uk/wav2vec2-xls-r-1b-uk-cv/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
uk/wav2vec2-xls-r-1b-uk/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
|
az/wav2vec2-large-mms-1b-azerbaijani/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
az/wav2vec2-large-mms-1b-azerbaijani/README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- az
|
| 4 |
+
license: apache-2.0
|
| 5 |
+
tags:
|
| 6 |
+
- asr
|
| 7 |
+
- speech-recognition
|
| 8 |
+
- wav2vec2
|
| 9 |
+
- mms
|
| 10 |
+
- azerbaijani
|
| 11 |
+
library_name: transformers
|
| 12 |
+
pipeline_tag: automatic-speech-recognition
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Wav2Vec2 Large MMS 1B – Azerbaijani ASR
|
| 16 |
+
|
| 17 |
+
This model is a **Wav2Vec2 Large MMS (1B parameters)** fine-tuned for **Azerbaijani (az)** speech recognition using an external adapter.
|
| 18 |
+
|
| 19 |
+
The base model comes from Meta’s **Massively Multilingual Speech (MMS)** project, with a custom Azerbaijani adapter loaded at inference time.
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Model Details
|
| 24 |
+
|
| 25 |
+
- **Base model:** facebook/mms-1b-all (Wav2Vec2 Large MMS, 1B parameters)
|
| 26 |
+
- **Language:** Azerbaijani (`az`)
|
| 27 |
+
- **Sampling rate:** 16 kHz
|
| 28 |
+
- **Framework:** PyTorch
|
| 29 |
+
- **Adapter file:** `adapter.az.pt`
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Usage
|
| 34 |
+
|
| 35 |
+
### Installation
|
| 36 |
+
`pip install torch transformers numpy`
|
| 37 |
+
|
| 38 |
+
### Inference Example
|
| 39 |
+
```python
|
| 40 |
+
import torch
|
| 41 |
+
import numpy as np
|
| 42 |
+
import wave
|
| 43 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
| 44 |
+
|
| 45 |
+
MODEL_ID = "tahmaz/wav2vec2-large-mms-1b-azerbaijani"
|
| 46 |
+
SAMPLE_RATE = 16000
|
| 47 |
+
|
| 48 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 49 |
+
|
| 50 |
+
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
|
| 51 |
+
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(device)
|
| 52 |
+
|
| 53 |
+
# Load adapter
|
| 54 |
+
adapter_weights = torch.load(
|
| 55 |
+
"adapter.az.pt", # or downloaded from HF
|
| 56 |
+
map_location=device
|
| 57 |
+
)
|
| 58 |
+
model.load_state_dict(adapter_weights, strict=False)
|
| 59 |
+
model.eval()
|
| 60 |
+
|
| 61 |
+
def transcribe_wav(path):
|
| 62 |
+
    with wave.open(path, "rb") as wf:
|
| 63 |
+
        audio = wf.readframes(wf.getnframes())
|
| 64 |
+
|
| 65 |
+
    audio = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0
|
| 66 |
+
|
| 67 |
+
    inputs = processor(
|
| 68 |
+
        audio,
|
| 69 |
+
        sampling_rate=SAMPLE_RATE,
|
| 70 |
+
        return_tensors="pt"
|
| 71 |
+
    ).to(device)
|
| 72 |
+
|
| 73 |
+
    with torch.no_grad():
|
| 74 |
+
        logits = model(**inputs).logits
|
| 75 |
+
|
| 76 |
+
    pred_ids = torch.argmax(logits, dim=-1)
|
| 77 |
+
    return processor.batch_decode(pred_ids)[0]
|
| 78 |
+
|
| 79 |
+
print(transcribe_wav("sample.wav"))
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter.az.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a88600c971d7bb0cad12495a9ea10446124464d73ab6c13e336735dc109d652
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_102937.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f8d1671d72d5a84562950a842a1528c6f9c3e9ccff482294a0bbc1afcf9b57f
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_112030.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e903582557dbdc06efffc76f0ca1f7fd6d171ea098f2bf5b14690a1bc685c94
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_143407.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f0ff90a1ba983158fff23f5bc8a29c2ec5fa5c60afcd4f061538fd76dc6b79b
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_181040.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ffc524ddcfe1e15bd6d1b8e3c52e5ab58dd4abf00600d368a02b823800b011bc
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260106_074657.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cae29d2bf46a3967990c24ae06bd98ea4dfee0b184312875d3de32ddad59b13a
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260121_215230.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31ea191357f5e981ba06f3b1f656393ecf001d5b72281e22a47f8be8b8ce529a
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_003358.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca891ffb464fc9b1f55a90ea236eb46252560ad38c6d0d6024a9759e34fc11ef
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_031538.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4cd4cdf7f09254197561b80ea3c4d21906f6ea7b0c51c2a571ca642ad48e5f5
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_055920.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68b1de8d7c4209ec0bd827e9a2e88106e7fb1328e17d4cc0a6c1139df6a1965e
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_084159.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee82e177ce1a4298a82145c1838efeec5a8b670b52553c45544d46a298e87459
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260210_233725.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8785b7cef3a45e4d38475392bbe0d41c459550e49ae83db65274ca4209139c30
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_025645.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad57c6c2403d9c69f77b1abc6408353000368ef802aab148aa3fb639c388ba5f
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_061212.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15feaadb77e5ffa52f19ed622bfb49ebe7277f5c5907b2d3d3bfd256825b59e1
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_092612.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9653722619443663c64940017fe95a0ab71dc1756b8b3b2ed210c8a0f7040b37
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_124514.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02162fc4a09cf266ac6c9904238f8bb9eb06bd03a5053099a922ad45febbde14
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_200504.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0adb9df6bcb66ade02f135a77abbd6ef2bf85af9cba73a85e9129b26bb088eba
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_232514.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86525ef8f0f62298918445313464bd831f7c99ce5c995005bfa3dd993bf6bda8
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_024541.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da5284687ad84bd7d0b2d9f3a98725afd922f570272f05060486174ed6b299b8
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_060440.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c437f0b94c8106ab4b2ce97b2595e270562a208e9daf29a1f9b300abf9c12be
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_090420.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f466b20431d56455673c89305219cfab5ba98f2adb2c37bf2ca26c1f873fc7e
|
| 3 |
+
size 8902835
|
az/wav2vec2-large-mms-1b-azerbaijani/added_tokens.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</s>": 37,
|
| 3 |
+
"<s>": 36
|
| 4 |
+
}
|
az/wav2vec2-large-mms-1b-azerbaijani/config.json
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "facebook/mms-1b-all",
|
| 3 |
+
"activation_dropout": 0.05,
|
| 4 |
+
"adapter_attn_dim": 16,
|
| 5 |
+
"adapter_kernel_size": 3,
|
| 6 |
+
"adapter_stride": 2,
|
| 7 |
+
"add_adapter": false,
|
| 8 |
+
"apply_spec_augment": true,
|
| 9 |
+
"architectures": [
|
| 10 |
+
"Wav2Vec2ForCTC"
|
| 11 |
+
],
|
| 12 |
+
"attention_dropout": 0.0,
|
| 13 |
+
"bos_token_id": 1,
|
| 14 |
+
"classifier_proj_size": 256,
|
| 15 |
+
"codevector_dim": 1024,
|
| 16 |
+
"contrastive_logits_temperature": 0.1,
|
| 17 |
+
"conv_bias": true,
|
| 18 |
+
"conv_dim": [
|
| 19 |
+
512,
|
| 20 |
+
512,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512,
|
| 25 |
+
512
|
| 26 |
+
],
|
| 27 |
+
"conv_kernel": [
|
| 28 |
+
10,
|
| 29 |
+
3,
|
| 30 |
+
3,
|
| 31 |
+
3,
|
| 32 |
+
3,
|
| 33 |
+
2,
|
| 34 |
+
2
|
| 35 |
+
],
|
| 36 |
+
"conv_stride": [
|
| 37 |
+
5,
|
| 38 |
+
2,
|
| 39 |
+
2,
|
| 40 |
+
2,
|
| 41 |
+
2,
|
| 42 |
+
2,
|
| 43 |
+
2
|
| 44 |
+
],
|
| 45 |
+
"ctc_loss_reduction": "mean",
|
| 46 |
+
"ctc_zero_infinity": false,
|
| 47 |
+
"diversity_loss_weight": 0.1,
|
| 48 |
+
"do_stable_layer_norm": true,
|
| 49 |
+
"eos_token_id": 2,
|
| 50 |
+
"feat_extract_activation": "gelu",
|
| 51 |
+
"feat_extract_dropout": 0.0,
|
| 52 |
+
"feat_extract_norm": "layer",
|
| 53 |
+
"feat_proj_dropout": 0.0,
|
| 54 |
+
"feat_quantizer_dropout": 0.0,
|
| 55 |
+
"final_dropout": 0.05,
|
| 56 |
+
"hidden_act": "gelu",
|
| 57 |
+
"hidden_dropout": 0.0,
|
| 58 |
+
"hidden_size": 1280,
|
| 59 |
+
"initializer_range": 0.02,
|
| 60 |
+
"intermediate_size": 5120,
|
| 61 |
+
"layer_norm_eps": 1e-05,
|
| 62 |
+
"layerdrop": 0.0,
|
| 63 |
+
"mask_feature_length": 10,
|
| 64 |
+
"mask_feature_min_masks": 0,
|
| 65 |
+
"mask_feature_prob": 0.0,
|
| 66 |
+
"mask_time_length": 10,
|
| 67 |
+
"mask_time_min_masks": 2,
|
| 68 |
+
"mask_time_prob": 0.05,
|
| 69 |
+
"model_type": "wav2vec2",
|
| 70 |
+
"num_adapter_layers": 3,
|
| 71 |
+
"num_attention_heads": 16,
|
| 72 |
+
"num_codevector_groups": 2,
|
| 73 |
+
"num_codevectors_per_group": 320,
|
| 74 |
+
"num_conv_pos_embedding_groups": 16,
|
| 75 |
+
"num_conv_pos_embeddings": 128,
|
| 76 |
+
"num_feat_extract_layers": 7,
|
| 77 |
+
"num_hidden_layers": 48,
|
| 78 |
+
"num_negatives": 100,
|
| 79 |
+
"output_hidden_size": 1280,
|
| 80 |
+
"pad_token_id": 35,
|
| 81 |
+
"proj_codevector_dim": 1024,
|
| 82 |
+
"tdnn_dilation": [
|
| 83 |
+
1,
|
| 84 |
+
2,
|
| 85 |
+
3,
|
| 86 |
+
1,
|
| 87 |
+
1
|
| 88 |
+
],
|
| 89 |
+
"tdnn_dim": [
|
| 90 |
+
512,
|
| 91 |
+
512,
|
| 92 |
+
512,
|
| 93 |
+
512,
|
| 94 |
+
1500
|
| 95 |
+
],
|
| 96 |
+
"tdnn_kernel": [
|
| 97 |
+
5,
|
| 98 |
+
3,
|
| 99 |
+
3,
|
| 100 |
+
1,
|
| 101 |
+
1
|
| 102 |
+
],
|
| 103 |
+
"torch_dtype": "float32",
|
| 104 |
+
"transformers_version": "4.35.2",
|
| 105 |
+
"use_weighted_layer_sum": false,
|
| 106 |
+
"vocab_size": 38,
|
| 107 |
+
"xvector_output_dim": 512
|
| 108 |
+
}
|
az/wav2vec2-large-mms-1b-azerbaijani/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:662aac793ba8367e8a3dc5f821c842dd5de299d2dd9f6bf2999105a30267de8b
|
| 3 |
+
size 3858926792
|
az/wav2vec2-large-mms-1b-azerbaijani/preprocessor_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0.0,
|
| 7 |
+
"return_attention_mask": true,
|
| 8 |
+
"sampling_rate": 16000
|
| 9 |
+
}
|
az/wav2vec2-large-mms-1b-azerbaijani/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/tahmaz/wav2vec2-large-mms-1b-azerbaijani
|
az/wav2vec2-large-mms-1b-azerbaijani/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"eos_token": "</s>",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"unk_token": "[UNK]"
|
| 6 |
+
}
|
az/wav2vec2-large-mms-1b-azerbaijani/tokenizer_config.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"34": {
|
| 4 |
+
"content": "[UNK]",
|
| 5 |
+
"lstrip": true,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": true,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": false
|
| 10 |
+
},
|
| 11 |
+
"35": {
|
| 12 |
+
"content": "[PAD]",
|
| 13 |
+
"lstrip": true,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": true,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": false
|
| 18 |
+
},
|
| 19 |
+
"36": {
|
| 20 |
+
"content": "<s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": true,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"37": {
|
| 28 |
+
"content": "</s>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": true,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"bos_token": "<s>",
|
| 37 |
+
"clean_up_tokenization_spaces": true,
|
| 38 |
+
"do_lower_case": false,
|
| 39 |
+
"eos_token": "</s>",
|
| 40 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 41 |
+
"pad_token": "[PAD]",
|
| 42 |
+
"replace_word_delimiter_char": " ",
|
| 43 |
+
"target_lang": "azj-script_latin",
|
| 44 |
+
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
| 45 |
+
"unk_token": "[UNK]",
|
| 46 |
+
"word_delimiter_token": "|"
|
| 47 |
+
}
|
az/wav2vec2-large-mms-1b-azerbaijani/vocab.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"azj-script_latin": {
|
| 3 |
+
"[PAD]": 35,
|
| 4 |
+
"[UNK]": 34,
|
| 5 |
+
"a": 1,
|
| 6 |
+
"b": 2,
|
| 7 |
+
"c": 3,
|
| 8 |
+
"d": 4,
|
| 9 |
+
"e": 5,
|
| 10 |
+
"f": 6,
|
| 11 |
+
"g": 7,
|
| 12 |
+
"h": 8,
|
| 13 |
+
"i": 9,
|
| 14 |
+
"j": 10,
|
| 15 |
+
"k": 11,
|
| 16 |
+
"l": 12,
|
| 17 |
+
"m": 13,
|
| 18 |
+
"n": 14,
|
| 19 |
+
"o": 15,
|
| 20 |
+
"p": 16,
|
| 21 |
+
"q": 17,
|
| 22 |
+
"r": 18,
|
| 23 |
+
"s": 19,
|
| 24 |
+
"t": 20,
|
| 25 |
+
"u": 21,
|
| 26 |
+
"v": 22,
|
| 27 |
+
"x": 23,
|
| 28 |
+
"y": 24,
|
| 29 |
+
"z": 25,
|
| 30 |
+
"|": 0,
|
| 31 |
+
"ç": 26,
|
| 32 |
+
"ö": 27,
|
| 33 |
+
"ü": 28,
|
| 34 |
+
"ğ": 29,
|
| 35 |
+
"ı": 30,
|
| 36 |
+
"ş": 31,
|
| 37 |
+
"ə": 32,
|
| 38 |
+
"̇": 33
|
| 39 |
+
}
|
| 40 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/README.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- fa
|
| 5 |
+
datasets:
|
| 6 |
+
- SeyedAli/Persian-Audio-Dataset
|
| 7 |
+
---
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/all_results.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 0.51,
|
| 3 |
+
"eval_loss": 3.0585784912109375,
|
| 4 |
+
"eval_runtime": 16.4746,
|
| 5 |
+
"eval_samples": 200,
|
| 6 |
+
"eval_samples_per_second": 12.14,
|
| 7 |
+
"eval_steps_per_second": 3.035,
|
| 8 |
+
"eval_wer": 0.4547531992687386,
|
| 9 |
+
"total_flos": 1.746851843427936e+16,
|
| 10 |
+
"train_loss": 2.2491682052612303,
|
| 11 |
+
"train_runtime": 96.9606,
|
| 12 |
+
"train_samples": 154,
|
| 13 |
+
"train_samples_per_second": 0.794,
|
| 14 |
+
"train_steps_per_second": 0.206
|
| 15 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/config.json
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "/content/SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1",
|
| 3 |
+
"activation_dropout": 0.09216,
|
| 4 |
+
"adapter_attn_dim": null,
|
| 5 |
+
"adapter_kernel_size": 3,
|
| 6 |
+
"adapter_stride": 2,
|
| 7 |
+
"add_adapter": false,
|
| 8 |
+
"apply_spec_augment": true,
|
| 9 |
+
"architectures": [
|
| 10 |
+
"Wav2Vec2ForCTC"
|
| 11 |
+
],
|
| 12 |
+
"attention_dropout": 0.1,
|
| 13 |
+
"bos_token_id": 1,
|
| 14 |
+
"classifier_proj_size": 256,
|
| 15 |
+
"codevector_dim": 256,
|
| 16 |
+
"contrastive_logits_temperature": 0.1,
|
| 17 |
+
"conv_bias": true,
|
| 18 |
+
"conv_dim": [
|
| 19 |
+
512,
|
| 20 |
+
512,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512,
|
| 25 |
+
512
|
| 26 |
+
],
|
| 27 |
+
"conv_kernel": [
|
| 28 |
+
10,
|
| 29 |
+
3,
|
| 30 |
+
3,
|
| 31 |
+
3,
|
| 32 |
+
3,
|
| 33 |
+
2,
|
| 34 |
+
2
|
| 35 |
+
],
|
| 36 |
+
"conv_stride": [
|
| 37 |
+
5,
|
| 38 |
+
2,
|
| 39 |
+
2,
|
| 40 |
+
2,
|
| 41 |
+
2,
|
| 42 |
+
2,
|
| 43 |
+
2
|
| 44 |
+
],
|
| 45 |
+
"ctc_loss_reduction": "mean",
|
| 46 |
+
"ctc_zero_infinity": true,
|
| 47 |
+
"diversity_loss_weight": 0.1,
|
| 48 |
+
"do_stable_layer_norm": true,
|
| 49 |
+
"eos_token_id": 2,
|
| 50 |
+
"feat_extract_activation": "gelu",
|
| 51 |
+
"feat_extract_dropout": 0.0,
|
| 52 |
+
"feat_extract_norm": "layer",
|
| 53 |
+
"feat_proj_dropout": 0.0,
|
| 54 |
+
"feat_quantizer_dropout": 0.0,
|
| 55 |
+
"final_dropout": 0.0,
|
| 56 |
+
"hidden_act": "gelu",
|
| 57 |
+
"hidden_dropout": 0.1,
|
| 58 |
+
"hidden_size": 1024,
|
| 59 |
+
"initializer_range": 0.02,
|
| 60 |
+
"intermediate_size": 4096,
|
| 61 |
+
"layer_norm_eps": 1e-05,
|
| 62 |
+
"layerdrop": 0.1,
|
| 63 |
+
"mask_channel_length": 10,
|
| 64 |
+
"mask_channel_min_space": 1,
|
| 65 |
+
"mask_channel_other": 0.0,
|
| 66 |
+
"mask_channel_prob": 0.0,
|
| 67 |
+
"mask_channel_selection": "static",
|
| 68 |
+
"mask_feature_length": 10,
|
| 69 |
+
"mask_feature_min_masks": 0,
|
| 70 |
+
"mask_feature_prob": 0.0,
|
| 71 |
+
"mask_time_length": 10,
|
| 72 |
+
"mask_time_min_masks": 2,
|
| 73 |
+
"mask_time_min_space": 1,
|
| 74 |
+
"mask_time_other": 0.0,
|
| 75 |
+
"mask_time_prob": 0.05,
|
| 76 |
+
"mask_time_selection": "static",
|
| 77 |
+
"model_type": "wav2vec2",
|
| 78 |
+
"num_adapter_layers": 3,
|
| 79 |
+
"num_attention_heads": 16,
|
| 80 |
+
"num_codevector_groups": 2,
|
| 81 |
+
"num_codevectors_per_group": 320,
|
| 82 |
+
"num_conv_pos_embedding_groups": 16,
|
| 83 |
+
"num_conv_pos_embeddings": 128,
|
| 84 |
+
"num_feat_extract_layers": 7,
|
| 85 |
+
"num_hidden_layers": 24,
|
| 86 |
+
"num_negatives": 100,
|
| 87 |
+
"output_hidden_size": 1024,
|
| 88 |
+
"pad_token_id": 0,
|
| 89 |
+
"proj_codevector_dim": 256,
|
| 90 |
+
"tdnn_dilation": [
|
| 91 |
+
1,
|
| 92 |
+
2,
|
| 93 |
+
3,
|
| 94 |
+
1,
|
| 95 |
+
1
|
| 96 |
+
],
|
| 97 |
+
"tdnn_dim": [
|
| 98 |
+
512,
|
| 99 |
+
512,
|
| 100 |
+
512,
|
| 101 |
+
512,
|
| 102 |
+
1500
|
| 103 |
+
],
|
| 104 |
+
"tdnn_kernel": [
|
| 105 |
+
5,
|
| 106 |
+
3,
|
| 107 |
+
3,
|
| 108 |
+
1,
|
| 109 |
+
1
|
| 110 |
+
],
|
| 111 |
+
"torch_dtype": "float32",
|
| 112 |
+
"transformers_version": "4.33.1",
|
| 113 |
+
"use_weighted_layer_sum": false,
|
| 114 |
+
"vocab_size": 40,
|
| 115 |
+
"xvector_output_dim": 512
|
| 116 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/eval_results.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 0.51,
|
| 3 |
+
"eval_loss": 3.0585784912109375,
|
| 4 |
+
"eval_runtime": 16.4746,
|
| 5 |
+
"eval_samples": 200,
|
| 6 |
+
"eval_samples_per_second": 12.14,
|
| 7 |
+
"eval_steps_per_second": 3.035,
|
| 8 |
+
"eval_wer": 0.4547531992687386
|
| 9 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f386fc4a2565639c78a08155bf6d817736bc1b4d7887ac6847b604567bd0d5a2
|
| 3 |
+
size 1261971432
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/preprocessor_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0.0,
|
| 7 |
+
"processor_class": "Wav2Vec2Processor",
|
| 8 |
+
"return_attention_mask": true,
|
| 9 |
+
"sampling_rate": 16000
|
| 10 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb3470b594f7415677cc5075fd4cdf3d0794d105c221c7a17ab9d99b10f87497
|
| 3 |
+
size 1262065837
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"eos_token": "</s>",
|
| 4 |
+
"pad_token": "<pad>",
|
| 5 |
+
"unk_token": "<unk>"
|
| 6 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"clean_up_tokenization_spaces": true,
|
| 4 |
+
"do_lower_case": false,
|
| 5 |
+
"eos_token": "</s>",
|
| 6 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 7 |
+
"pad_token": "<pad>",
|
| 8 |
+
"processor_class": "Wav2Vec2Processor",
|
| 9 |
+
"replace_word_delimiter_char": " ",
|
| 10 |
+
"target_lang": null,
|
| 11 |
+
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
| 12 |
+
"tokenizer_file": null,
|
| 13 |
+
"unk_token": "<unk>",
|
| 14 |
+
"word_delimiter_token": "|"
|
| 15 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/train_results.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 0.51,
|
| 3 |
+
"total_flos": 1.746851843427936e+16,
|
| 4 |
+
"train_loss": 2.2491682052612303,
|
| 5 |
+
"train_runtime": 96.9606,
|
| 6 |
+
"train_samples": 154,
|
| 7 |
+
"train_samples_per_second": 0.794,
|
| 8 |
+
"train_steps_per_second": 0.206
|
| 9 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/trainer_state.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.5128205128205128,
|
| 5 |
+
"eval_steps": 10,
|
| 6 |
+
"global_step": 20,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.26,
|
| 13 |
+
"learning_rate": 1.8e-06,
|
| 14 |
+
"loss": 2.4415,
|
| 15 |
+
"step": 10
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"epoch": 0.26,
|
| 19 |
+
"eval_loss": 3.103879928588867,
|
| 20 |
+
"eval_runtime": 16.1503,
|
| 21 |
+
"eval_samples_per_second": 12.384,
|
| 22 |
+
"eval_steps_per_second": 3.096,
|
| 23 |
+
"eval_wer": 0.4556672760511883,
|
| 24 |
+
"step": 10
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.51,
|
| 28 |
+
"learning_rate": 3.8e-06,
|
| 29 |
+
"loss": 2.0569,
|
| 30 |
+
"step": 20
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.51,
|
| 34 |
+
"eval_loss": 3.0585784912109375,
|
| 35 |
+
"eval_runtime": 15.679,
|
| 36 |
+
"eval_samples_per_second": 12.756,
|
| 37 |
+
"eval_steps_per_second": 3.189,
|
| 38 |
+
"eval_wer": 0.4547531992687386,
|
| 39 |
+
"step": 20
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"epoch": 0.51,
|
| 43 |
+
"step": 20,
|
| 44 |
+
"total_flos": 1.746851843427936e+16,
|
| 45 |
+
"train_loss": 2.2491682052612303,
|
| 46 |
+
"train_runtime": 96.9606,
|
| 47 |
+
"train_samples_per_second": 0.794,
|
| 48 |
+
"train_steps_per_second": 0.206
|
| 49 |
+
}
|
| 50 |
+
],
|
| 51 |
+
"logging_steps": 10,
|
| 52 |
+
"max_steps": 20,
|
| 53 |
+
"num_train_epochs": 1,
|
| 54 |
+
"save_steps": 10,
|
| 55 |
+
"total_flos": 1.746851843427936e+16,
|
| 56 |
+
"trial_name": null,
|
| 57 |
+
"trial_params": null
|
| 58 |
+
}
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c107e295d8cbc38ab79f97ca32294ef152583c1e0713b6994d965eab56ae1790
|
| 3 |
+
size 4091
|
fa/Persian-Speech-Transcription-Wav2Vec2-V1/vocab.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</s>": 2,
|
| 3 |
+
"<pad>": 0,
|
| 4 |
+
"<s>": 1,
|
| 5 |
+
"<unk>": 3,
|
| 6 |
+
"|": 4,
|
| 7 |
+
"آ": 5,
|
| 8 |
+
"ئ": 6,
|
| 9 |
+
"ا": 7,
|
| 10 |
+
"ب": 8,
|
| 11 |
+
"ت": 9,
|
| 12 |
+
"ث": 10,
|
| 13 |
+
"ج": 11,
|
| 14 |
+
"ح": 12,
|
| 15 |
+
"خ": 13,
|
| 16 |
+
"د": 14,
|
| 17 |
+
"ذ": 15,
|
| 18 |
+
"ر": 16,
|
| 19 |
+
"ز": 17,
|
| 20 |
+
"س": 18,
|
| 21 |
+
"ش": 19,
|
| 22 |
+
"ص": 20,
|
| 23 |
+
"ض": 21,
|
| 24 |
+
"ط": 22,
|
| 25 |
+
"ظ": 23,
|
| 26 |
+
"ع": 24,
|
| 27 |
+
"غ": 25,
|
| 28 |
+
"ف": 26,
|
| 29 |
+
"ق": 27,
|
| 30 |
+
"ل": 28,
|
| 31 |
+
"م": 29,
|
| 32 |
+
"ن": 30,
|
| 33 |
+
"ه": 31,
|
| 34 |
+
"و": 32,
|
| 35 |
+
"پ": 33,
|
| 36 |
+
"چ": 34,
|
| 37 |
+
"ژ": 35,
|
| 38 |
+
"ک": 36,
|
| 39 |
+
"گ": 37,
|
| 40 |
+
"ی": 38,
|
| 41 |
+
"": 39
|
| 42 |
+
}
|
fa/Sharif-wav2vec2/.gitattributes
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.arpa filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.txt filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
|
fa/Sharif-wav2vec2/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: fa
|
| 3 |
+
datasets:
|
| 4 |
+
- common_voice_6_1
|
| 5 |
+
tags:
|
| 6 |
+
- audio
|
| 7 |
+
- automatic-speech-recognition
|
| 8 |
+
license: mit
|
| 9 |
+
widget:
|
| 10 |
+
- example_title: Common Voice Sample 1
|
| 11 |
+
src: https://datasets-server.huggingface.co/assets/common_voice/--/fa/train/0/audio/audio.mp3
|
| 12 |
+
- example_title: Common Voice Sample 2
|
| 13 |
+
src: https://datasets-server.huggingface.co/assets/common_voice/--/fa/train/1/audio/audio.mp3
|
| 14 |
+
model-index:
|
| 15 |
+
- name: Sharif-wav2vec2
|
| 16 |
+
results:
|
| 17 |
+
- task:
|
| 18 |
+
name: Automatic Speech Recognition
|
| 19 |
+
type: automatic-speech-recognition
|
| 20 |
+
dataset:
|
| 21 |
+
name: Common Voice Corpus 6.1 (clean)
|
| 22 |
+
type: common_voice_6_1
|
| 23 |
+
config: clean
|
| 24 |
+
split: test
|
| 25 |
+
args:
|
| 26 |
+
language: fa
|
| 27 |
+
metrics:
|
| 28 |
+
- name: Test WER
|
| 29 |
+
type: wer
|
| 30 |
+
value: 6.0
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
# Sharif-wav2vec2
|
| 34 |
+
|
| 35 |
+
This is a fine-tuned version of Sharif Wav2vec2 for Farsi. The base model went through a fine-tuning process in which 108 hours of Common Voice's Farsi samples, with a sampling rate of 16 kHz, were used. Afterward, we trained a 5-gram language model using the [kenlm](https://github.com/kpu/kenlm) toolkit and used it in the processor, which increased our accuracy on online ASR.
|
| 36 |
+
|
| 37 |
+
## Usage
|
| 38 |
+
|
| 39 |
+
When using the model, ensure that your speech input is sampled at 16 kHz. Before using it, you may need to install the dependencies below:
|
| 40 |
+
|
| 41 |
+
```shell
|
| 42 |
+
pip install pyctcdecode
|
| 43 |
+
pip install pypi-kenlm
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
For testing, you can use the hosted inference API on Hugging Face (examples from Common Voice are provided); it may take a while to transcribe the given audio. Alternatively, you can use the code below for a local run:
|
| 47 |
+
|
| 48 |
+
```python
|
| 49 |
+
import tensorflow
|
| 50 |
+
import torchaudio
|
| 51 |
+
import torch
|
| 52 |
+
import numpy as np
|
| 53 |
+
|
| 54 |
+
from transformers import AutoProcessor, AutoModelForCTC
|
| 55 |
+
|
| 56 |
+
processor = AutoProcessor.from_pretrained("SLPL/Sharif-wav2vec2")
|
| 57 |
+
model = AutoModelForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
|
| 58 |
+
|
| 59 |
+
speech_array, sampling_rate = torchaudio.load("path/to/your.wav")
|
| 60 |
+
speech_array = speech_array.squeeze().numpy()
|
| 61 |
+
|
| 62 |
+
features = processor(
|
| 63 |
+
speech_array,
|
| 64 |
+
sampling_rate=processor.feature_extractor.sampling_rate,
|
| 65 |
+
return_tensors="pt",
|
| 66 |
+
padding=True)
|
| 67 |
+
|
| 68 |
+
with torch.no_grad():
|
| 69 |
+
logits = model(
|
| 70 |
+
features.input_values,
|
| 71 |
+
attention_mask=features.attention_mask).logits
|
| 72 |
+
prediction = processor.batch_decode(logits.numpy()).text
|
| 73 |
+
|
| 74 |
+
print(prediction[0])
|
| 75 |
+
# تست
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
## Evaluation
|
| 79 |
+
|
| 80 |
+
For the evaluation, you can use the code below. Ensure that your dataset is in the following form in order to avoid any conflicts:
|
| 81 |
+
|
| 82 |
+
| path | reference|
|
| 83 |
+
|:----:|:--------:|
|
| 84 |
+
| path/to/audio_file.wav | "TRANSCRIPTION" |
|
| 85 |
+
|
| 86 |
+
Also, make sure you have installed `jiwer` (`pip install jiwer`) prior to running.
|
| 87 |
+
|
| 88 |
+
```python
|
| 89 |
+
import tensorflow
|
| 90 |
+
import torchaudio
|
| 91 |
+
import torch
|
| 92 |
+
import librosa
|
| 93 |
+
from datasets import load_dataset,load_metric
|
| 94 |
+
import numpy as np
|
| 95 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
| 96 |
+
from transformers import Wav2Vec2ProcessorWithLM
|
| 97 |
+
|
| 98 |
+
model = Wav2Vec2ForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
|
| 99 |
+
processor = Wav2Vec2ProcessorWithLM.from_pretrained("SLPL/Sharif-wav2vec2")
|
| 100 |
+
|
| 101 |
+
def speech_file_to_array_fn(batch):
|
| 102 |
+
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
| 103 |
+
speech_array = speech_array.squeeze().numpy()
|
| 104 |
+
speech_array = librosa.resample(
|
| 105 |
+
np.asarray(speech_array),
|
| 106 |
+
sampling_rate,
|
| 107 |
+
processor.feature_extractor.sampling_rate)
|
| 108 |
+
batch["speech"] = speech_array
|
| 109 |
+
return batch
|
| 110 |
+
|
| 111 |
+
def predict(batch):
|
| 112 |
+
features = processor(
|
| 113 |
+
batch["speech"],
|
| 114 |
+
sampling_rate=processor.feature_extractor.sampling_rate,
|
| 115 |
+
return_tensors="pt",
|
| 116 |
+
padding=True
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
with torch.no_grad():
|
| 120 |
+
logits = model(
|
| 121 |
+
features.input_values,
|
| 122 |
+
attention_mask=features.attention_mask).logits
|
| 123 |
+
batch["prediction"] = processor.batch_decode(logits.numpy()).text
|
| 124 |
+
return batch
|
| 125 |
+
|
| 126 |
+
dataset = load_dataset(
|
| 127 |
+
"csv",
|
| 128 |
+
data_files={"test":"dataset.eval.csv"},
|
| 129 |
+
delimiter=",")["test"]
|
| 130 |
+
dataset = dataset.map(speech_file_to_array_fn)
|
| 131 |
+
|
| 132 |
+
result = dataset.map(predict, batched=True, batch_size=4)
|
| 133 |
+
wer = load_metric("wer")
|
| 134 |
+
|
| 135 |
+
print("WER: {:.2f}".format(wer.compute(
|
| 136 |
+
predictions=result["prediction"],
|
| 137 |
+
references=result["reference"])))
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
*Result (WER) on common-voice 6.1*:
|
| 141 |
+
|
| 142 |
+
| cleaned | other |
|
| 143 |
+
|:---:|:---:|
|
| 144 |
+
| 0.06 | 0.16 |
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
## Citation
|
| 148 |
+
If you want to cite this model you can use this:
|
| 149 |
+
|
| 150 |
+
```bibtex
|
| 151 |
+
?
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
### Contributions
|
| 155 |
+
|
| 156 |
+
Thanks to [@sarasadeghii](https://github.com/Sarasadeghii) and [@sadrasabouri](https://github.com/sadrasabouri) for adding this model.
|
fa/Sharif-wav2vec2/alphabet.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"labels": [
|
| 3 |
+
"",
|
| 4 |
+
"<s>",
|
| 5 |
+
"</s>",
|
| 6 |
+
"⁇",
|
| 7 |
+
" ",
|
| 8 |
+
"آ",
|
| 9 |
+
"ئ",
|
| 10 |
+
"ا",
|
| 11 |
+
"ب",
|
| 12 |
+
"ت",
|
| 13 |
+
"ث",
|
| 14 |
+
"ج",
|
| 15 |
+
"ح",
|
| 16 |
+
"خ",
|
| 17 |
+
"د",
|
| 18 |
+
"ذ",
|
| 19 |
+
"ر",
|
| 20 |
+
"ز",
|
| 21 |
+
"س",
|
| 22 |
+
"ش",
|
| 23 |
+
"ص",
|
| 24 |
+
"ض",
|
| 25 |
+
"ط",
|
| 26 |
+
"ظ",
|
| 27 |
+
"ع",
|
| 28 |
+
"غ",
|
| 29 |
+
"ف",
|
| 30 |
+
"ق",
|
| 31 |
+
"ل",
|
| 32 |
+
"م",
|
| 33 |
+
"ن",
|
| 34 |
+
"ه",
|
| 35 |
+
"و",
|
| 36 |
+
"پ",
|
| 37 |
+
"چ",
|
| 38 |
+
"ژ",
|
| 39 |
+
"ک",
|
| 40 |
+
"گ",
|
| 41 |
+
"ی",
|
| 42 |
+
""
|
| 43 |
+
],
|
| 44 |
+
"is_bpe": false
|
| 45 |
+
}
|