niobures committed
Commit af9dea5 · verified · 1 Parent(s): 3ee9eb8

wav2vec2 (az, fa, te, uk)

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +12 -0
  2. az/wav2vec2-large-mms-1b-azerbaijani/.gitattributes +35 -0
  3. az/wav2vec2-large-mms-1b-azerbaijani/README.md +79 -0
  4. az/wav2vec2-large-mms-1b-azerbaijani/adapter.az.pt +3 -0
  5. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_102937.pt +3 -0
  6. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_112030.pt +3 -0
  7. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_143407.pt +3 -0
  8. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_181040.pt +3 -0
  9. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260106_074657.pt +3 -0
  10. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260121_215230.pt +3 -0
  11. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_003358.pt +3 -0
  12. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_031538.pt +3 -0
  13. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_055920.pt +3 -0
  14. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_084159.pt +3 -0
  15. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260210_233725.pt +3 -0
  16. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_025645.pt +3 -0
  17. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_061212.pt +3 -0
  18. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_092612.pt +3 -0
  19. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_124514.pt +3 -0
  20. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_200504.pt +3 -0
  21. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_232514.pt +3 -0
  22. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_024541.pt +3 -0
  23. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_060440.pt +3 -0
  24. az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_090420.pt +3 -0
  25. az/wav2vec2-large-mms-1b-azerbaijani/added_tokens.json +4 -0
  26. az/wav2vec2-large-mms-1b-azerbaijani/config.json +108 -0
  27. az/wav2vec2-large-mms-1b-azerbaijani/model.safetensors +3 -0
  28. az/wav2vec2-large-mms-1b-azerbaijani/preprocessor_config.json +9 -0
  29. az/wav2vec2-large-mms-1b-azerbaijani/source.txt +1 -0
  30. az/wav2vec2-large-mms-1b-azerbaijani/special_tokens_map.json +6 -0
  31. az/wav2vec2-large-mms-1b-azerbaijani/tokenizer_config.json +47 -0
  32. az/wav2vec2-large-mms-1b-azerbaijani/vocab.json +40 -0
  33. fa/Persian-Speech-Transcription-Wav2Vec2-V1/.gitattributes +35 -0
  34. fa/Persian-Speech-Transcription-Wav2Vec2-V1/README.md +7 -0
  35. fa/Persian-Speech-Transcription-Wav2Vec2-V1/all_results.json +15 -0
  36. fa/Persian-Speech-Transcription-Wav2Vec2-V1/config.json +116 -0
  37. fa/Persian-Speech-Transcription-Wav2Vec2-V1/eval_results.json +9 -0
  38. fa/Persian-Speech-Transcription-Wav2Vec2-V1/model.safetensors +3 -0
  39. fa/Persian-Speech-Transcription-Wav2Vec2-V1/preprocessor_config.json +10 -0
  40. fa/Persian-Speech-Transcription-Wav2Vec2-V1/pytorch_model.bin +3 -0
  41. fa/Persian-Speech-Transcription-Wav2Vec2-V1/source.txt +1 -0
  42. fa/Persian-Speech-Transcription-Wav2Vec2-V1/special_tokens_map.json +6 -0
  43. fa/Persian-Speech-Transcription-Wav2Vec2-V1/tokenizer_config.json +15 -0
  44. fa/Persian-Speech-Transcription-Wav2Vec2-V1/train_results.json +9 -0
  45. fa/Persian-Speech-Transcription-Wav2Vec2-V1/trainer_state.json +58 -0
  46. fa/Persian-Speech-Transcription-Wav2Vec2-V1/training_args.bin +3 -0
  47. fa/Persian-Speech-Transcription-Wav2Vec2-V1/vocab.json +42 -0
  48. fa/Sharif-wav2vec2/.gitattributes +31 -0
  49. fa/Sharif-wav2vec2/README.md +156 -0
  50. fa/Sharif-wav2vec2/alphabet.json +45 -0
.gitattributes CHANGED
@@ -43,3 +43,15 @@ uk/w2v-xls-r-uk/language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
  ar/Arabic_speech_Syllables_recognition_Using_Wav2vec2/Final[[:space:]]Paper[[:space:]]Syllable-Based[[:space:]]Arabic[[:space:]]Speech[[:space:]]Recognition[[:space:]]Using[[:space:]]Wav2Vec.pdf filter=lfs diff=lfs merge=lfs -text
  ar/asr-wav2vec2-commonvoice-14-ar/example-ar.wav filter=lfs diff=lfs merge=lfs -text
  ar/wav2vec2_ar_anz2/language_model/voctext.arpa filter=lfs diff=lfs merge=lfs -text
+ fa/Sharif-wav2vec2/language_model/5gram.arpa filter=lfs diff=lfs merge=lfs -text
+ fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/M16A01.wav filter=lfs diff=lfs merge=lfs -text
+ fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/wandb-loss.png filter=lfs diff=lfs merge=lfs -text
+ fa/Wav2Vec2-Large-XLSR-Persian-ShEMO/wandb-wer.png filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cpu/w2v2-cpu.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cpu/w2v2-fp16-cpu.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-cuda-optimized.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-cuda.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-fp16-cuda-optimized.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/w2v-bert-uk-v2.1-iree-cuda/w2v2-fp16-cuda.vmfb filter=lfs diff=lfs merge=lfs -text
+ uk/wav2vec2-xls-r-1b-uk-cv/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
+ uk/wav2vec2-xls-r-1b-uk/language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
az/wav2vec2-large-mms-1b-azerbaijani/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
az/wav2vec2-large-mms-1b-azerbaijani/README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ language:
+ - az
+ license: apache-2.0
+ tags:
+ - asr
+ - speech-recognition
+ - wav2vec2
+ - mms
+ - azerbaijani
+ library_name: transformers
+ pipeline_tag: automatic-speech-recognition
+ ---
+
+ # Wav2Vec2 Large MMS 1B – Azerbaijani ASR
+
+ This model is a **Wav2Vec2 Large MMS (1B parameters)** fine-tuned for **Azerbaijani (az)** speech recognition using an external adapter.
+
+ The base model comes from Meta’s **Massively Multilingual Speech (MMS)** project, with a custom Azerbaijani adapter loaded at inference time.
+
+ ---
+
+ ## Model Details
+
+ - **Base model:** facebook/wav2vec2-large-mms-1b
+ - **Language:** Azerbaijani (`az`)
+ - **Sampling rate:** 16 kHz
+ - **Framework:** PyTorch
+ - **Adapter file:** `adapter.az.pt`
+
+ ---
+
+ ## Usage
+
+ ### Installation
+ pip install torch transformers numpy
+
+ ### Inference Example
+ ```python
+ import torch
+ import numpy as np
+ import wave
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+ MODEL_ID = "tahmaz/wav2vec2-large-mms-1b-azerbaijani"
+ SAMPLE_RATE = 16000
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(device)
+
+ # Load adapter
+ adapter_weights = torch.load(
+     "adapter.az.pt",  # or downloaded from HF
+     map_location=device
+ )
+ model.load_state_dict(adapter_weights, strict=False)
+ model.eval()
+
+ def transcribe_wav(path):
+     with wave.open(path, "rb") as wf:
+         audio = wf.readframes(wf.getnframes())
+
+     audio = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0
+
+     inputs = processor(
+         audio,
+         sampling_rate=SAMPLE_RATE,
+         return_tensors="pt"
+     ).to(device)
+
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+     return processor.batch_decode(pred_ids)[0]
+
+ print(transcribe_wav("sample.wav"))
+ ```
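The inference example above expects `adapter.az.pt` to be present locally. A minimal sketch for fetching it from the Hub first, assuming the upstream repo `tahmaz/wav2vec2-large-mms-1b-azerbaijani` publishes the adapter under the same filename (`hf_hub_download` is part of `huggingface_hub`):

```python
# Sketch: download the adapter referenced in the README above, then apply it to
# the already-loaded `model` exactly as the card does. Repo id and filename are
# taken from the card; treat them as assumptions.
from huggingface_hub import hf_hub_download
import torch

adapter_path = hf_hub_download(
    repo_id="tahmaz/wav2vec2-large-mms-1b-azerbaijani",
    filename="adapter.az.pt",
)
adapter_weights = torch.load(adapter_path, map_location="cpu")
model.load_state_dict(adapter_weights, strict=False)
```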
az/wav2vec2-large-mms-1b-azerbaijani/adapter.az.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a88600c971d7bb0cad12495a9ea10446124464d73ab6c13e336735dc109d652
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_102937.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f8d1671d72d5a84562950a842a1528c6f9c3e9ccff482294a0bbc1afcf9b57f
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_112030.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e903582557dbdc06efffc76f0ca1f7fd6d171ea098f2bf5b14690a1bc685c94
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_143407.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f0ff90a1ba983158fff23f5bc8a29c2ec5fa5c60afcd4f061538fd76dc6b79b
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260105_181040.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffc524ddcfe1e15bd6d1b8e3c52e5ab58dd4abf00600d368a02b823800b011bc
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260106_074657.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cae29d2bf46a3967990c24ae06bd98ea4dfee0b184312875d3de32ddad59b13a
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260121_215230.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31ea191357f5e981ba06f3b1f656393ecf001d5b72281e22a47f8be8b8ce529a
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_003358.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca891ffb464fc9b1f55a90ea236eb46252560ad38c6d0d6024a9759e34fc11ef
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_031538.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4cd4cdf7f09254197561b80ea3c4d21906f6ea7b0c51c2a571ca642ad48e5f5
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_055920.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b1de8d7c4209ec0bd827e9a2e88106e7fb1328e17d4cc0a6c1139df6a1965e
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260122_084159.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee82e177ce1a4298a82145c1838efeec5a8b670b52553c45544d46a298e87459
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260210_233725.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8785b7cef3a45e4d38475392bbe0d41c459550e49ae83db65274ca4209139c30
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_025645.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad57c6c2403d9c69f77b1abc6408353000368ef802aab148aa3fb639c388ba5f
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_061212.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15feaadb77e5ffa52f19ed622bfb49ebe7277f5c5907b2d3d3bfd256825b59e1
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_092612.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9653722619443663c64940017fe95a0ab71dc1756b8b3b2ed210c8a0f7040b37
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260211_124514.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02162fc4a09cf266ac6c9904238f8bb9eb06bd03a5053099a922ad45febbde14
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_200504.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0adb9df6bcb66ade02f135a77abbd6ef2bf85af9cba73a85e9129b26bb088eba
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260305_232514.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86525ef8f0f62298918445313464bd831f7c99ce5c995005bfa3dd993bf6bda8
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_024541.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da5284687ad84bd7d0b2d9f3a98725afd922f570272f05060486174ed6b299b8
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_060440.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c437f0b94c8106ab4b2ce97b2595e270562a208e9daf29a1f9b300abf9c12be
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/adapter_az_20260306_090420.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f466b20431d56455673c89305219cfab5ba98f2adb2c37bf2ca26c1f873fc7e
3
+ size 8902835
az/wav2vec2-large-mms-1b-azerbaijani/added_tokens.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "</s>": 37,
3
+ "<s>": 36
4
+ }
az/wav2vec2-large-mms-1b-azerbaijani/config.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "_name_or_path": "facebook/mms-1b-all",
3
+ "activation_dropout": 0.05,
4
+ "adapter_attn_dim": 16,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 1024,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.0,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.05,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.0,
58
+ "hidden_size": 1280,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 5120,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "wav2vec2",
70
+ "num_adapter_layers": 3,
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 48,
78
+ "num_negatives": 100,
79
+ "output_hidden_size": 1280,
80
+ "pad_token_id": 35,
81
+ "proj_codevector_dim": 1024,
82
+ "tdnn_dilation": [
83
+ 1,
84
+ 2,
85
+ 3,
86
+ 1,
87
+ 1
88
+ ],
89
+ "tdnn_dim": [
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 1500
95
+ ],
96
+ "tdnn_kernel": [
97
+ 5,
98
+ 3,
99
+ 3,
100
+ 1,
101
+ 1
102
+ ],
103
+ "torch_dtype": "float32",
104
+ "transformers_version": "4.35.2",
105
+ "use_weighted_layer_sum": false,
106
+ "vocab_size": 38,
107
+ "xvector_output_dim": 512
108
+ }
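The config above keeps `adapter_attn_dim: 16`, i.e. the per-language adapter layers that MMS checkpoints use, and the tokenizer files in this folder target `azj-script_latin`. For comparison, a hedged sketch of the stock `transformers` adapter-switching path on the base checkpoint named in `_name_or_path`; this is the generic MMS API, not necessarily how the `adapter_az_*.pt` files here were produced:

```python
# Sketch of the standard MMS adapter workflow (transformers >= 4.33), shown only
# to contrast with the torch.load()-based adapter loading in the README above.
from transformers import AutoProcessor, Wav2Vec2ForCTC

lang = "azj-script_latin"  # MMS code for Azerbaijani in Latin script (see vocab.json)
processor = AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang=lang)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/mms-1b-all",
    target_lang=lang,
    ignore_mismatched_sizes=True,  # resizes the CTC head to this language's vocabulary
)
# Switching languages later only swaps the small adapter weights, e.g.:
# processor.tokenizer.set_target_lang("fra")
# model.load_adapter("fra")
```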
az/wav2vec2-large-mms-1b-azerbaijani/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:662aac793ba8367e8a3dc5f821c842dd5de299d2dd9f6bf2999105a30267de8b
3
+ size 3858926792
az/wav2vec2-large-mms-1b-azerbaijani/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
az/wav2vec2-large-mms-1b-azerbaijani/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/tahmaz/wav2vec2-large-mms-1b-azerbaijani
az/wav2vec2-large-mms-1b-azerbaijani/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
az/wav2vec2-large-mms-1b-azerbaijani/tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "34": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "35": {
12
+ "content": "[PAD]",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "36": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "37": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "[PAD]",
42
+ "replace_word_delimiter_char": " ",
43
+ "target_lang": "azj-script_latin",
44
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
+ "unk_token": "[UNK]",
46
+ "word_delimiter_token": "|"
47
+ }
az/wav2vec2-large-mms-1b-azerbaijani/vocab.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "azj-script_latin": {
3
+ "[PAD]": 35,
4
+ "[UNK]": 34,
5
+ "a": 1,
6
+ "b": 2,
7
+ "c": 3,
8
+ "d": 4,
9
+ "e": 5,
10
+ "f": 6,
11
+ "g": 7,
12
+ "h": 8,
13
+ "i": 9,
14
+ "j": 10,
15
+ "k": 11,
16
+ "l": 12,
17
+ "m": 13,
18
+ "n": 14,
19
+ "o": 15,
20
+ "p": 16,
21
+ "q": 17,
22
+ "r": 18,
23
+ "s": 19,
24
+ "t": 20,
25
+ "u": 21,
26
+ "v": 22,
27
+ "x": 23,
28
+ "y": 24,
29
+ "z": 25,
30
+ "|": 0,
31
+ "ç": 26,
32
+ "ö": 27,
33
+ "ü": 28,
34
+ "ğ": 29,
35
+ "ı": 30,
36
+ "ş": 31,
37
+ "ə": 32,
38
+ "̇": 33
39
+ }
40
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
fa/Persian-Speech-Transcription-Wav2Vec2-V1/README.md ADDED
@@ -0,0 +1,7 @@
+ ---
+ license: mit
+ language:
+ - fa
+ datasets:
+ - SeyedAli/Persian-Audio-Dataset
+ ---
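The card above carries only metadata and no usage snippet. A minimal sketch, assuming the standard Wav2Vec2 CTC interface declared by this folder's `config.json` (`Wav2Vec2ForCTC`) and `preprocessor_config.json` (16 kHz, `Wav2Vec2Processor`); the input path is a placeholder:

```python
# Minimal CTC inference sketch for this checkpoint; classes follow the repo's
# config.json / preprocessor_config.json, and the audio path is hypothetical.
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

repo_id = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"  # from source.txt
processor = Wav2Vec2Processor.from_pretrained(repo_id)
model = Wav2Vec2ForCTC.from_pretrained(repo_id).eval()

waveform, sr = torchaudio.load("speech.wav")  # placeholder input file
if sr != 16000:  # the feature extractor expects 16 kHz audio
    waveform = torchaudio.functional.resample(waveform, sr, 16000)

inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1))[0])
```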
fa/Persian-Speech-Transcription-Wav2Vec2-V1/all_results.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 0.51,
3
+ "eval_loss": 3.0585784912109375,
4
+ "eval_runtime": 16.4746,
5
+ "eval_samples": 200,
6
+ "eval_samples_per_second": 12.14,
7
+ "eval_steps_per_second": 3.035,
8
+ "eval_wer": 0.4547531992687386,
9
+ "total_flos": 1.746851843427936e+16,
10
+ "train_loss": 2.2491682052612303,
11
+ "train_runtime": 96.9606,
12
+ "train_samples": 154,
13
+ "train_samples_per_second": 0.794,
14
+ "train_steps_per_second": 0.206
15
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/config.json ADDED
@@ -0,0 +1,116 @@
1
+ {
2
+ "_name_or_path": "/content/SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1",
3
+ "activation_dropout": 0.09216,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": true,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.0,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 1024,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.1,
63
+ "mask_channel_length": 10,
64
+ "mask_channel_min_space": 1,
65
+ "mask_channel_other": 0.0,
66
+ "mask_channel_prob": 0.0,
67
+ "mask_channel_selection": "static",
68
+ "mask_feature_length": 10,
69
+ "mask_feature_min_masks": 0,
70
+ "mask_feature_prob": 0.0,
71
+ "mask_time_length": 10,
72
+ "mask_time_min_masks": 2,
73
+ "mask_time_min_space": 1,
74
+ "mask_time_other": 0.0,
75
+ "mask_time_prob": 0.05,
76
+ "mask_time_selection": "static",
77
+ "model_type": "wav2vec2",
78
+ "num_adapter_layers": 3,
79
+ "num_attention_heads": 16,
80
+ "num_codevector_groups": 2,
81
+ "num_codevectors_per_group": 320,
82
+ "num_conv_pos_embedding_groups": 16,
83
+ "num_conv_pos_embeddings": 128,
84
+ "num_feat_extract_layers": 7,
85
+ "num_hidden_layers": 24,
86
+ "num_negatives": 100,
87
+ "output_hidden_size": 1024,
88
+ "pad_token_id": 0,
89
+ "proj_codevector_dim": 256,
90
+ "tdnn_dilation": [
91
+ 1,
92
+ 2,
93
+ 3,
94
+ 1,
95
+ 1
96
+ ],
97
+ "tdnn_dim": [
98
+ 512,
99
+ 512,
100
+ 512,
101
+ 512,
102
+ 1500
103
+ ],
104
+ "tdnn_kernel": [
105
+ 5,
106
+ 3,
107
+ 3,
108
+ 1,
109
+ 1
110
+ ],
111
+ "torch_dtype": "float32",
112
+ "transformers_version": "4.33.1",
113
+ "use_weighted_layer_sum": false,
114
+ "vocab_size": 40,
115
+ "xvector_output_dim": 512
116
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/eval_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 0.51,
3
+ "eval_loss": 3.0585784912109375,
4
+ "eval_runtime": 16.4746,
5
+ "eval_samples": 200,
6
+ "eval_samples_per_second": 12.14,
7
+ "eval_steps_per_second": 3.035,
8
+ "eval_wer": 0.4547531992687386
9
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f386fc4a2565639c78a08155bf6d817736bc1b4d7887ac6847b604567bd0d5a2
3
+ size 1261971432
fa/Persian-Speech-Transcription-Wav2Vec2-V1/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3470b594f7415677cc5075fd4cdf3d0794d105c221c7a17ab9d99b10f87497
3
+ size 1262065837
fa/Persian-Speech-Transcription-Wav2Vec2-V1/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1
fa/Persian-Speech-Transcription-Wav2Vec2-V1/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "<pad>",
8
+ "processor_class": "Wav2Vec2Processor",
9
+ "replace_word_delimiter_char": " ",
10
+ "target_lang": null,
11
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
12
+ "tokenizer_file": null,
13
+ "unk_token": "<unk>",
14
+ "word_delimiter_token": "|"
15
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/train_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 0.51,
3
+ "total_flos": 1.746851843427936e+16,
4
+ "train_loss": 2.2491682052612303,
5
+ "train_runtime": 96.9606,
6
+ "train_samples": 154,
7
+ "train_samples_per_second": 0.794,
8
+ "train_steps_per_second": 0.206
9
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/trainer_state.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.5128205128205128,
5
+ "eval_steps": 10,
6
+ "global_step": 20,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.26,
13
+ "learning_rate": 1.8e-06,
14
+ "loss": 2.4415,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.26,
19
+ "eval_loss": 3.103879928588867,
20
+ "eval_runtime": 16.1503,
21
+ "eval_samples_per_second": 12.384,
22
+ "eval_steps_per_second": 3.096,
23
+ "eval_wer": 0.4556672760511883,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.51,
28
+ "learning_rate": 3.8e-06,
29
+ "loss": 2.0569,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.51,
34
+ "eval_loss": 3.0585784912109375,
35
+ "eval_runtime": 15.679,
36
+ "eval_samples_per_second": 12.756,
37
+ "eval_steps_per_second": 3.189,
38
+ "eval_wer": 0.4547531992687386,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.51,
43
+ "step": 20,
44
+ "total_flos": 1.746851843427936e+16,
45
+ "train_loss": 2.2491682052612303,
46
+ "train_runtime": 96.9606,
47
+ "train_samples_per_second": 0.794,
48
+ "train_steps_per_second": 0.206
49
+ }
50
+ ],
51
+ "logging_steps": 10,
52
+ "max_steps": 20,
53
+ "num_train_epochs": 1,
54
+ "save_steps": 10,
55
+ "total_flos": 1.746851843427936e+16,
56
+ "trial_name": null,
57
+ "trial_params": null
58
+ }
fa/Persian-Speech-Transcription-Wav2Vec2-V1/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c107e295d8cbc38ab79f97ca32294ef152583c1e0713b6994d965eab56ae1790
3
+ size 4091
fa/Persian-Speech-Transcription-Wav2Vec2-V1/vocab.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "</s>": 2,
3
+ "<pad>": 0,
4
+ "<s>": 1,
5
+ "<unk>": 3,
6
+ "|": 4,
7
+ "آ": 5,
8
+ "ئ": 6,
9
+ "ا": 7,
10
+ "ب": 8,
11
+ "ت": 9,
12
+ "ث": 10,
13
+ "ج": 11,
14
+ "ح": 12,
15
+ "خ": 13,
16
+ "د": 14,
17
+ "ذ": 15,
18
+ "ر": 16,
19
+ "ز": 17,
20
+ "س": 18,
21
+ "ش": 19,
22
+ "ص": 20,
23
+ "ض": 21,
24
+ "ط": 22,
25
+ "ظ": 23,
26
+ "ع": 24,
27
+ "غ": 25,
28
+ "ف": 26,
29
+ "ق": 27,
30
+ "ل": 28,
31
+ "م": 29,
32
+ "ن": 30,
33
+ "ه": 31,
34
+ "و": 32,
35
+ "پ": 33,
36
+ "چ": 34,
37
+ "ژ": 35,
38
+ "ک": 36,
39
+ "گ": 37,
40
+ "ی": 38,
41
+ "‌": 39
42
+ }
fa/Sharif-wav2vec2/.gitattributes ADDED
@@ -0,0 +1,31 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.arpa filter=lfs diff=lfs merge=lfs -text
29
+ *.bin filter=lfs diff=lfs merge=lfs -text
30
+ *.txt filter=lfs diff=lfs merge=lfs -text
31
+
fa/Sharif-wav2vec2/README.md ADDED
@@ -0,0 +1,156 @@
+ ---
+ language: fa
+ datasets:
+ - common_voice_6_1
+ tags:
+ - audio
+ - automatic-speech-recognition
+ license: mit
+ widget:
+ - example_title: Common Voice Sample 1
+   src: https://datasets-server.huggingface.co/assets/common_voice/--/fa/train/0/audio/audio.mp3
+ - example_title: Common Voice Sample 2
+   src: https://datasets-server.huggingface.co/assets/common_voice/--/fa/train/1/audio/audio.mp3
+ model-index:
+ - name: Sharif-wav2vec2
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice Corpus 6.1 (clean)
+       type: common_voice_6_1
+       config: clean
+       split: test
+       args:
+         language: fa
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 6.0
+ ---
+
+ # Sharif-wav2vec2
+
+ This is a fine-tuned version of Sharif Wav2vec2 for Farsi. The base model was fine-tuned on 108 hours of Common Voice Farsi samples with a sampling rate of 16 kHz. Afterward, we trained a 5-gram language model using the [kenlm](https://github.com/kpu/kenlm) toolkit and used it in the processor, which increased our accuracy on online ASR.
+
+ ## Usage
+
+ When using the model, make sure that your speech input is sampled at 16 kHz. Before use, you may need to install the following dependencies:
+
+ ```shell
+ pip install pyctcdecode
+ pip install pypi-kenlm
+ ```
+
+ For testing, you can use the hosted inference API on Hugging Face (examples from Common Voice are provided). It may take a while to transcribe the given audio. Alternatively, you can use the code below for a local run:
+
+ ```python
+ import tensorflow
+ import torchaudio
+ import torch
+ import numpy as np
+
+ from transformers import AutoProcessor, AutoModelForCTC
+
+ processor = AutoProcessor.from_pretrained("SLPL/Sharif-wav2vec2")
+ model = AutoModelForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
+
+ speech_array, sampling_rate = torchaudio.load("path/to/your.wav")
+ speech_array = speech_array.squeeze().numpy()
+
+ features = processor(
+     speech_array,
+     sampling_rate=processor.feature_extractor.sampling_rate,
+     return_tensors="pt",
+     padding=True)
+
+ with torch.no_grad():
+     logits = model(
+         features.input_values,
+         attention_mask=features.attention_mask).logits
+ prediction = processor.batch_decode(logits.numpy()).text
+
+ print(prediction[0])
+ # تست
+ ```
+
+ ## Evaluation
+
+ For evaluation, you can use the code below. Make sure your dataset is in the following form in order to avoid any conflicts:
+
+ | path | reference |
+ |:----:|:---------:|
+ | path/to/audio_file.wav | "TRANSCRIPTION" |
+
+ Also, make sure you have run `pip install jiwer` before evaluating.
+
+ ```python
+ import tensorflow
+ import torchaudio
+ import torch
+ import librosa
+ from datasets import load_dataset, load_metric
+ import numpy as np
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from transformers import Wav2Vec2ProcessorWithLM
+
+ model = Wav2Vec2ForCTC.from_pretrained("SLPL/Sharif-wav2vec2")
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("SLPL/Sharif-wav2vec2")
+
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array.squeeze().numpy()
+     speech_array = librosa.resample(
+         np.asarray(speech_array),
+         sampling_rate,
+         processor.feature_extractor.sampling_rate)
+     batch["speech"] = speech_array
+     return batch
+
+ def predict(batch):
+     features = processor(
+         batch["speech"],
+         sampling_rate=processor.feature_extractor.sampling_rate,
+         return_tensors="pt",
+         padding=True
+     )
+
+     with torch.no_grad():
+         logits = model(
+             features.input_values,
+             attention_mask=features.attention_mask).logits
+     batch["prediction"] = processor.batch_decode(logits.numpy()).text
+     return batch
+
+ dataset = load_dataset(
+     "csv",
+     data_files={"test": "dataset.eval.csv"},
+     delimiter=",")["test"]
+ dataset = dataset.map(speech_file_to_array_fn)
+
+ result = dataset.map(predict, batched=True, batch_size=4)
+ wer = load_metric("wer")
+
+ print("WER: {:.2f}".format(wer.compute(
+     predictions=result["prediction"],
+     references=result["reference"])))
+ ```
+
+ *Result (WER) on Common Voice 6.1*:
+
+ | cleaned | other |
+ |:---:|:---:|
+ | 0.06 | 0.16 |
+
+
+ ## Citation
+ If you want to cite this model, you can use this:
+
+ ```bibtex
+ ?
+ ```
+
+ ### Contributions
+
+ Thanks to [@sarasadeghii](https://github.com/Sarasadeghii) and [@sadrasabouri](https://github.com/sadrasabouri) for adding this model.
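The card above says the kenlm 5-gram was "used in the processor", and this commit tracks `fa/Sharif-wav2vec2/language_model/5gram.arpa` via LFS. A hedged sketch of how such an LM is typically wired into a `Wav2Vec2ProcessorWithLM` with `pyctcdecode`; the repo's own `Wav2Vec2ProcessorWithLM.from_pretrained` already reconstructs this, and the local ARPA path and label ordering below are assumptions:

```python
# Sketch: build an LM-boosted processor from a plain processor plus a kenlm ARPA file.
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

base = Wav2Vec2Processor.from_pretrained("SLPL/Sharif-wav2vec2")
# CTC decoder labels must be ordered by token id
vocab = sorted(base.tokenizer.get_vocab().items(), key=lambda kv: kv[1])
decoder = build_ctcdecoder(
    labels=[token for token, _ in vocab],
    kenlm_model_path="language_model/5gram.arpa",  # assumed local path to the shipped LM
)
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=base.feature_extractor,
    tokenizer=base.tokenizer,
    decoder=decoder,
)
```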
fa/Sharif-wav2vec2/alphabet.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "labels": [
3
+ "",
4
+ "<s>",
5
+ "</s>",
6
+ "⁇",
7
+ " ",
8
+ "آ",
9
+ "ئ",
10
+ "ا",
11
+ "ب",
12
+ "ت",
13
+ "ث",
14
+ "ج",
15
+ "ح",
16
+ "خ",
17
+ "د",
18
+ "ذ",
19
+ "ر",
20
+ "ز",
21
+ "س",
22
+ "ش",
23
+ "ص",
24
+ "ض",
25
+ "ط",
26
+ "ظ",
27
+ "ع",
28
+ "غ",
29
+ "ف",
30
+ "ق",
31
+ "ل",
32
+ "م",
33
+ "ن",
34
+ "ه",
35
+ "و",
36
+ "پ",
37
+ "چ",
38
+ "ژ",
39
+ "ک",
40
+ "گ",
41
+ "ی",
42
+ "‌"
43
+ ],
44
+ "is_bpe": false
45
+ }