Upload folder using huggingface_hub
Browse files- CKPT.yaml +11 -0
- README.md +66 -0
- attention_pooling.ckpt +3 -0
- brain.ckpt +3 -0
- dataloader-TRAIN.ckpt +3 -0
- dialect_encoder.txt +22 -0
- hyperparams.yaml +49 -0
- optimizer.ckpt +3 -0
- output_mlp.ckpt +3 -0
- whisper.ckpt +3 -0
- whisper_opt.ckpt +3 -0
CKPT.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# yamllint disable
|
| 2 |
+
end-of-epoch: true
|
| 3 |
+
error: 2.839878559112549
|
| 4 |
+
loss: 0.18992407526573798
|
| 5 |
+
macro_f1: 0.9538202964889487
|
| 6 |
+
macro_precision: 0.952679604174255
|
| 7 |
+
macro_recall: 0.9565894020982324
|
| 8 |
+
unixtime: 1737431086.8832679
|
| 9 |
+
weighted_f1: 0.9599932477445305
|
| 10 |
+
weighted_precision: 0.9608126922866167
|
| 11 |
+
weighted_recall: 0.9601927882898965
|
README.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- ar
|
| 4 |
+
pipeline_tag: audio-classification
|
| 5 |
+
library_name: speechbrain
|
| 6 |
+
tags:
|
| 7 |
+
- DialectID
|
| 8 |
+
- ADI
|
| 9 |
+
- ADI-20
|
| 10 |
+
- speechbrain
|
| 11 |
+
- Identification
|
| 12 |
+
- pytorch
|
| 13 |
+
- embeddings
|
| 14 |
+
datasets:
|
| 15 |
+
- ADI-20
|
| 16 |
+
metrics:
|
| 17 |
+
- f1
|
| 18 |
+
- precision
|
| 19 |
+
- recall
|
| 20 |
+
- accuracy
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Install Requirements
|
| 24 |
+
|
| 25 |
+
### SpeechBrain
|
| 26 |
+
First of all, please install SpeechBrain with the following command:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
pip install git+https://github.com/speechbrain/speechbrain.git@develop
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Clone ADI github repository
|
| 33 |
+
```bash
|
| 34 |
+
git clone https://github.com/elyadata/ADI-20
|
| 35 |
+
cd ADI-20
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
### Perform Arabic Dialect Identification
|
| 41 |
+
```python
|
| 42 |
+
from inference.classifier_attention_pooling import WhisperDialectClassifier
|
| 43 |
+
|
| 44 |
+
dialect_id = WhisperDialectClassifier.from_hparams(
|
| 45 |
+
source="",
|
| 46 |
+
hparams_file="hyperparams.yaml",
|
| 47 |
+
savedir="pretrained_DID/tmp").to("cuda")
|
| 48 |
+
|
| 49 |
+
dialect_id.device = "cuda"
|
| 50 |
+
|
| 51 |
+
dialect_id.classify_file("filename.wav")
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
### Citation
|
| 55 |
+
If using this work, please cite:
|
| 56 |
+
```
|
| 57 |
+
@inproceedings{elleuch2025adi20,
|
| 58 |
+
author = {Haroun Elleuch and Salima Mdhaffar and Yannick Estève and Fethi Bougares},
|
| 59 |
+
title = {ADI-20: Arabic Dialect Identification Dataset and Models},
|
| 60 |
+
booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association (Interspeech)},
|
| 61 |
+
year = {2025},
|
| 62 |
+
address = {Rotterdam Ahoy Convention Centre, Rotterdam, The Netherlands},
|
| 63 |
+
month = {August},
|
| 64 |
+
days = {17-21}
|
| 65 |
+
}
|
| 66 |
+
```
|
attention_pooling.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e015a4ed868bc4dfcec47af51a95b622037fc13becb702cc8171a223dfddfe8
|
| 3 |
+
size 6740
|
brain.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3888629ac8efb67b3b056f3fe0d026702b046af2a15e965378332f7d63c5ca8f
|
| 3 |
+
size 50
|
dataloader-TRAIN.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a21369bcca05a0d5c2a7eb0ba00bd5dd34c28915c8c3da30553ee4043b3d5a6
|
| 3 |
+
size 5
|
dialect_encoder.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'ALG' => 0
|
| 2 |
+
'EGY' => 1
|
| 3 |
+
'IRA' => 2
|
| 4 |
+
'JOR' => 3
|
| 5 |
+
'KSA' => 4
|
| 6 |
+
'KUW' => 5
|
| 7 |
+
'LEB' => 6
|
| 8 |
+
'LIB' => 7
|
| 9 |
+
'MAU' => 8
|
| 10 |
+
'MOR' => 9
|
| 11 |
+
'OMA' => 10
|
| 12 |
+
'PAL' => 11
|
| 13 |
+
'QAT' => 12
|
| 14 |
+
'SUD' => 13
|
| 15 |
+
'SYR' => 14
|
| 16 |
+
'UAE' => 15
|
| 17 |
+
'YEM' => 16
|
| 18 |
+
'BAH' => 17
|
| 19 |
+
'MSA' => 18
|
| 20 |
+
'TUN' => 19
|
| 21 |
+
================
|
| 22 |
+
'starting_index' => 0
|
hyperparams.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ##########################################################################################
|
| 2 |
+
# Model: Whisper-large-v3 Encoder + Attention pooling for Arabic Dialect Identification
|
| 3 |
+
#
|
| 4 |
+
# Author: Haroun Elleuch
|
| 5 |
+
############################################################################################
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
pretrained_path: Elyadata/ADI-whisper-ADI20
|
| 9 |
+
whisper_hub: openai/whisper-large-v3
|
| 10 |
+
|
| 11 |
+
n_languages: 20
|
| 12 |
+
features_dim: 1280
|
| 13 |
+
|
| 14 |
+
whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
|
| 15 |
+
source: !ref <whisper_hub>
|
| 16 |
+
encoder_only: True
|
| 17 |
+
freeze_encoder: False
|
| 18 |
+
save_path: !ref <whisper_hub>
|
| 19 |
+
|
| 20 |
+
attention_pooling: !new:speechbrain.nnet.pooling.AttentionPooling
|
| 21 |
+
input_dim: !ref <features_dim>
|
| 22 |
+
|
| 23 |
+
output_mlp: !new:speechbrain.nnet.linear.Linear
|
| 24 |
+
input_size: !ref <features_dim>
|
| 25 |
+
n_neurons: !ref <n_languages>
|
| 26 |
+
bias: False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
modules:
|
| 30 |
+
whisper: !ref <whisper>
|
| 31 |
+
attention_pooling: !ref <attention_pooling>
|
| 32 |
+
output_mlp: !ref <output_mlp>
|
| 33 |
+
|
| 34 |
+
log_softmax: !new:speechbrain.nnet.activations.Softmax
|
| 35 |
+
apply_log: True
|
| 36 |
+
|
| 37 |
+
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
| 38 |
+
|
| 39 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
| 40 |
+
loadables:
|
| 41 |
+
whisper: !ref <whisper>
|
| 42 |
+
attention_pooling: !ref <attention_pooling>
|
| 43 |
+
output_mlp: !ref <output_mlp>
|
| 44 |
+
label_encoder: !ref <label_encoder>
|
| 45 |
+
paths:
|
| 46 |
+
whisper: !ref <pretrained_path>/whisper.ckpt
|
| 47 |
+
attention_pooling: !ref <pretrained_path>/attention_pooling.ckpt
|
| 48 |
+
output_mlp: !ref <pretrained_path>/output_mlp.ckpt
|
| 49 |
+
label_encoder: !ref <pretrained_path>/dialect_encoder.txt
|
optimizer.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdd58ef92828f25761d1f03453a16225327b46a9e13fb978c72e966a17cbf617
|
| 3 |
+
size 218582
|
output_mlp.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9d99ccddfc47f7160b7a630ef475327c769eaa4b0e1fa302c7e152e377dad5c
|
| 3 |
+
size 103723
|
whisper.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5721aa93158f312d0f694a573b72ed736dce9e33217c9f01d06e8d2cb149cc17
|
| 3 |
+
size 2548162402
|
whisper_opt.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68e246d6bf1425e5e864514f09a6c2dcd5f342939f5178923578edd00493445b
|
| 3 |
+
size 5080804356
|