WavLM
- .gitattributes +7 -0
- SER-Odyssey/Baseline_Model.pdf +3 -0
- SER-Odyssey/MSP-Podcast_Challenge [JMasr] +48 -24.zip +3 -0
- SER-Odyssey/MSP-Podcast_Challenge.zip +3 -0
- SER-Odyssey/Odyssey 2024 - Speech Emotion Recognition Challenge. Dataset, Baseline, Framework, and Results.pdf +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/.gitattributes +35 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/README.md +83 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/config.json +26 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/model.safetensors +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pipeline_utils.py +165 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pytorch_model.bin +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/.gitattributes +35 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/README.md +87 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/config.json +32 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/model.safetensors +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pipeline_utils.py +171 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pytorch_model.bin +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/.gitattributes +35 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/README.md +83 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/config.json +26 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/model.safetensors +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pipeline_utils.py +165 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pytorch_model.bin +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/.gitattributes +35 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/README.md +84 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/config.json +28 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/model.safetensors +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pipeline_utils.py +167 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/preprocessor_config.json +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pytorch_model.bin +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/.gitattributes +35 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/README.md +83 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/config.json +26 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/model.safetensors +3 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pipeline_utils.py +165 -0
- SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pytorch_model.bin +3 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/.gitattributes +37 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/README.md +228 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ReadMe +1 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ser_dyn.onnx +3 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/pytorch/best_weights.pt +3 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/source.txt +1 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/ReadMe +2 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan +3 -0
- SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan +3 -0
- SER-Odyssey/source.txt +1 -0
- WavLM. Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing.pdf +3 -0
- tiny-random-WavLMForAudioFrameClassification-ONNX/.gitattributes +35 -0
- tiny-random-WavLMForAudioFrameClassification-ONNX/config.json +88 -0
- tiny-random-WavLMForAudioFrameClassification-ONNX/onnx/model.onnx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+SER-Odyssey/Baseline_Model.pdf filter=lfs diff=lfs merge=lfs -text
+SER-Odyssey/Odyssey[[:space:]]2024[[:space:]]-[[:space:]]Speech[[:space:]]Emotion[[:space:]]Recognition[[:space:]]Challenge.[[:space:]]Dataset,[[:space:]]Baseline,[[:space:]]Framework,[[:space:]]and[[:space:]]Results.pdf filter=lfs diff=lfs merge=lfs -text
+SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan filter=lfs diff=lfs merge=lfs -text
+SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan filter=lfs diff=lfs merge=lfs -text
+wavlm-large-mnn/wavlm_large_fp16.mnn filter=lfs diff=lfs merge=lfs -text
+wavlm-large-mnn/wavlm_large_int8.mnn filter=lfs diff=lfs merge=lfs -text
+WavLM.[[:space:]]Large-Scale[[:space:]]Self-Supervised[[:space:]]Pre-Training[[:space:]]for[[:space:]]Full[[:space:]]Stack[[:space:]]Speech[[:space:]]Processing.pdf filter=lfs diff=lfs merge=lfs -text
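The `[[:space:]]` sequences above are POSIX character classes; gitattributes patterns use them to match the literal spaces in the PDF file names. As a quick sanity check (a sketch assuming a local checkout of this repo), `git check-attr` reports which filter a given path picks up:

```python
import subprocess

# Ask git which filter applies to one of the newly tracked paths.
out = subprocess.run(
    ["git", "check-attr", "filter", "--", "SER-Odyssey/Baseline_Model.pdf"],
    capture_output=True, text=True,
).stdout
print(out)  # expected along the lines of: "SER-Odyssey/Baseline_Model.pdf: filter: lfs"
```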
SER-Odyssey/Baseline_Model.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2006e79620902e9b411dd8e110f296c9e7d2458110faa8043d900187f203e103
+size 460836
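Each of the `ADDED` binary files in this commit is stored as a Git LFS pointer: a three-line text stub holding a spec `version`, a SHA-256 `oid`, and the true `size` in bytes, while the payload lives in LFS storage. A minimal sketch of reading such a pointer (the key/value parsing follows the format shown above; the path assumes a checkout without `git lfs pull`):

```python
# Minimal sketch: parse a Git LFS pointer file into a dict of its fields.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

ptr = parse_lfs_pointer("SER-Odyssey/Baseline_Model.pdf")
print(ptr["oid"])        # sha256:2006e796... (as in the pointer above)
print(int(ptr["size"]))  # 460836 bytes for the real PDF
```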
SER-Odyssey/MSP-Podcast_Challenge [JMasr] +48 -24.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b53455da505412b271968d94febb011505ef41201826ba048dc7308306838a04
+size 895217
SER-Odyssey/MSP-Podcast_Challenge.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c86e9700cc05734431656503b7602aa0a5f9b60be4a5a02238e87121324055a7
+size 897745
SER-Odyssey/Odyssey 2024 - Speech Emotion Recognition Challenge. Dataset, Baseline, Framework, and Results.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11f79ccab188b27218b3c5038fbec0ef21e0dca0d08af3d998e77b993d0ed31c
+size 1083858
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/.gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/README.md ADDED
@@ -0,0 +1,83 @@
+---
+license: mit
+language:
+- en
+pipeline_tag: audio-classification
+tags:
+- wavlm
+- msp-podcast
+- emotion-recognition
+- audio
+- speech
+- arousal
+- lucas
+- speech-emotion-recognition
+---
+The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) for the Odyssey 2024 Emotion Recognition competition baseline.<br>
+This particular model is the single-task specialized arousal model, which predicts arousal in a range of approximately 0...1.
+
+
+
+# Benchmarks
+CCC based on the Test3 and Development sets of the Odyssey Competition
+<table style="width:500px">
+<tr><th colspan=2 align="center">Single-Task Setup</th></tr>
+<tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
+<tr> <td align="center">Aro</td> <td align="center">Aro</td> </tr>
+<tr> <td align="center">0.566</td> <td align="center">0.651</td> </tr>
+</table>
+
+
+
+For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+```
+@InProceedings{Goncalves_2024,
+  author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+  title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+  booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+  volume={To appear},
+  year={2024},
+  month={June},
+  address={Quebec, Canada},
+}
+```
+
+
+# Usage
+```python
+from transformers import AutoModelForAudioClassification
+import librosa, torch
+
+# load model
+model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Arousal", trust_remote_code=True)
+
+# get mean/std
+mean = model.config.mean
+std = model.config.std
+
+
+# load an audio file
+audio_path = "/path/to/audio.wav"
+raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+# normalize the audio by mean/std
+norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+# generate the mask
+mask = torch.ones(1, len(norm_wav))
+
+# batch it (add dim)
+wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+# predict
+with torch.no_grad():
+    pred = model(wavs, mask)
+
+print(model.config.id2label)
+print(pred)
+# {0: 'arousal'}
+# tensor([[0.3670]])
+```
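The usage snippet runs one clip with an all-ones mask. For a batch of clips of different lengths, the same call should work if shorter clips are zero-padded and the mask flags only the real samples; a sketch under that assumption, reusing `model` from the snippet above with two hypothetical, already-normalized waveforms:

```python
import torch

wav_a = torch.randn(48000)  # placeholder for a 3 s clip at 16 kHz
wav_b = torch.randn(32000)  # placeholder for a 2 s clip at 16 kHz

max_len = max(len(wav_a), len(wav_b))
wavs = torch.zeros(2, max_len)
mask = torch.zeros(2, max_len)
for i, w in enumerate((wav_a, wav_b)):
    wavs[i, : len(w)] = w
    mask[i, : len(w)] = 1.0  # 1 marks valid samples, 0 marks padding

with torch.no_grad():
    pred = model(wavs, mask)
print(pred.shape)  # expected (2, 1): one arousal score per clip
```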
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "SERModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "pipeline_utils.SERConfig",
+    "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+  },
+  "id2label": {
+    "0": "arousal"
+  },
+  "sampling_rate": 16000,
+  "maxlen": 192000,
+  "mean": -8.278621631819787e-05,
+  "std": 0.08485510250851999,
+  "classifier_dropout_prob": 0.5,
+  "classifier_hidden_layers": 1,
+  "hidden_size": 1024,
+  "model_type": "ser",
+  "num_attention_heads": 16,
+  "num_classes": 1,
+  "num_hidden_layers": 24,
+  "ssl_type": "microsoft/wavlm-large",
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.0.dev0"
+}
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6513eca66ff2f599b248059ad44c41fef39d61b5cfc4995f777022c42c07106c
+size 1274482316
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pipeline_utils.py ADDED
@@ -0,0 +1,165 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel
+from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+class Pooling(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def compute_length_from_mask(self, mask):
+        """
+        mask: (batch_size, T)
+        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+        """
+        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+        feat_lens = feat_lens.int().tolist()
+        return feat_lens
+
+    def forward(self, x, mask):
+        raise NotImplementedError
+
+class MeanPooling(Pooling):
+    def __init__(self):
+        super().__init__()
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+            pooled_list.append(pooled)
+        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+        return pooled
+
+
+class AttentiveStatisticsPooling(Pooling):
+    """
+    AttentiveStatisticsPooling
+    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+    Link: https://arxiv.org/pdf/1803.10963.pdf
+    """
+    def __init__(self, input_size):
+        super().__init__()
+        self._indim = input_size
+        self.sap_linear = nn.Linear(input_size, input_size)
+        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+        torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim*2)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            x = x[:feat_len].unsqueeze(0)
+            h = torch.tanh(self.sap_linear(x))
+            w = torch.matmul(h, self.attention).squeeze(dim=2)
+            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+            mu = torch.sum(x * w, dim=1)
+            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+            x = torch.cat((mu, rh), 1).squeeze(0)
+            pooled_list.append(x)
+        return torch.stack(pooled_list)
+
+
+
+
+class EmotionRegression(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super(EmotionRegression, self).__init__()
+        input_dim = args[0]
+        hidden_dim = args[1]
+        num_layers = args[2]
+        output_dim = args[3]
+        p = kwargs.get("dropout", 0.5)
+
+        self.fc = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+            )
+        ])
+        for lidx in range(num_layers - 1):
+            self.fc.append(
+                nn.Sequential(
+                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                )
+            )
+        self.out = nn.Sequential(
+            nn.Linear(hidden_dim, output_dim)
+        )
+
+        self.inp_drop = nn.Dropout(p)
+    def get_repr(self, x):
+        h = self.inp_drop(x)
+        for lidx, fc in enumerate(self.fc):
+            h = fc(h)
+        return h
+
+    def forward(self, x):
+        h = self.get_repr(x)
+        result = self.out(h)
+        return result
+
+class SERConfig(PretrainedConfig):
+    model_type = "ser"
+
+    def __init__(
+        self,
+        num_classes: int = 1,
+        num_attention_heads = 16,
+        num_hidden_layers = 24,
+        hidden_size = 1024,
+        classifier_hidden_layers = 1,
+        classifier_dropout_prob = 0.5,
+        ssl_type = "microsoft/wavlm-large",
+        torch_dtype = "float32",
+        **kwargs,
+    ):
+        self.num_classes = num_classes
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.classifier_hidden_layers = classifier_hidden_layers
+        self.classifier_dropout_prob = classifier_dropout_prob
+        self.ssl_type = ssl_type
+        self.torch_dtype = torch_dtype
+        super().__init__(**kwargs)
+
+class SERModel(PreTrainedModel):
+    config_class = SERConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+        self.ssl_model.freeze_feature_encoder()
+
+        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+        self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                           config.hidden_size,
+                                           config.classifier_hidden_layers,
+                                           config.num_classes,
+                                           dropout=config.classifier_dropout_prob)
+
+
+    def forward(self, x, mask):
+        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+        ssl = self.pool_model(ssl, mask)
+
+        pred = self.ser_model(ssl)
+
+        return pred
+
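In `compute_length_from_mask`, the number of valid samples is converted to a WavLM frame count using the 20 ms frame shift at 16 kHz, i.e. `floor((n_samples - 1) / 320) + 1`. A quick standalone check of that arithmetic for a 3-second clip:

```python
import torch

mask = torch.ones(1, 48000)  # 3 s of valid samples at 16 kHz
wav_lens = torch.sum(mask, dim=1)
feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
print(feat_lens.int().tolist())  # [150] -> 150 frames of 20 ms each
```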
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f8a77ea0603b9ab91b3ce1d03c165db58d95ebae1c4210ab12dc94459c36b60
+size 1274585617
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/.gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/README.md ADDED
@@ -0,0 +1,87 @@
+---
+license: mit
+language:
+- en
+pipeline_tag: audio-classification
+tags:
+- wavlm
+- msp-podcast
+- emotion-recognition
+- audio
+- speech
+- categorical
+- lucas
+- speech-emotion-recognition
+---
+The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) for the Odyssey 2024 Emotion Recognition competition baseline.<br>
+This particular model is the categorical model, which predicts: "Angry", "Sad", "Happy", "Surprise", "Fear", "Disgust", "Contempt", and "Neutral".
+
+
+# Benchmarks
+F1-scores based on the Test3 and Development sets of the Odyssey Competition
+<table style="width:500px">
+<tr><th colspan=8 align="center">Categorical Setup</th></tr>
+<tr><th colspan=4 align="center">Test 3</th><th colspan=4 align="center">Development</th></tr>
+<tr> <td>F1-Mic.</td> <td>F1-Ma.</td> <td>Prec.</td> <td>Rec.</td> <td>F1-Mic.</td> <td>F1-Ma.</td> <td>Prec.</td> <td>Rec.</td> </tr>
+<tr> <td>0.327</td> <td>0.311</td> <td>0.332</td> <td>0.325</td> <td>0.409</td> <td>0.307</td> <td>0.316</td> <td>0.345</td> </tr>
+</table>
+
+
+
+For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+```
+@InProceedings{Goncalves_2024,
+  author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+  title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+  booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+  volume={To appear},
+  year={2024},
+  month={June},
+  address={Quebec, Canada},
+}
+```
+
+
+# Usage
+```python
+from transformers import AutoModelForAudioClassification
+import librosa, torch
+
+# load model
+model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True)
+
+# get mean/std
+mean = model.config.mean
+std = model.config.std
+
+
+# load an audio file
+audio_path = "/path/to/audio.wav"
+raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+# normalize the audio by mean/std
+norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+# generate the mask
+mask = torch.ones(1, len(norm_wav))
+
+# batch it (add dim)
+wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+# predict
+with torch.no_grad():
+    pred = model(wavs, mask)
+
+print(model.config.id2label)
+print(pred)
+# {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
+# tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
+
+# convert logits to probability
+probabilities = torch.nn.functional.softmax(pred, dim=1)
+print(probabilities)
+# [[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]]
+```
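To collapse the probability vector into a single predicted category, one can take the argmax and look it up in `id2label`; a short continuation of the usage snippet above (reusing `model` and `probabilities`):

```python
# Continuing the snippet: probabilities has shape (1, 8).
pred_id = int(torch.argmax(probabilities, dim=1))
print(model.config.id2label[pred_id])  # e.g. 'Neutral' for the example output above
```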
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/config.json ADDED
@@ -0,0 +1,32 @@
+{
+  "architectures": [
+    "SERModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "pipeline_utils.SERConfig",
+    "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+  },
+  "id2label": {
+    "0": "Angry",
+    "1": "Sad",
+    "2": "Happy",
+    "3": "Surprise",
+    "4": "Fear",
+    "5": "Disgust",
+    "6": "Contempt",
+    "7": "Neutral"
+  },
+  "sampling_rate": 16000,
+  "classifier_dropout_prob": 0.5,
+  "classifier_hidden_layers": 1,
+  "hidden_size": 1024,
+  "mean": -8.278621631819787e-05,
+  "model_type": "ser",
+  "num_attention_heads": 16,
+  "num_classes": 8,
+  "num_hidden_layers": 24,
+  "ssl_type": "microsoft/wavlm-large",
+  "std": 0.08485510250851999,
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.0.dev0"
+}
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb52f3f472b6a5a824ac238537fa60bf39a73d74b3fa5f4a4473c012cb3d18f4
+size 1274511016
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pipeline_utils.py ADDED
@@ -0,0 +1,171 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel
+from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+class Pooling(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def compute_length_from_mask(self, mask):
+        """
+        mask: (batch_size, T)
+        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+        """
+        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+        feat_lens = feat_lens.int().tolist()
+        return feat_lens
+
+    def forward(self, x, mask):
+        raise NotImplementedError
+
+class MeanPooling(Pooling):
+    def __init__(self):
+        super().__init__()
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+            pooled_list.append(pooled)
+        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+        return pooled
+
+
+class AttentiveStatisticsPooling(Pooling):
+    """
+    AttentiveStatisticsPooling
+    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+    Link: https://arxiv.org/pdf/1803.10963.pdf
+    """
+    def __init__(self, input_size):
+        super().__init__()
+        self._indim = input_size
+        self.sap_linear = nn.Linear(input_size, input_size)
+        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+        torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim*2)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            x = x[:feat_len].unsqueeze(0)
+            h = torch.tanh(self.sap_linear(x))
+            w = torch.matmul(h, self.attention).squeeze(dim=2)
+            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+            mu = torch.sum(x * w, dim=1)
+            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+            x = torch.cat((mu, rh), 1).squeeze(0)
+            pooled_list.append(x)
+        return torch.stack(pooled_list)
+
+
+
+
+class EmotionRegression(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super(EmotionRegression, self).__init__()
+        input_dim = args[0]
+        hidden_dim = args[1]
+        num_layers = args[2]
+        output_dim = args[3]
+        p = kwargs.get("dropout", 0.5)
+
+        self.fc = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+            )
+        ])
+        for lidx in range(num_layers - 1):
+            self.fc.append(
+                nn.Sequential(
+                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                )
+            )
+        self.out = nn.Sequential(
+            nn.Linear(hidden_dim, output_dim)
+        )
+
+        self.inp_drop = nn.Dropout(p)
+    def get_repr(self, x):
+        h = self.inp_drop(x)
+        for lidx, fc in enumerate(self.fc):
+            h = fc(h)
+        return h
+
+    def forward(self, x):
+        h = self.get_repr(x)
+        result = self.out(h)
+        return result
+
+
+class SERConfig(PretrainedConfig):
+    model_type = "ser"
+
+    def __init__(
+        self,
+        num_classes: int = 8,
+        num_attention_heads = 16,
+        num_hidden_layers = 24,
+        hidden_size = 1024,
+        classifier_hidden_layers = 1,
+        classifier_dropout_prob = 0.5,
+        ssl_type = "microsoft/wavlm-large",
+        torch_dtype = "float32",
+        mean = -8.278621631819787e-05,
+        std = 0.08485510250851999,
+        **kwargs,
+    ):
+        self.num_classes = num_classes
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.classifier_hidden_layers = classifier_hidden_layers
+        self.classifier_dropout_prob = classifier_dropout_prob
+        self.ssl_type = ssl_type
+        self.torch_dtype = torch_dtype
+
+        self.mean = mean
+        self.std = std
+        super().__init__(**kwargs)
+
+class SERModel(PreTrainedModel):
+    config_class = SERConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+        self.ssl_model.freeze_feature_encoder()
+
+        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+        self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                           config.hidden_size,
+                                           config.classifier_hidden_layers,
+                                           config.num_classes,
+                                           dropout=config.classifier_dropout_prob)
+
+
+    def forward(self, x, mask):
+        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+        ssl = self.pool_model(ssl, mask)
+
+        pred = self.ser_model(ssl)
+
+        return pred
+
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:446f71c92a67b69977c50b065a0e418c37fa20aba1d2e44ecb1190d97f9c0cbb
+size 1274614289
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/.gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/README.md ADDED
@@ -0,0 +1,83 @@
+---
+license: mit
+language:
+- en
+pipeline_tag: audio-classification
+tags:
+- wavlm
+- msp-podcast
+- emotion-recognition
+- audio
+- speech
+- dominance
+- lucas
+- speech-emotion-recognition
+---
+The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) for the Odyssey 2024 Emotion Recognition competition baseline.<br>
+This particular model is the single-task specialized dominance model, which predicts dominance in a range of approximately 0...1.
+
+
+
+# Benchmarks
+CCC based on the Test3 and Development sets of the Odyssey Competition
+<table style="width:500px">
+<tr><th colspan=2 align="center">Single-Task Setup</th></tr>
+<tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
+<tr> <td align="center">Dom</td> <td align="center">Dom</td> </tr>
+<tr> <td align="center">0.424</td> <td align="center">0.584</td> </tr>
+</table>
+
+
+
+For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+```
+@InProceedings{Goncalves_2024,
+  author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+  title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+  booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+  volume={To appear},
+  year={2024},
+  month={June},
+  address={Quebec, Canada},
+}
+```
+
+
+# Usage
+```python
+from transformers import AutoModelForAudioClassification
+import librosa, torch
+
+# load model
+model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Dominance", trust_remote_code=True)
+
+# get mean/std
+mean = model.config.mean
+std = model.config.std
+
+
+# load an audio file
+audio_path = "/path/to/audio.wav"
+raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+# normalize the audio by mean/std
+norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+# generate the mask
+mask = torch.ones(1, len(norm_wav))
+
+# batch it (add dim)
+wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+# predict
+with torch.no_grad():
+    pred = model(wavs, mask)
+
+print(model.config.id2label)
+print(pred)
+# {0: 'dominance'}
+# tensor([[0.3670]])
+```
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "SERModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "pipeline_utils.SERConfig",
+    "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+  },
+  "id2label": {
+    "0": "dominance"
+  },
+  "sampling_rate": 16000,
+  "maxlen": 192000,
+  "mean": -8.278621631819787e-05,
+  "std": 0.08485510250851999,
+  "classifier_dropout_prob": 0.5,
+  "classifier_hidden_layers": 1,
+  "hidden_size": 1024,
+  "model_type": "ser",
+  "num_attention_heads": 16,
+  "num_classes": 1,
+  "num_hidden_layers": 24,
+  "ssl_type": "microsoft/wavlm-large",
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.0.dev0"
+}
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5279e2387d029fb3c7529830546a876518bc32e264d61a21a593d708c9491e0
+size 1274482316
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pipeline_utils.py ADDED
@@ -0,0 +1,165 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel
+from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+class Pooling(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def compute_length_from_mask(self, mask):
+        """
+        mask: (batch_size, T)
+        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+        """
+        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+        feat_lens = feat_lens.int().tolist()
+        return feat_lens
+
+    def forward(self, x, mask):
+        raise NotImplementedError
+
+class MeanPooling(Pooling):
+    def __init__(self):
+        super().__init__()
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+            pooled_list.append(pooled)
+        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+        return pooled
+
+
+class AttentiveStatisticsPooling(Pooling):
+    """
+    AttentiveStatisticsPooling
+    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+    Link: https://arxiv.org/pdf/1803.10963.pdf
+    """
+    def __init__(self, input_size):
+        super().__init__()
+        self._indim = input_size
+        self.sap_linear = nn.Linear(input_size, input_size)
+        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+        torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+    def forward(self, xs, mask):
+        """
+        xs: (batch_size, T, feat_dim)
+        mask: (batch_size, T)
+
+        => output: (batch_size, feat_dim*2)
+        """
+        feat_lens = self.compute_length_from_mask(mask)
+        pooled_list = []
+        for x, feat_len in zip(xs, feat_lens):
+            x = x[:feat_len].unsqueeze(0)
+            h = torch.tanh(self.sap_linear(x))
+            w = torch.matmul(h, self.attention).squeeze(dim=2)
+            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+            mu = torch.sum(x * w, dim=1)
+            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+            x = torch.cat((mu, rh), 1).squeeze(0)
+            pooled_list.append(x)
+        return torch.stack(pooled_list)
+
+
+
+
+class EmotionRegression(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super(EmotionRegression, self).__init__()
+        input_dim = args[0]
+        hidden_dim = args[1]
+        num_layers = args[2]
+        output_dim = args[3]
+        p = kwargs.get("dropout", 0.5)
+
+        self.fc = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+            )
+        ])
+        for lidx in range(num_layers - 1):
+            self.fc.append(
+                nn.Sequential(
+                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                )
+            )
+        self.out = nn.Sequential(
+            nn.Linear(hidden_dim, output_dim)
+        )
+
+        self.inp_drop = nn.Dropout(p)
+    def get_repr(self, x):
+        h = self.inp_drop(x)
+        for lidx, fc in enumerate(self.fc):
+            h = fc(h)
+        return h
+
+    def forward(self, x):
+        h = self.get_repr(x)
+        result = self.out(h)
+        return result
+
+class SERConfig(PretrainedConfig):
+    model_type = "ser"
+
+    def __init__(
+        self,
+        num_classes: int = 1,
+        num_attention_heads = 16,
+        num_hidden_layers = 24,
+        hidden_size = 1024,
+        classifier_hidden_layers = 1,
+        classifier_dropout_prob = 0.5,
+        ssl_type = "microsoft/wavlm-large",
+        torch_dtype = "float32",
+        **kwargs,
+    ):
+        self.num_classes = num_classes
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.classifier_hidden_layers = classifier_hidden_layers
+        self.classifier_dropout_prob = classifier_dropout_prob
+        self.ssl_type = ssl_type
+        self.torch_dtype = torch_dtype
+        super().__init__(**kwargs)
+
+class SERModel(PreTrainedModel):
+    config_class = SERConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+        self.ssl_model.freeze_feature_encoder()
+
+        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+        self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                           config.hidden_size,
+                                           config.classifier_hidden_layers,
+                                           config.num_classes,
+                                           dropout=config.classifier_dropout_prob)
+
+
+    def forward(self, x, mask):
+        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+        ssl = self.pool_model(ssl, mask)
+
+        pred = self.ser_model(ssl)
+
+        return pred
+
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6fc0167d183d89114be10df1c4e4f74040b558408efee99a71fcf5205865ef2
+size 1274585617
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/.gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
---
license: mit
language:
- en
pipeline_tag: audio-classification
tags:
- wavlm
- msp-podcast
- emotion-recognition
- audio
- speech
- valence
- arousal
- dominance
- lucas
- speech-emotion-recognition
---
The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
This particular model is the multi-attribute model, which predicts arousal, dominance, and valence in a range of approximately 0...1.


# Benchmarks
CCC (concordance correlation coefficient) on the Test 3 and Development sets of the Odyssey competition:
<table style="width:500px">
<tr><th colspan=6 align="center">Multi-Task Setup</th></tr>
<tr><th colspan=3 align="center">Test 3</th><th colspan=3 align="center">Development</th></tr>
<tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
<tr> <td>0.577</td> <td>0.577</td> <td>0.405</td> <td>0.652</td> <td>0.688</td> <td>0.579</td> </tr>
</table>


For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).


```
@InProceedings{Goncalves_2024,
  author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
  title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
  booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
  volume={To appear},
  year={2024},
  month={June},
  address={Quebec, Canada},
}
```

# Usage
```python
from transformers import AutoModelForAudioClassification
import librosa, torch

# load the model (trust_remote_code resolves the custom SERModel class)
model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes", trust_remote_code=True)

# get the mean/std used to normalize the training audio
mean = model.config.mean
std = model.config.std

# load an audio file at the model's sampling rate
audio_path = "/path/to/audio.wav"
raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)

# normalize the audio by mean/std
norm_wav = (raw_wav - mean) / (std + 0.000001)

# generate the attention mask
mask = torch.ones(1, len(norm_wav))

# batch it (add a batch dimension)
wavs = torch.tensor(norm_wav).unsqueeze(0)

# predict
with torch.no_grad():
    pred = model(wavs, mask)

print(model.config.id2label)
print(pred)
# {0: 'arousal', 1: 'dominance', 2: 'valence'}
# tensor([[0.3670, 0.4553, 0.4240]])
```
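Note that `model.config.maxlen` is 192000 samples (12 s at 16 kHz), the utterance length the baseline was trained with. For longer recordings, one simple option is to score fixed-size chunks and average the attribute predictions; a minimal sketch under that assumption (the chunk-and-average strategy is this example's choice, not part of the released baseline):

```python
import torch

def predict_long(model, norm_wav, chunk=192000):
    """Score a long normalized waveform in fixed-size chunks and average the predictions."""
    preds = []
    for start in range(0, len(norm_wav), chunk):
        piece = norm_wav[start:start + chunk]
        if start > 0 and len(piece) < 4000:  # skip a tiny tail chunk (threshold is arbitrary)
            break
        wavs = torch.tensor(piece, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            preds.append(model(wavs, torch.ones_like(wavs)))
    return torch.cat(preds, dim=0).mean(dim=0)  # one score per attribute

scores = predict_long(model, norm_wav)
print({model.config.id2label[i]: round(float(v), 3) for i, v in enumerate(scores)})
```
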
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/config.json
ADDED
@@ -0,0 +1,28 @@
{
    "architectures": [
        "SERModel"
    ],
    "auto_map": {
        "AutoConfig": "pipeline_utils.SERConfig",
        "AutoModelForAudioClassification": "pipeline_utils.SERModel"
    },
    "id2label": {
        "0": "arousal",
        "1": "dominance",
        "2": "valence"
    },
    "sampling_rate": 16000,
    "maxlen": 192000,
    "mean": -8.278621631819787e-05,
    "std": 0.08485510250851999,
    "classifier_dropout_prob": 0.5,
    "classifier_hidden_layers": 1,
    "hidden_size": 1024,
    "model_type": "ser",
    "num_attention_heads": 16,
    "num_classes": 3,
    "num_hidden_layers": 24,
    "ssl_type": "microsoft/wavlm-large",
    "torch_dtype": "float32",
    "transformers_version": "4.34.0.dev0"
}
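The `auto_map` block above is what makes `trust_remote_code=True` work: `AutoConfig` resolves to `pipeline_utils.SERConfig` and `AutoModelForAudioClassification` to `pipeline_utils.SERModel`. A minimal sketch for inspecting these fields without downloading the ~1.2 GB weights:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes", trust_remote_code=True
)
print(cfg.ssl_type, cfg.sampling_rate, cfg.maxlen)  # microsoft/wavlm-large 16000 192000
print(cfg.id2label)  # {0: 'arousal', 1: 'dominance', 2: 'valence'}
```
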
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:557ba9b4aa8461a60bc7f5c5bd2e34b4de34d4c8ccfa684c438b6cbdc1893c9d
size 1274490516

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pipeline_utils.py
ADDED
@@ -0,0 +1,167 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig


class Pooling(nn.Module):
    def __init__(self):
        super().__init__()

    def compute_length_from_mask(self, mask):
        """
        mask: (batch_size, T)
        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
        """
        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
        feat_lens = feat_lens.int().tolist()
        return feat_lens

    def forward(self, x, mask):
        raise NotImplementedError


class MeanPooling(Pooling):
    def __init__(self):
        super().__init__()

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
            pooled_list.append(pooled)
        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
        return pooled


class AttentiveStatisticsPooling(Pooling):
    """
    AttentiveStatisticsPooling
    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
    Link: https://arxiv.org/pdf/1803.10963.pdf
    """
    def __init__(self, input_size):
        super().__init__()
        self._indim = input_size
        self.sap_linear = nn.Linear(input_size, input_size)
        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
        torch.nn.init.normal_(self.attention, mean=0, std=1)

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim*2)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            x = x[:feat_len].unsqueeze(0)
            h = torch.tanh(self.sap_linear(x))
            w = torch.matmul(h, self.attention).squeeze(dim=2)
            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
            mu = torch.sum(x * w, dim=1)
            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, rh), 1).squeeze(0)
            pooled_list.append(x)
        return torch.stack(pooled_list)


class EmotionRegression(nn.Module):
    def __init__(self, *args, **kwargs):
        super(EmotionRegression, self).__init__()
        input_dim = args[0]
        hidden_dim = args[1]
        num_layers = args[2]
        output_dim = args[3]
        p = kwargs.get("dropout", 0.5)

        self.fc = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
            )
        ])
        for lidx in range(num_layers - 1):
            self.fc.append(
                nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
                )
            )
        self.out = nn.Sequential(
            nn.Linear(hidden_dim, output_dim)
        )

        self.inp_drop = nn.Dropout(p)

    def get_repr(self, x):
        h = self.inp_drop(x)
        for lidx, fc in enumerate(self.fc):
            h = fc(h)
        return h

    def forward(self, x):
        h = self.get_repr(x)
        result = self.out(h)
        return result


class SERConfig(PretrainedConfig):
    model_type = "ser"

    def __init__(
        self,
        num_classes: int = 3,
        num_attention_heads=16,
        num_hidden_layers=24,
        hidden_size=1024,
        classifier_hidden_layers=1,
        classifier_dropout_prob=0.5,
        ssl_type="microsoft/wavlm-large",
        torch_dtype="float32",
        **kwargs,
    ):
        self.num_classes = num_classes
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.classifier_hidden_layers = classifier_hidden_layers
        self.classifier_dropout_prob = classifier_dropout_prob
        self.ssl_type = ssl_type
        self.torch_dtype = torch_dtype
        super().__init__(**kwargs)


class SERModel(PreTrainedModel):
    config_class = SERConfig

    def __init__(self, config):
        super().__init__(config)
        # WavLM backbone; the convolutional feature encoder is kept frozen
        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
        self.ssl_model.freeze_feature_encoder()

        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)

        self.ser_model = EmotionRegression(config.hidden_size * 2,
                                           config.hidden_size,
                                           config.classifier_hidden_layers,
                                           config.num_classes,
                                           dropout=config.classifier_dropout_prob)

    def forward(self, x, mask):
        # (batch, T_wav) -> (batch, T_feat, hidden_size)
        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
        # (batch, T_feat, hidden_size) -> (batch, hidden_size * 2)
        ssl = self.pool_model(ssl, mask)
        # (batch, hidden_size * 2) -> (batch, num_classes)
        pred = self.ser_model(ssl)
        return pred
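As the docstrings above note, `AttentiveStatisticsPooling` turns a variable-length frame sequence into a fixed utterance embedding by concatenating an attention-weighted mean and standard deviation, doubling the feature dimension. A quick self-contained shape check with random tensors (no audio needed):

```python
import torch
from pipeline_utils import AttentiveStatisticsPooling

pool = AttentiveStatisticsPooling(input_size=1024)
xs = torch.randn(2, 50, 1024)   # dummy WavLM features: (batch, T_feat, feat_dim)
mask = torch.ones(2, 16000)     # 1 s of audio at 16 kHz -> 50 frames at a 20 ms shift
print(pool(xs, mask).shape)     # torch.Size([2, 2048]): concatenated mean and std
```
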
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/preprocessor_config.json
ADDED
@@ -0,0 +1,3 @@
{
    "mean": 10
}

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c34b4fd571efce7b4530a7539f1928213d535f6be19b2324bceca0c08c3e601
size 1274593809

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/README.md
ADDED
@@ -0,0 +1,83 @@
---
license: mit
language:
- en
pipeline_tag: audio-classification
tags:
- wavlm
- msp-podcast
- emotion-recognition
- audio
- speech
- valence
- lucas
- speech-emotion-recognition
---
The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
This particular model is the single-task specialized valence model, which predicts valence in a range of approximately 0...1.


# Benchmarks
CCC (concordance correlation coefficient) on the Test 3 and Development sets of the Odyssey competition:
<table style="width:500px">
<tr><th colspan=2 align="center">Single-Task Setup</th></tr>
<tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
<tr> <td align="center">Val</td> <td align="center">Val</td> </tr>
<tr> <td align="center">0.607</td> <td align="center">0.709</td> </tr>
</table>


For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).

```
@InProceedings{Goncalves_2024,
  author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
  title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
  booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
  volume={To appear},
  year={2024},
  month={June},
  address={Quebec, Canada},
}
```

# Usage
```python
from transformers import AutoModelForAudioClassification
import librosa, torch

# load the model (trust_remote_code resolves the custom SERModel class)
model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Valence", trust_remote_code=True)

# get the mean/std used to normalize the training audio
mean = model.config.mean
std = model.config.std

# load an audio file at the model's sampling rate
audio_path = "/path/to/audio.wav"
raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)

# normalize the audio by mean/std
norm_wav = (raw_wav - mean) / (std + 0.000001)

# generate the attention mask
mask = torch.ones(1, len(norm_wav))

# batch it (add a batch dimension)
wavs = torch.tensor(norm_wav).unsqueeze(0)

# predict
with torch.no_grad():
    pred = model(wavs, mask)

print(model.config.id2label)
print(pred)
# {0: 'valence'}
# tensor([[0.3670]])
```
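The single-task baselines all share this interface, so scoring one file with several of them only differs by the repo id. A minimal sketch (it assumes the sibling Arousal and Dominance repos follow the same naming scheme as this one):

```python
from transformers import AutoModelForAudioClassification
import librosa, torch

repos = {
    "valence": "3loi/SER-Odyssey-Baseline-WavLM-Valence",
    # assumed sibling repos, following the same naming scheme:
    "arousal": "3loi/SER-Odyssey-Baseline-WavLM-Arousal",
    "dominance": "3loi/SER-Odyssey-Baseline-WavLM-Dominance",
}

scores = {}
for name, repo in repos.items():
    model = AutoModelForAudioClassification.from_pretrained(repo, trust_remote_code=True)
    wav, _ = librosa.load("/path/to/audio.wav", sr=model.config.sampling_rate)
    norm = (wav - model.config.mean) / (model.config.std + 1e-6)
    wavs = torch.tensor(norm, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        scores[name] = float(model(wavs, torch.ones_like(wavs))[0, 0])
print(scores)
```
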
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/config.json
ADDED
@@ -0,0 +1,26 @@
{
    "architectures": [
        "SERModel"
    ],
    "auto_map": {
        "AutoConfig": "pipeline_utils.SERConfig",
        "AutoModelForAudioClassification": "pipeline_utils.SERModel"
    },
    "id2label": {
        "0": "valence"
    },
    "sampling_rate": 16000,
    "maxlen": 192000,
    "mean": -8.278621631819787e-05,
    "std": 0.08485510250851999,
    "classifier_dropout_prob": 0.5,
    "classifier_hidden_layers": 1,
    "hidden_size": 1024,
    "model_type": "ser",
    "num_attention_heads": 16,
    "num_classes": 1,
    "num_hidden_layers": 24,
    "ssl_type": "microsoft/wavlm-large",
    "torch_dtype": "float32",
    "transformers_version": "4.34.0.dev0"
}

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44449ad4b46e4af5168f29b25055ca67c28ffd44829d11020782c43712bbc8b3
size 1274482316

SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pipeline_utils.py
ADDED
@@ -0,0 +1,165 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig


class Pooling(nn.Module):
    def __init__(self):
        super().__init__()

    def compute_length_from_mask(self, mask):
        """
        mask: (batch_size, T)
        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
        """
        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
        feat_lens = feat_lens.int().tolist()
        return feat_lens

    def forward(self, x, mask):
        raise NotImplementedError


class MeanPooling(Pooling):
    def __init__(self):
        super().__init__()

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
            pooled_list.append(pooled)
        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
        return pooled


class AttentiveStatisticsPooling(Pooling):
    """
    AttentiveStatisticsPooling
    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
    Link: https://arxiv.org/pdf/1803.10963.pdf
    """
    def __init__(self, input_size):
        super().__init__()
        self._indim = input_size
        self.sap_linear = nn.Linear(input_size, input_size)
        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
        torch.nn.init.normal_(self.attention, mean=0, std=1)

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim*2)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            x = x[:feat_len].unsqueeze(0)
            h = torch.tanh(self.sap_linear(x))
            w = torch.matmul(h, self.attention).squeeze(dim=2)
            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
            mu = torch.sum(x * w, dim=1)
            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, rh), 1).squeeze(0)
            pooled_list.append(x)
        return torch.stack(pooled_list)


class EmotionRegression(nn.Module):
    def __init__(self, *args, **kwargs):
        super(EmotionRegression, self).__init__()
        input_dim = args[0]
        hidden_dim = args[1]
        num_layers = args[2]
        output_dim = args[3]
        p = kwargs.get("dropout", 0.5)

        self.fc = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
            )
        ])
        for lidx in range(num_layers - 1):
            self.fc.append(
                nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
                )
            )
        self.out = nn.Sequential(
            nn.Linear(hidden_dim, output_dim)
        )

        self.inp_drop = nn.Dropout(p)

    def get_repr(self, x):
        h = self.inp_drop(x)
        for lidx, fc in enumerate(self.fc):
            h = fc(h)
        return h

    def forward(self, x):
        h = self.get_repr(x)
        result = self.out(h)
        return result


class SERConfig(PretrainedConfig):
    model_type = "ser"

    def __init__(
        self,
        num_classes: int = 1,
        num_attention_heads=16,
        num_hidden_layers=24,
        hidden_size=1024,
        classifier_hidden_layers=1,
        classifier_dropout_prob=0.5,
        ssl_type="microsoft/wavlm-large",
        torch_dtype="float32",
        **kwargs,
    ):
        self.num_classes = num_classes
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.classifier_hidden_layers = classifier_hidden_layers
        self.classifier_dropout_prob = classifier_dropout_prob
        self.ssl_type = ssl_type
        self.torch_dtype = torch_dtype
        super().__init__(**kwargs)


class SERModel(PreTrainedModel):
    config_class = SERConfig

    def __init__(self, config):
        super().__init__(config)
        # WavLM backbone; the convolutional feature encoder is kept frozen
        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
        self.ssl_model.freeze_feature_encoder()

        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)

        self.ser_model = EmotionRegression(config.hidden_size * 2,
                                           config.hidden_size,
                                           config.classifier_hidden_layers,
                                           config.num_classes,
                                           dropout=config.classifier_dropout_prob)

    def forward(self, x, mask):
        # (batch, T_wav) -> (batch, T_feat, hidden_size)
        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
        # (batch, T_feat, hidden_size) -> (batch, hidden_size * 2)
        ssl = self.pool_model(ssl, mask)
        # (batch, hidden_size * 2) -> (batch, num_classes)
        pred = self.ser_model(ssl)
        return pred
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ca8929ea564b56819ed846e96b1a472df11fa39d63f540108bab62c84b269b8
size 1274585617

SER-Odyssey/SER-WavLM-Multi-Attributes/.gitattributes
ADDED
@@ -0,0 +1,37 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tensorrt/trt10_ser_fp16.plan filter=lfs diff=lfs merge=lfs -text
tensorrt/trt8_ser_dyn_fp16.plan filter=lfs diff=lfs merge=lfs -text

SER-Odyssey/SER-WavLM-Multi-Attributes/README.md
ADDED
@@ -0,0 +1,228 @@
---
license: mit
language:
- en
pipeline_tag: audio-classification
tags:
- pytorch
- wavlm
- msp-podcast
- emotion-recognition
- audio
- speech
- valence
- arousal
- dominance
- lucas
- speech-emotion-recognition
---
This model is a recreation of [3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) implemented directly in torch, with an explicit class definition and forward method. It was recreated in the hope of greater flexibility for controlling, training, and fine-tuning the model. The model was trained on the same [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) dataset as the original, but on a smaller subset that is evenly distributed across gender and emotion category, in the hope that this balance would improve the accuracy of the valence and arousal predictions.
This is therefore a multi-attribute model that predicts arousal, dominance, and valence. Unlike the original model, however, it keeps the dataset's original attribute score range of 0...7. I will provide evaluations later on; for now, this repo exists so that others can test the model and judge the inference accuracy for themselves, or retrain it from scratch, modify it, etc. My best trained weights as of now are provided in this repo. The class definition for the model can be found in my [github](https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction#).

# Get class definition
```
git clone https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction.git
```

# Usage
## Inference Testing
```python
import torch
import torchaudio
from SER_Model_setup import SERModel

device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint_path = "<model.pt file>"
checkpoint = torch.load(checkpoint_path, map_location=device)

# Create the model architecture and load weights
model = SERModel()
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

audio_path = "<wav file>"
audio, sr = torchaudio.load(audio_path)

# Resample if needed
if sr != model.sample_rate:
    resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
    audio = resampler(audio)

# Convert stereo -> mono if needed
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

audio_len = audio.shape[-1]

# Create waveform tensor (shape: [1, audio_len])
waveform = torch.zeros(1, audio_len, dtype=torch.float32)
waveform[0, :audio_len] = audio

# Create mask as 2D tensor: shape [1, audio_len] with ones in the valid region
mask = torch.ones(1, audio_len, dtype=torch.float32)

# Move waveform and mask to device
waveform = waveform.to(device)
mask = mask.to(device)

# Normalize waveform using the model's mean and std
mean = model.mean.to(device)
std = model.std.to(device)
waveform = (waveform - mean) / (std + 1e-6)

with torch.no_grad():
    predictions = model(waveform, mask)  # predictions shape: [1, 3]

# Extract predictions: [0,0] for arousal, [0,1] for valence, [0,2] for dominance
arousal = predictions[0, 0].item()
valence = predictions[0, 1].item()
dominance = predictions[0, 2].item()

print(f"Arousal: {arousal:.3f}")
print(f"Valence: {valence:.3f}")
print(f"Dominance: {dominance:.3f}")
```
## Batch inference
```python
import os
import glob
import torch
import torchaudio
from SER_Model_setup import SERModel  # Adjust if your model code is elsewhere

def load_model_from_checkpoint(checkpoint_path, device='cpu'):
    """
    Loads the SERModel and weights from a checkpoint, moves to device, sets eval mode.
    """
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Create the model architecture
    model = SERModel()
    model.load_state_dict(checkpoint['model_state_dict'])

    model.to(device)
    model.eval()
    return model

def batch_inference(model, file_paths, device='cpu', normalize=True):
    """
    Perform true batch inference on multiple .wav files in one forward pass.

    Args:
        model (SERModel): The loaded SER model in eval mode
        file_paths (list[str]): List of paths to .wav files
        device (str or torch.device): 'cpu' or 'cuda'
        normalize (bool): Whether to normalize waveforms (subtract mean, divide std)

    Returns:
        dict: {filename: {"arousal": float, "valence": float, "dominance": float}}
    """

    # 1) Load & store all waveforms in memory
    waveforms_list = []
    lengths = []
    for fp in file_paths:
        # Load audio
        audio, sr = torchaudio.load(fp)

        # Resample if needed
        if sr != model.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
            audio = resampler(audio)

        # Convert stereo -> mono if needed
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        # audio shape => [1, num_samples]
        lengths.append(audio.shape[-1])
        waveforms_list.append(audio)

    # 2) Determine max length
    max_len = max(lengths)

    # 3) Pad each waveform to max length & build masks
    batch_size = len(waveforms_list)
    batched_waveforms = torch.zeros(batch_size, 1, max_len, dtype=torch.float32)
    masks = torch.zeros(batch_size, max_len, dtype=torch.float32)

    for i, audio in enumerate(waveforms_list):
        cur_len = audio.shape[-1]
        batched_waveforms[i, :, :cur_len] = audio
        masks[i, :cur_len] = 1.0  # valid portion

    # 4) Move batched data to device BEFORE normalization
    batched_waveforms = batched_waveforms.to(device)
    masks = masks.to(device)

    # 5) Normalize if needed (model.mean, model.std)
    if normalize:
        # model.mean and model.std are buffers; ensure they're on the correct device
        mean = model.mean.to(device)
        std = model.std.to(device)
        batched_waveforms = (batched_waveforms - mean) / (std + 1e-6)

    # 6) Single forward pass
    with torch.no_grad():
        predictions = model(batched_waveforms, masks)
        # predictions shape => [batch_size, 3]

    # 7) Build result dict
    results = {}
    for i, fp in enumerate(file_paths):
        arousal = predictions[i, 0].item()
        valence = predictions[i, 1].item()
        dominance = predictions[i, 2].item()
        filename = os.path.basename(fp)
        results[filename] = {
            "arousal": arousal,
            "valence": valence,
            "dominance": dominance
        }

    return results

if __name__ == "__main__":
    # Example usage
    device = "cuda" if torch.cuda.is_available() else "cpu"

    checkpoint_path = "<weights.pt>"
    model = load_model_from_checkpoint(checkpoint_path, device=device)

    # Suppose you have a folder of .wav files
    wav_folder = "<directory containing .wav files>"
    wav_paths = glob.glob(os.path.join(wav_folder, "*.wav"))

    # Do a single pass of batch inference
    all_results = batch_inference(model, wav_paths, device=device, normalize=True)

    # Print results
    for fname, preds in all_results.items():
        print(f"{fname}: Arousal={preds['arousal']:.3f}, "
              f"Valence={preds['valence']:.3f}, Dominance={preds['dominance']:.3f}")
```
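One small note on the batch helper above: it builds a new `torchaudio.transforms.Resample` for every file. When many files share the same source rate, caching one transform per rate avoids recomputing its filter bank; a minimal sketch:

```python
import torchaudio

_resamplers = {}

def resample_cached(audio, sr, target_sr):
    """Resample to target_sr, reusing one Resample module per source rate."""
    if sr == target_sr:
        return audio
    if sr not in _resamplers:
        _resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
    return _resamplers[sr](audio)
```
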
SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ReadMe
ADDED
@@ -0,0 +1 @@
model in onnx format
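A minimal sketch of running `ser_dyn.onnx` with onnxruntime. The input/output tensor names are not documented in this repo, so the sketch reads them from the session instead of assuming them; the (waveform, mask) ordering is an assumption carried over from the PyTorch usage above:

```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("ser_dyn.onnx", providers=["CPUExecutionProvider"])
input_names = [i.name for i in sess.get_inputs()]  # discover rather than assume

wav = np.random.randn(1, 16000).astype(np.float32)  # stand-in for a normalized waveform
mask = np.ones((1, 16000), dtype=np.float32)
outputs = sess.run(None, dict(zip(input_names, [wav, mask])))
print(outputs[0])  # per the README above: [arousal, valence, dominance]
```
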
SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ser_dyn.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dad8465907c9dcfaa47628d7e8401a281396c17fe17b3c8b72071279cb6b2cac
size 1274295745

SER-Odyssey/SER-WavLM-Multi-Attributes/pytorch/best_weights.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:809f5b3ef98835b5ca9dcf9d0efb4bd6cf0a9cc458cfb9443ae07ef71b44f670
size 1299851786

SER-Odyssey/SER-WavLM-Multi-Attributes/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/uyiosa/SER-WavLM-Multi-Attributes
SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/ReadMe
ADDED
@@ -0,0 +1,2 @@
trt10 -- compiled with TensorRT version 10
trt8 -- compiled with TensorRT version 8
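A serialized TensorRT engine can only be deserialized by a compatible TensorRT major version, which is presumably why both a trt8 and a trt10 plan are shipped. A minimal sketch of loading one of them with the TensorRT Python API:

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("trt10_ser_fp16.plan", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
# deserialization returns None (and logs an error) if the plan
# was built with an incompatible TensorRT version
print(engine is not None)
```
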
SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed792e1cd7a6e6f1d89413b5800da2b7328c40483d8532d8b0bc2e74444e0516
size 644044452

SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6e157bb3cc07c0e808a93815a478874a092cd9f172abf83c1822ce9b9f1d55d
size 643712772

SER-Odyssey/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/3loi/models
WavLM. Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ca8836ebdf8236e610187738217d4c91c5ead13873472e476423f1561e9238e
size 929604

tiny-random-WavLMForAudioFrameClassification-ONNX/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

tiny-random-WavLMForAudioFrameClassification-ONNX/config.json
ADDED
@@ -0,0 +1,88 @@
{
    "_attn_implementation_autoset": true,
    "_name_or_path": "hf-internal-testing/tiny-random-WavLMForAudioFrameClassification",
    "activation_dropout": 0.1,
    "adapter_kernel_size": 3,
    "adapter_stride": 2,
    "add_adapter": false,
    "apply_spec_augment": true,
    "architectures": [
        "WavLMForAudioFrameClassification"
    ],
    "attention_dropout": 0.1,
    "bos_token_id": 1,
    "classifier_proj_size": 256,
    "codevector_dim": 256,
    "contrastive_logits_temperature": 0.1,
    "conv_bias": false,
    "conv_dim": [
        32,
        32,
        32
    ],
    "conv_kernel": [
        8,
        8,
        8
    ],
    "conv_stride": [
        4,
        4,
        4
    ],
    "ctc_loss_reduction": "mean",
    "ctc_zero_infinity": false,
    "diversity_loss_weight": 0.1,
    "do_stable_layer_norm": false,
    "eos_token_id": 2,
    "feat_extract_activation": "gelu",
    "feat_extract_dropout": 0.0,
    "feat_extract_norm": "group",
    "feat_proj_dropout": 0.0,
    "final_dropout": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout": 0.1,
    "hidden_dropout_prob": 0.1,
    "hidden_size": 16,
    "initializer_range": 0.02,
    "intermediate_size": 20,
    "layer_norm_eps": 1e-05,
    "layerdrop": 0.1,
    "mask_feature_length": 10,
    "mask_feature_prob": 0.0,
    "mask_time_length": 10,
    "mask_time_min_masks": 2,
    "mask_time_prob": 0.05,
    "max_bucket_distance": 800,
    "model_type": "wavlm",
    "num_adapter_layers": 3,
    "num_attention_heads": 2,
    "num_buckets": 320,
    "num_codevector_groups": 2,
    "num_codevectors_per_group": 320,
    "num_conv_pos_embedding_groups": 2,
    "num_conv_pos_embeddings": 16,
    "num_ctc_classes": 80,
    "num_feat_extract_layers": 3,
    "num_hidden_layers": 4,
    "num_negatives": 100,
    "output_hidden_size": 16,
    "pad_token_id": 0,
    "proj_codevector_dim": 256,
    "tdnn_dilation": [
        1,
        1
    ],
    "tdnn_dim": [
        32,
        32
    ],
    "tdnn_kernel": [
        3,
        3
    ],
    "transformers_version": "4.48.2",
    "use_weighted_layer_sum": false,
    "vocab_size": 32,
    "xvector_output_dim": 32
}

tiny-random-WavLMForAudioFrameClassification-ONNX/onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43d68f66c0eb42e09d03c533d705eabde0fd481635fdff874b9d94ae4445b550
size 276448