niobures committed on
Commit e36896b · verified · 1 parent: 713778e
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. SER-Odyssey/Baseline_Model.pdf +3 -0
  3. SER-Odyssey/MSP-Podcast_Challenge [JMasr] +48 -24.zip +3 -0
  4. SER-Odyssey/MSP-Podcast_Challenge.zip +3 -0
  5. SER-Odyssey/Odyssey 2024 - Speech Emotion Recognition Challenge. Dataset, Baseline, Framework, and Results.pdf +3 -0
  6. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/.gitattributes +35 -0
  7. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/README.md +83 -0
  8. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/config.json +26 -0
  9. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/model.safetensors +3 -0
  10. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pipeline_utils.py +165 -0
  11. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pytorch_model.bin +3 -0
  12. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/.gitattributes +35 -0
  13. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/README.md +87 -0
  14. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/config.json +32 -0
  15. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/model.safetensors +3 -0
  16. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pipeline_utils.py +171 -0
  17. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pytorch_model.bin +3 -0
  18. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/.gitattributes +35 -0
  19. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/README.md +83 -0
  20. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/config.json +26 -0
  21. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/model.safetensors +3 -0
  22. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pipeline_utils.py +165 -0
  23. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pytorch_model.bin +3 -0
  24. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/.gitattributes +35 -0
  25. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/README.md +84 -0
  26. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/config.json +28 -0
  27. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/model.safetensors +3 -0
  28. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pipeline_utils.py +167 -0
  29. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/preprocessor_config.json +3 -0
  30. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pytorch_model.bin +3 -0
  31. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/.gitattributes +35 -0
  32. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/README.md +83 -0
  33. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/config.json +26 -0
  34. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/model.safetensors +3 -0
  35. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pipeline_utils.py +165 -0
  36. SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pytorch_model.bin +3 -0
  37. SER-Odyssey/SER-WavLM-Multi-Attributes/.gitattributes +37 -0
  38. SER-Odyssey/SER-WavLM-Multi-Attributes/README.md +228 -0
  39. SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ReadMe +1 -0
  40. SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ser_dyn.onnx +3 -0
  41. SER-Odyssey/SER-WavLM-Multi-Attributes/pytorch/best_weights.pt +3 -0
  42. SER-Odyssey/SER-WavLM-Multi-Attributes/source.txt +1 -0
  43. SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/ReadMe +2 -0
  44. SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan +3 -0
  45. SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan +3 -0
  46. SER-Odyssey/source.txt +1 -0
  47. WavLM. Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing.pdf +3 -0
  48. tiny-random-WavLMForAudioFrameClassification-ONNX/.gitattributes +35 -0
  49. tiny-random-WavLMForAudioFrameClassification-ONNX/config.json +88 -0
  50. tiny-random-WavLMForAudioFrameClassification-ONNX/onnx/model.onnx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ SER-Odyssey/Baseline_Model.pdf filter=lfs diff=lfs merge=lfs -text
+ SER-Odyssey/Odyssey[[:space:]]2024[[:space:]]-[[:space:]]Speech[[:space:]]Emotion[[:space:]]Recognition[[:space:]]Challenge.[[:space:]]Dataset,[[:space:]]Baseline,[[:space:]]Framework,[[:space:]]and[[:space:]]Results.pdf filter=lfs diff=lfs merge=lfs -text
+ SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan filter=lfs diff=lfs merge=lfs -text
+ SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan filter=lfs diff=lfs merge=lfs -text
+ wavlm-large-mnn/wavlm_large_fp16.mnn filter=lfs diff=lfs merge=lfs -text
+ wavlm-large-mnn/wavlm_large_int8.mnn filter=lfs diff=lfs merge=lfs -text
+ WavLM.[[:space:]]Large-Scale[[:space:]]Self-Supervised[[:space:]]Pre-Training[[:space:]]for[[:space:]]Full[[:space:]]Stack[[:space:]]Speech[[:space:]]Processing.pdf filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/Baseline_Model.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2006e79620902e9b411dd8e110f296c9e7d2458110faa8043d900187f203e103
+ size 460836
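The three lines above are a standard Git LFS pointer (spec version, object hash, byte size); the actual PDF lives in LFS storage. A minimal sketch of reading such a pointer, assuming a local checkout (the path is just one of the pointer files in this commit):

```python
# Parse the three-line Git LFS pointer format shown above.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("SER-Odyssey/Baseline_Model.pdf")
print(ptr["oid"], ptr["size"])  # e.g. sha256:2006e796... 460836
```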
SER-Odyssey/MSP-Podcast_Challenge [JMasr] +48 -24.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b53455da505412b271968d94febb011505ef41201826ba048dc7308306838a04
+ size 895217
SER-Odyssey/MSP-Podcast_Challenge.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c86e9700cc05734431656503b7602aa0a5f9b60be4a5a02238e87121324055a7
+ size 897745
SER-Odyssey/Odyssey 2024 - Speech Emotion Recognition Challenge. Dataset, Baseline, Framework, and Results.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11f79ccab188b27218b3c5038fbec0ef21e0dca0d08af3d998e77b993d0ed31c
+ size 1083858
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - arousal
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
+ This particular model is the single-task specialized arousal model, which predicts arousal in a range of approximately 0...1.
+
+
+
+ # Benchmarks
+ CCC scores on the Test 3 and Development sets of the Odyssey competition
+ <table style="width:500px">
+ <tr><th colspan=2 align="center"> Single-Task Setup </th></tr>
+ <tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
+ <tr> <td align="center">Aro</td> <td align="center">Aro</td> </tr>
+ <tr> <td align="center">0.566</td> <td align="center">0.651</td> </tr>
+ </table>
+
+
+
+ For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+ ```
+ @InProceedings{Goncalves_2024,
+   author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+   title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+   booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+   volume={To appear},
+   year={2024},
+   month={June},
+   address={Quebec, Canada},
+ }
+ ```
+
+
+ # Usage
+ ```python
+ from transformers import AutoModelForAudioClassification
+ import librosa, torch
+
+ # load the model
+ model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Arousal", trust_remote_code=True)
+
+ # get the normalization mean/std
+ mean = model.config.mean
+ std = model.config.std
+
+
+ # load an audio file at the model's sampling rate
+ audio_path = "/path/to/audio.wav"
+ raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+ # normalize the audio by mean/std
+ norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+ # generate the mask (all ones for a single, unpadded clip)
+ mask = torch.ones(1, len(norm_wav))
+
+ # batch it (add a batch dimension)
+ wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+ # predict
+ with torch.no_grad():
+     pred = model(wavs, mask)
+
+ print(model.config.id2label)
+ print(pred)
+ # {0: 'arousal'}
+ # tensor([[0.3670]])
+ ```
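The README snippet covers a single clip, where the mask is all ones. For clips of different lengths in one batch, the mask has to mark real samples versus padding. A minimal sketch building on the snippet above (`model`, `mean`, and `std` as loaded there; the file paths are placeholders):

```python
import librosa, torch

paths = ["a.wav", "b.wav"]  # placeholder files of different lengths
wavs = [(librosa.load(p, sr=model.config.sampling_rate)[0] - mean) / (std + 0.000001)
        for p in paths]

# Zero-pad to the longest clip; the mask is 1 for real samples, 0 for padding.
maxlen = max(len(w) for w in wavs)
batch = torch.zeros(len(wavs), maxlen)
mask = torch.zeros(len(wavs), maxlen)
for i, w in enumerate(wavs):
    batch[i, :len(w)] = torch.tensor(w)
    mask[i, :len(w)] = 1

with torch.no_grad():
    pred = model(batch, mask)  # one arousal score per clip
```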
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "SERModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "pipeline_utils.SERConfig",
+     "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+   },
+   "id2label": {
+     "0": "arousal"
+   },
+   "sampling_rate": 16000,
+   "maxlen": 192000,
+   "mean": -8.278621631819787e-05,
+   "std": 0.08485510250851999,
+   "classifier_dropout_prob": 0.5,
+   "classifier_hidden_layers": 1,
+   "hidden_size": 1024,
+   "model_type": "ser",
+   "num_attention_heads": 16,
+   "num_classes": 1,
+   "num_hidden_layers": 24,
+   "ssl_type": "microsoft/wavlm-large",
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.0.dev0"
+ }
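Because `auto_map` points `AutoConfig` at `pipeline_utils.SERConfig`, the fields above can be inspected without downloading the weights. A small sketch (the Hub id is the one used in the README):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Arousal",
                                 trust_remote_code=True)
print(cfg.id2label)       # {0: 'arousal'}
print(cfg.sampling_rate)  # 16000
print(cfg.mean, cfg.std)  # waveform normalization statistics
```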
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6513eca66ff2f599b248059ad44c41fef39d61b5cfc4995f777022c42c07106c
+ size 1274482316
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pipeline_utils.py ADDED
@@ -0,0 +1,165 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModel
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+ class Pooling(nn.Module):
+     def __init__(self):
+         super().__init__()
+     def compute_length_from_mask(self, mask):
+         """
+         mask: (batch_size, T)
+         Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+         """
+         wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+         feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+         feat_lens = feat_lens.int().tolist()
+         return feat_lens
+
+     def forward(self, x, mask):
+         raise NotImplementedError
+
+ class MeanPooling(Pooling):
+     def __init__(self):
+         super().__init__()
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+             pooled_list.append(pooled)
+         pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+         return pooled
+
+
+ class AttentiveStatisticsPooling(Pooling):
+     """
+     AttentiveStatisticsPooling
+     Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+     Link: https://arxiv.org/pdf/1803.10963.pdf
+     """
+     def __init__(self, input_size):
+         super().__init__()
+         self._indim = input_size
+         self.sap_linear = nn.Linear(input_size, input_size)
+         self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+         torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim*2)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             x = x[:feat_len].unsqueeze(0)
+             h = torch.tanh(self.sap_linear(x))
+             w = torch.matmul(h, self.attention).squeeze(dim=2)
+             w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+             mu = torch.sum(x * w, dim=1)
+             rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+             x = torch.cat((mu, rh), 1).squeeze(0)
+             pooled_list.append(x)
+         return torch.stack(pooled_list)
+
+
+
+
+ class EmotionRegression(nn.Module):
+     def __init__(self, *args, **kwargs):
+         super(EmotionRegression, self).__init__()
+         input_dim = args[0]
+         hidden_dim = args[1]
+         num_layers = args[2]
+         output_dim = args[3]
+         p = kwargs.get("dropout", 0.5)
+
+         self.fc = nn.ModuleList([
+             nn.Sequential(
+                 nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+             )
+         ])
+         for lidx in range(num_layers - 1):
+             self.fc.append(
+                 nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                 )
+             )
+         self.out = nn.Sequential(
+             nn.Linear(hidden_dim, output_dim)
+         )
+
+         self.inp_drop = nn.Dropout(p)
+     def get_repr(self, x):
+         h = self.inp_drop(x)
+         for lidx, fc in enumerate(self.fc):
+             h = fc(h)
+         return h
+
+     def forward(self, x):
+         h = self.get_repr(x)
+         result = self.out(h)
+         return result
+
+ class SERConfig(PretrainedConfig):
+     model_type = "ser"
+
+     def __init__(
+         self,
+         num_classes: int = 1,
+         num_attention_heads=16,
+         num_hidden_layers=24,
+         hidden_size=1024,
+         classifier_hidden_layers=1,
+         classifier_dropout_prob=0.5,
+         ssl_type="microsoft/wavlm-large",
+         torch_dtype="float32",
+         **kwargs,
+     ):
+         self.num_classes = num_classes
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.classifier_hidden_layers = classifier_hidden_layers
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.ssl_type = ssl_type
+         self.torch_dtype = torch_dtype
+         super().__init__(**kwargs)
+
+ class SERModel(PreTrainedModel):
+     config_class = SERConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+         self.ssl_model.freeze_feature_encoder()
+
+         self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+         self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                            config.hidden_size,
+                                            config.classifier_hidden_layers,
+                                            config.num_classes,
+                                            dropout=config.classifier_dropout_prob)
+
+
+     def forward(self, x, mask):
+         ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+         ssl = self.pool_model(ssl, mask)
+
+         pred = self.ser_model(ssl)
+
+         return pred
+
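`compute_length_from_mask` converts waveform sample counts into WavLM frame counts using the 20 ms frame shift (320 samples at 16 kHz). A quick check of the arithmetic for a one-second clip, where floor((16000 − 1) / 320) + 1 = 50 frames:

```python
import torch

mask = torch.ones(1, 16000)  # one second of audio at 16 kHz
wav_lens = mask.sum(dim=1)
feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
print(feat_lens.int().tolist())  # [50]
```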
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Arousal/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f8a77ea0603b9ab91b3ce1d03c165db58d95ebae1c4210ab12dc94459c36b60
+ size 1274585617
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/README.md ADDED
@@ -0,0 +1,87 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - categorical
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
+ This particular model is the categorical model, which predicts: "Angry", "Sad", "Happy", "Surprise", "Fear", "Disgust", "Contempt", and "Neutral".
+
+
+ # Benchmarks
+ F1 scores on the Test 3 and Development sets of the Odyssey competition
+ <table style="width:500px">
+ <tr><th colspan=8 align="center">Categorical Setup</th></tr>
+ <tr><th colspan=4 align="center">Test 3</th><th colspan=4 align="center">Development</th></tr>
+ <tr> <td>F1-Mic.</td> <td>F1-Ma.</td> <td>Prec.</td> <td>Rec.</td> <td>F1-Mic.</td> <td>F1-Ma.</td> <td>Prec.</td> <td>Rec.</td> </tr>
+ <tr> <td>0.327</td> <td>0.311</td> <td>0.332</td> <td>0.325</td> <td>0.409</td> <td>0.307</td> <td>0.316</td> <td>0.345</td> </tr>
+ </table>
+
+
+
+ For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+ ```
+ @InProceedings{Goncalves_2024,
+   author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+   title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+   booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+   volume={To appear},
+   year={2024},
+   month={June},
+   address={Quebec, Canada},
+ }
+ ```
+
+
+ # Usage
+ ```python
+ from transformers import AutoModelForAudioClassification
+ import librosa, torch
+
+ # load the model
+ model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True)
+
+ # get the normalization mean/std
+ mean = model.config.mean
+ std = model.config.std
+
+
+ # load an audio file at the model's sampling rate
+ audio_path = "/path/to/audio.wav"
+ raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+ # normalize the audio by mean/std
+ norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+ # generate the mask (all ones for a single, unpadded clip)
+ mask = torch.ones(1, len(norm_wav))
+
+ # batch it (add a batch dimension)
+ wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+ # predict
+ with torch.no_grad():
+     pred = model(wavs, mask)
+
+ print(model.config.id2label)
+ print(pred)
+ # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
+ # tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
+
+ # convert the logits to probabilities
+ probabilities = torch.nn.functional.softmax(pred, dim=1)
+ print(probabilities)
+ # [[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]]
+ ```
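Continuing the README example, the predicted class and its name can be read off with an argmax over the probabilities (`pred` and `model` as in the snippet above):

```python
import torch

probs = torch.nn.functional.softmax(pred, dim=1)
top = probs.argmax(dim=1).item()
print(model.config.id2label[top], probs[0, top].item())  # e.g. Neutral 0.44
```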
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "SERModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "pipeline_utils.SERConfig",
+     "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+   },
+   "id2label": {
+     "0": "Angry",
+     "1": "Sad",
+     "2": "Happy",
+     "3": "Surprise",
+     "4": "Fear",
+     "5": "Disgust",
+     "6": "Contempt",
+     "7": "Neutral"
+   },
+   "sampling_rate": 16000,
+   "classifier_dropout_prob": 0.5,
+   "classifier_hidden_layers": 1,
+   "hidden_size": 1024,
+   "mean": -8.278621631819787e-05,
+   "model_type": "ser",
+   "num_attention_heads": 16,
+   "num_classes": 8,
+   "num_hidden_layers": 24,
+   "ssl_type": "microsoft/wavlm-large",
+   "std": 0.08485510250851999,
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.0.dev0"
+ }
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb52f3f472b6a5a824ac238537fa60bf39a73d74b3fa5f4a4473c012cb3d18f4
+ size 1274511016
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pipeline_utils.py ADDED
@@ -0,0 +1,171 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModel
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+ class Pooling(nn.Module):
+     def __init__(self):
+         super().__init__()
+     def compute_length_from_mask(self, mask):
+         """
+         mask: (batch_size, T)
+         Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+         """
+         wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+         feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+         feat_lens = feat_lens.int().tolist()
+         return feat_lens
+
+     def forward(self, x, mask):
+         raise NotImplementedError
+
+ class MeanPooling(Pooling):
+     def __init__(self):
+         super().__init__()
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+             pooled_list.append(pooled)
+         pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+         return pooled
+
+
+ class AttentiveStatisticsPooling(Pooling):
+     """
+     AttentiveStatisticsPooling
+     Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+     Link: https://arxiv.org/pdf/1803.10963.pdf
+     """
+     def __init__(self, input_size):
+         super().__init__()
+         self._indim = input_size
+         self.sap_linear = nn.Linear(input_size, input_size)
+         self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+         torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim*2)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             x = x[:feat_len].unsqueeze(0)
+             h = torch.tanh(self.sap_linear(x))
+             w = torch.matmul(h, self.attention).squeeze(dim=2)
+             w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+             mu = torch.sum(x * w, dim=1)
+             rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+             x = torch.cat((mu, rh), 1).squeeze(0)
+             pooled_list.append(x)
+         return torch.stack(pooled_list)
+
+
+
+
+ class EmotionRegression(nn.Module):
+     def __init__(self, *args, **kwargs):
+         super(EmotionRegression, self).__init__()
+         input_dim = args[0]
+         hidden_dim = args[1]
+         num_layers = args[2]
+         output_dim = args[3]
+         p = kwargs.get("dropout", 0.5)
+
+         self.fc = nn.ModuleList([
+             nn.Sequential(
+                 nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+             )
+         ])
+         for lidx in range(num_layers - 1):
+             self.fc.append(
+                 nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                 )
+             )
+         self.out = nn.Sequential(
+             nn.Linear(hidden_dim, output_dim)
+         )
+
+         self.inp_drop = nn.Dropout(p)
+     def get_repr(self, x):
+         h = self.inp_drop(x)
+         for lidx, fc in enumerate(self.fc):
+             h = fc(h)
+         return h
+
+     def forward(self, x):
+         h = self.get_repr(x)
+         result = self.out(h)
+         return result
+
+
+ class SERConfig(PretrainedConfig):
+     model_type = "ser"
+
+     def __init__(
+         self,
+         num_classes: int = 8,
+         num_attention_heads=16,
+         num_hidden_layers=24,
+         hidden_size=1024,
+         classifier_hidden_layers=1,
+         classifier_dropout_prob=0.5,
+         ssl_type="microsoft/wavlm-large",
+         torch_dtype="float32",
+         mean=-8.278621631819787e-05,
+         std=0.08485510250851999,
+         **kwargs,
+     ):
+         self.num_classes = num_classes
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.classifier_hidden_layers = classifier_hidden_layers
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.ssl_type = ssl_type
+         self.torch_dtype = torch_dtype
+
+         self.mean = mean
+         self.std = std
+         super().__init__(**kwargs)
+
+ class SERModel(PreTrainedModel):
+     config_class = SERConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+         self.ssl_model.freeze_feature_encoder()
+
+         self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+         self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                            config.hidden_size,
+                                            config.classifier_hidden_layers,
+                                            config.num_classes,
+                                            dropout=config.classifier_dropout_prob)
+
+
+     def forward(self, x, mask):
+         ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+         ssl = self.pool_model(ssl, mask)
+
+         pred = self.ser_model(ssl)
+
+         return pred
+
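The `EmotionRegression` head takes the pooled 2×hidden_size vector (2048 for WavLM-large, since attentive statistics pooling concatenates mean and standard deviation) and maps it to the eight classes. A quick shape check, assuming it is run from inside this model folder so `pipeline_utils` is importable:

```python
import torch
from pipeline_utils import EmotionRegression

# input_dim, hidden_dim, num_layers, output_dim as positional args
head = EmotionRegression(2048, 1024, 1, 8, dropout=0.5)
pooled = torch.randn(4, 2048)  # stand-in for AttentiveStatisticsPooling output
print(head(pooled).shape)  # torch.Size([4, 8])
```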
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Categorical/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:446f71c92a67b69977c50b065a0e418c37fa20aba1d2e44ecb1190d97f9c0cbb
+ size 1274614289
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - dominance
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
+ This particular model is the single-task specialized dominance model, which predicts dominance in a range of approximately 0...1.
+
+
+
+ # Benchmarks
+ CCC scores on the Test 3 and Development sets of the Odyssey competition
+ <table style="width:500px">
+ <tr><th colspan=2 align="center"> Single-Task Setup </th></tr>
+ <tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
+ <tr> <td align="center">Dom</td> <td align="center">Dom</td> </tr>
+ <tr> <td align="center">0.424</td> <td align="center">0.584</td> </tr>
+ </table>
+
+
+
+ For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+ ```
+ @InProceedings{Goncalves_2024,
+   author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+   title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+   booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+   volume={To appear},
+   year={2024},
+   month={June},
+   address={Quebec, Canada},
+ }
+ ```
+
+
+ # Usage
+ ```python
+ from transformers import AutoModelForAudioClassification
+ import librosa, torch
+
+ # load the model
+ model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Dominance", trust_remote_code=True)
+
+ # get the normalization mean/std
+ mean = model.config.mean
+ std = model.config.std
+
+
+ # load an audio file at the model's sampling rate
+ audio_path = "/path/to/audio.wav"
+ raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+ # normalize the audio by mean/std
+ norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+ # generate the mask (all ones for a single, unpadded clip)
+ mask = torch.ones(1, len(norm_wav))
+
+ # batch it (add a batch dimension)
+ wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+ # predict
+ with torch.no_grad():
+     pred = model(wavs, mask)
+
+ print(model.config.id2label)
+ print(pred)
+ # {0: 'dominance'}
+ # tensor([[0.3670]])
+ ```
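The tables in these READMEs report CCC, the concordance correlation coefficient. For reference, a self-contained sketch of the metric as usually defined; this is my own implementation, not code from the repo:

```python
import torch

def ccc(x, y):
    # CCC = 2*cov(x, y) / (var(x) + var(y) + (mean(x) - mean(y))**2)
    mx, my = x.mean(), y.mean()
    vx, vy = x.var(unbiased=False), y.var(unbiased=False)
    cov = ((x - mx) * (y - my)).mean()
    return 2 * cov / (vx + vy + (mx - my) ** 2)

print(ccc(torch.randn(100), torch.randn(100)))  # near 0 for unrelated series
```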
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "SERModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "pipeline_utils.SERConfig",
+     "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+   },
+   "id2label": {
+     "0": "dominance"
+   },
+   "sampling_rate": 16000,
+   "maxlen": 192000,
+   "mean": -8.278621631819787e-05,
+   "std": 0.08485510250851999,
+   "classifier_dropout_prob": 0.5,
+   "classifier_hidden_layers": 1,
+   "hidden_size": 1024,
+   "model_type": "ser",
+   "num_attention_heads": 16,
+   "num_classes": 1,
+   "num_hidden_layers": 24,
+   "ssl_type": "microsoft/wavlm-large",
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.0.dev0"
+ }
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5279e2387d029fb3c7529830546a876518bc32e264d61a21a593d708c9491e0
+ size 1274482316
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pipeline_utils.py ADDED
@@ -0,0 +1,165 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModel
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+ class Pooling(nn.Module):
+     def __init__(self):
+         super().__init__()
+     def compute_length_from_mask(self, mask):
+         """
+         mask: (batch_size, T)
+         Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+         """
+         wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+         feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+         feat_lens = feat_lens.int().tolist()
+         return feat_lens
+
+     def forward(self, x, mask):
+         raise NotImplementedError
+
+ class MeanPooling(Pooling):
+     def __init__(self):
+         super().__init__()
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+             pooled_list.append(pooled)
+         pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+         return pooled
+
+
+ class AttentiveStatisticsPooling(Pooling):
+     """
+     AttentiveStatisticsPooling
+     Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+     Link: https://arxiv.org/pdf/1803.10963.pdf
+     """
+     def __init__(self, input_size):
+         super().__init__()
+         self._indim = input_size
+         self.sap_linear = nn.Linear(input_size, input_size)
+         self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+         torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim*2)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             x = x[:feat_len].unsqueeze(0)
+             h = torch.tanh(self.sap_linear(x))
+             w = torch.matmul(h, self.attention).squeeze(dim=2)
+             w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+             mu = torch.sum(x * w, dim=1)
+             rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+             x = torch.cat((mu, rh), 1).squeeze(0)
+             pooled_list.append(x)
+         return torch.stack(pooled_list)
+
+
+
+
+ class EmotionRegression(nn.Module):
+     def __init__(self, *args, **kwargs):
+         super(EmotionRegression, self).__init__()
+         input_dim = args[0]
+         hidden_dim = args[1]
+         num_layers = args[2]
+         output_dim = args[3]
+         p = kwargs.get("dropout", 0.5)
+
+         self.fc = nn.ModuleList([
+             nn.Sequential(
+                 nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+             )
+         ])
+         for lidx in range(num_layers - 1):
+             self.fc.append(
+                 nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                 )
+             )
+         self.out = nn.Sequential(
+             nn.Linear(hidden_dim, output_dim)
+         )
+
+         self.inp_drop = nn.Dropout(p)
+     def get_repr(self, x):
+         h = self.inp_drop(x)
+         for lidx, fc in enumerate(self.fc):
+             h = fc(h)
+         return h
+
+     def forward(self, x):
+         h = self.get_repr(x)
+         result = self.out(h)
+         return result
+
+ class SERConfig(PretrainedConfig):
+     model_type = "ser"
+
+     def __init__(
+         self,
+         num_classes: int = 1,
+         num_attention_heads=16,
+         num_hidden_layers=24,
+         hidden_size=1024,
+         classifier_hidden_layers=1,
+         classifier_dropout_prob=0.5,
+         ssl_type="microsoft/wavlm-large",
+         torch_dtype="float32",
+         **kwargs,
+     ):
+         self.num_classes = num_classes
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.classifier_hidden_layers = classifier_hidden_layers
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.ssl_type = ssl_type
+         self.torch_dtype = torch_dtype
+         super().__init__(**kwargs)
+
+ class SERModel(PreTrainedModel):
+     config_class = SERConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+         self.ssl_model.freeze_feature_encoder()
+
+         self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+         self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                            config.hidden_size,
+                                            config.classifier_hidden_layers,
+                                            config.num_classes,
+                                            dropout=config.classifier_dropout_prob)
+
+
+     def forward(self, x, mask):
+         ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+         ssl = self.pool_model(ssl, mask)
+
+         pred = self.ser_model(ssl)
+
+         return pred
+
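`freeze_feature_encoder()` stops gradients through WavLM's convolutional front end while the transformer layers stay trainable. A quick sanity check, assuming `model` is a loaded `SERModel`:

```python
# Frozen parameters should all come from the convolutional feature extractor.
frozen = [n for n, p in model.ssl_model.named_parameters() if not p.requires_grad]
print(len(frozen), frozen[:2])
```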
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Dominance/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6fc0167d183d89114be10df1c4e4f74040b558408efee99a71fcf5205865ef2
+ size 1274585617
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/README.md ADDED
@@ -0,0 +1,84 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - valence
+ - arousal
+ - dominance
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) as the baseline for the Odyssey 2024 Emotion Recognition competition.<br>
+ This particular model is the multi-attribute model, which predicts arousal, dominance, and valence in a range of approximately 0...1.
+
+
+ # Benchmarks
+ CCC scores on the Test 3 and Development sets of the Odyssey competition
+ <table style="width:500px">
+ <tr><th colspan=6 align="center">Multi-Task Setup</th></tr>
+ <tr><th colspan=3 align="center">Test 3</th><th colspan=3 align="center">Development</th></tr>
+ <tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
+ <tr> <td>0.577</td> <td>0.577</td> <td>0.405</td> <td>0.652</td> <td>0.688</td> <td>0.579</td> </tr>
+ </table>
+
+
+
+ For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+ ```
+ @InProceedings{Goncalves_2024,
+   author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+   title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+   booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+   volume={To appear},
+   year={2024},
+   month={June},
+   address={Quebec, Canada},
+ }
+ ```
+
+
+ # Usage
+ ```python
+ from transformers import AutoModelForAudioClassification
+ import librosa, torch
+
+ # load the model
+ model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes", trust_remote_code=True)
+
+ # get the normalization mean/std
+ mean = model.config.mean
+ std = model.config.std
+
+
+ # load an audio file at the model's sampling rate
+ audio_path = "/path/to/audio.wav"
+ raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+ # normalize the audio by mean/std
+ norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+ # generate the mask (all ones for a single, unpadded clip)
+ mask = torch.ones(1, len(norm_wav))
+
+ # batch it (add a batch dimension)
+ wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+ # predict
+ with torch.no_grad():
+     pred = model(wavs, mask)
+
+ print(model.config.id2label)
+ print(pred)
+ # {0: 'arousal', 1: 'dominance', 2: 'valence'}
+ # tensor([[0.3670, 0.4553, 0.4240]])
+ ```
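Continuing the README example, the three outputs line up with `id2label`, so they can be unpacked into a dict (`pred` and `model` as in the snippet above):

```python
scores = {model.config.id2label[i]: v.item() for i, v in enumerate(pred[0])}
print(scores)  # e.g. {'arousal': 0.3670, 'dominance': 0.4553, 'valence': 0.4240}
```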
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "architectures": [
+     "SERModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "pipeline_utils.SERConfig",
+     "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+   },
+   "id2label": {
+     "0": "arousal",
+     "1": "dominance",
+     "2": "valence"
+   },
+   "sampling_rate": 16000,
+   "maxlen": 192000,
+   "mean": -8.278621631819787e-05,
+   "std": 0.08485510250851999,
+   "classifier_dropout_prob": 0.5,
+   "classifier_hidden_layers": 1,
+   "hidden_size": 1024,
+   "model_type": "ser",
+   "num_attention_heads": 16,
+   "num_classes": 3,
+   "num_hidden_layers": 24,
+   "ssl_type": "microsoft/wavlm-large",
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.0.dev0"
+ }
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:557ba9b4aa8461a60bc7f5c5bd2e34b4de34d4c8ccfa684c438b6cbdc1893c9d
+ size 1274490516
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pipeline_utils.py ADDED
@@ -0,0 +1,167 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModel
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+
+
+ class Pooling(nn.Module):
+     def __init__(self):
+         super().__init__()
+     def compute_length_from_mask(self, mask):
+         """
+         mask: (batch_size, T)
+         Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+         """
+         wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
+         feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
+         feat_lens = feat_lens.int().tolist()
+         return feat_lens
+
+     def forward(self, x, mask):
+         raise NotImplementedError
+
+ class MeanPooling(Pooling):
+     def __init__(self):
+         super().__init__()
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
+             pooled_list.append(pooled)
+         pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
+         return pooled
+
+
+ class AttentiveStatisticsPooling(Pooling):
+     """
+     AttentiveStatisticsPooling
+     Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+     Link: https://arxiv.org/pdf/1803.10963.pdf
+     """
+     def __init__(self, input_size):
+         super().__init__()
+         self._indim = input_size
+         self.sap_linear = nn.Linear(input_size, input_size)
+         self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+         torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim*2)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             x = x[:feat_len].unsqueeze(0)
+             h = torch.tanh(self.sap_linear(x))
+             w = torch.matmul(h, self.attention).squeeze(dim=2)
+             w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+             mu = torch.sum(x * w, dim=1)
+             rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
+             x = torch.cat((mu, rh), 1).squeeze(0)
+             pooled_list.append(x)
+         return torch.stack(pooled_list)
+
+
+
+
+ class EmotionRegression(nn.Module):
+     def __init__(self, *args, **kwargs):
+         super(EmotionRegression, self).__init__()
+         input_dim = args[0]
+         hidden_dim = args[1]
+         num_layers = args[2]
+         output_dim = args[3]
+         p = kwargs.get("dropout", 0.5)
+
+         self.fc = nn.ModuleList([
+             nn.Sequential(
+                 nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+             )
+         ])
+         for lidx in range(num_layers - 1):
+             self.fc.append(
+                 nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                 )
+             )
+         self.out = nn.Sequential(
+             nn.Linear(hidden_dim, output_dim)
+         )
+
+         self.inp_drop = nn.Dropout(p)
+     def get_repr(self, x):
+         h = self.inp_drop(x)
+         for lidx, fc in enumerate(self.fc):
+             h = fc(h)
+         return h
+
+     def forward(self, x):
+         h = self.get_repr(x)
+         result = self.out(h)
+         return result
+
+ class SERConfig(PretrainedConfig):
+     model_type = "ser"
+
+     def __init__(
+         self,
+         num_classes: int = 3,
+         num_attention_heads=16,
+         num_hidden_layers=24,
+         hidden_size=1024,
+         classifier_hidden_layers=1,
+         classifier_dropout_prob=0.5,
+         ssl_type="microsoft/wavlm-large",
+         torch_dtype="float32",
+         **kwargs,
+     ):
+         self.num_classes = num_classes
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.classifier_hidden_layers = classifier_hidden_layers
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.ssl_type = ssl_type
+         self.torch_dtype = torch_dtype
+         super().__init__(**kwargs)
+
+ class SERModel(PreTrainedModel):
+     config_class = SERConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+         self.ssl_model.freeze_feature_encoder()
+
+         self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+         self.ser_model = EmotionRegression(config.hidden_size * 2,
+                                            config.hidden_size,
+                                            config.classifier_hidden_layers,
+                                            config.num_classes,
+                                            dropout=config.classifier_dropout_prob)
+
+
+     def forward(self, x, mask):
+         ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+         ssl = self.pool_model(ssl, mask)
+
+         pred = self.ser_model(ssl)
+
+         return pred
+
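This commit also ships a dynamic-axis ONNX graph at SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ser_dyn.onnx. A hedged sketch of how such an export could be produced from a loaded `SERModel`; the input names, dynamic axes, and opset here are my assumptions, not the repo's actual export script:

```python
import torch

model.eval()  # export with dropout disabled
dummy_wav = torch.randn(1, 16000)
dummy_mask = torch.ones(1, 16000)
torch.onnx.export(
    model, (dummy_wav, dummy_mask), "ser_dyn.onnx",
    input_names=["wav", "mask"], output_names=["pred"],
    dynamic_axes={"wav": {0: "batch", 1: "time"},
                  "mask": {0: "batch", 1: "time"},
                  "pred": {0: "batch"}},
    opset_version=17,
)
```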
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "mean": 10
+ }
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Multi-Attributes/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c34b4fd571efce7b4530a7539f1928213d535f6be19b2324bceca0c08c3e601
+ size 1274593809
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - valence
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model was trained on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) for the Odyssey 2024 Emotion Recognition competition baseline<br>
+ This particular model is the single-task specialized valence model, which predicts valence in a range of approximately 0...1.
+
+
+
+ # Benchmarks
+ CCC on the Test 3 and Development sets of the Odyssey competition
+ <table style="width:500px">
+ <tr><th colspan=2 align="center"> Single-Task Setup </th></tr>
+ <tr><th colspan=1 align="center">Test 3</th><th colspan=1 align="center">Development</th></tr>
+ <tr> <td align="center">Val</td> <td align="center">Val</td> </tr>
+ <tr> <td align="center"> 0.607</td> <td align="center" >0.709 </td> </tr>
+ </table>
+
+
+
+ For more details: [demo](https://huggingface.co/spaces/3loi/WavLM-SER-Multi-Baseline-Odyssey2024), [paper](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Goncalves_2024.pdf), and [GitHub](https://github.com/MSP-UTD/MSP-Podcast_Challenge/tree/main).
+
+
+ ```
+ @InProceedings{Goncalves_2024,
+   author={L. Goncalves and A. N. Salman and A. {Reddy Naini} and L. Moro-Velazquez and T. Thebaud and L. {Paola Garcia} and N. Dehak and B. Sisman and C. Busso},
+   title={Odyssey 2024 - Speech Emotion Recognition Challenge: Dataset, Baseline Framework, and Results},
+   booktitle={Odyssey 2024: The Speaker and Language Recognition Workshop},
+   volume={To appear},
+   year={2024},
+   month={June},
+   address = {Quebec, Canada},
+ }
+ ```
+
+
+ # Usage
+ ```python
+ from transformers import AutoModelForAudioClassification
+ import librosa, torch
+
+ # load the model
+ model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Valence", trust_remote_code=True)
+
+ # get mean/std
+ mean = model.config.mean
+ std = model.config.std
+
+
+ # load an audio file
+ audio_path = "/path/to/audio.wav"
+ raw_wav, _ = librosa.load(audio_path, sr=model.config.sampling_rate)
+
+ # normalize the audio by mean/std
+ norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+ # generate the mask
+ mask = torch.ones(1, len(norm_wav))
+
+ # batch it (add dim)
+ wavs = torch.tensor(norm_wav).unsqueeze(0)
+
+
+ # predict
+ with torch.no_grad():
+     pred = model(wavs, mask)
+
+ print(model.config.id2label)
+ print(pred)
+ # {0: 'valence'}
+ # tensor([[0.3670]])
+ ```
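+
+ The config also exposes `maxlen` (192000 samples, i.e. 12 s at 16 kHz), presumably the maximum clip length used in training. A minimal sketch (my addition, not part of the official usage) of truncating longer clips before the mask is built:
+
+ ```python
+ # assumption: clip anything longer than maxlen before creating the mask
+ if len(norm_wav) > model.config.maxlen:
+     norm_wav = norm_wav[:model.config.maxlen]
+ ```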
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "SERModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "pipeline_utils.SERConfig",
+     "AutoModelForAudioClassification": "pipeline_utils.SERModel"
+   },
+   "id2label": {
+     "0": "valence"
+   },
+   "sampling_rate": 16000,
+   "maxlen": 192000,
+   "mean": -8.278621631819787e-05,
+   "std": 0.08485510250851999,
+   "classifier_dropout_prob": 0.5,
+   "classifier_hidden_layers": 1,
+   "hidden_size": 1024,
+   "model_type": "ser",
+   "num_attention_heads": 16,
+   "num_classes": 1,
+   "num_hidden_layers": 24,
+   "ssl_type": "microsoft/wavlm-large",
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.0.dev0"
+ }
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44449ad4b46e4af5168f29b25055ca67c28ffd44829d11020782c43712bbc8b3
+ size 1274482316
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pipeline_utils.py ADDED
@@ -0,0 +1,165 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModel
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+
+
+ class Pooling(nn.Module):
+     def __init__(self):
+         super().__init__()
+     def compute_length_from_mask(self, mask):
+         """
+         mask: (batch_size, T)
+         Assuming that the sampling rate is 16kHz, the frame shift is 20ms
+         """
+         wav_lens = torch.sum(mask, dim=1) # (batch_size, )
+         feat_lens = torch.div(wav_lens-1, 16000*0.02, rounding_mode="floor") + 1
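+         # e.g. 20 ms at 16 kHz = 320 samples per frame, so a 1 s clip
+         # (16000 samples) gives (16000 - 1) // 320 + 1 = 50 frames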
+         feat_lens = feat_lens.int().tolist()
+         return feat_lens
+
+     def forward(self, x, mask):
+         raise NotImplementedError
+
+ class MeanPooling(Pooling):
+     def __init__(self):
+         super().__init__()
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             pooled = torch.mean(x[:feat_len], dim=0) # (feat_dim, )
+             pooled_list.append(pooled)
+         pooled = torch.stack(pooled_list, dim=0) # (batch_size, feat_dim)
+         return pooled
+
+
+ class AttentiveStatisticsPooling(Pooling):
+     """
+     AttentiveStatisticsPooling
+     Paper: Attentive Statistics Pooling for Deep Speaker Embedding
+     Link: https://arxiv.org/pdf/1803.10963.pdf
+     """
+     def __init__(self, input_size):
+         super().__init__()
+         self._indim = input_size
+         self.sap_linear = nn.Linear(input_size, input_size)
+         self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
+         torch.nn.init.normal_(self.attention, mean=0, std=1)
+
+     def forward(self, xs, mask):
+         """
+         xs: (batch_size, T, feat_dim)
+         mask: (batch_size, T)
+
+         => output: (batch_size, feat_dim*2)
+         """
+         feat_lens = self.compute_length_from_mask(mask)
+         pooled_list = []
+         for x, feat_len in zip(xs, feat_lens):
+             x = x[:feat_len].unsqueeze(0)
+             h = torch.tanh(self.sap_linear(x))
+             w = torch.matmul(h, self.attention).squeeze(dim=2)
+             w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
+             mu = torch.sum(x * w, dim=1)
+             rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
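+             # mu / rh are the attention-weighted mean and std of the frame
+             # features; the clamp guards the sqrt against negative rounding error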
+             x = torch.cat((mu, rh), 1).squeeze(0)
+             pooled_list.append(x)
+         return torch.stack(pooled_list)
+
+
+
+
+ class EmotionRegression(nn.Module):
+     def __init__(self, *args, **kwargs):
+         super(EmotionRegression, self).__init__()
+         input_dim = args[0]
+         hidden_dim = args[1]
+         num_layers = args[2]
+         output_dim = args[3]
+         p = kwargs.get("dropout", 0.5)
+
+         self.fc = nn.ModuleList([
+             nn.Sequential(
+                 nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+             )
+         ])
+         for lidx in range(num_layers-1):
+             self.fc.append(
+                 nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
+                 )
+             )
+         self.out = nn.Sequential(
+             nn.Linear(hidden_dim, output_dim)
+         )
+
+         self.inp_drop = nn.Dropout(p)
+     def get_repr(self, x):
+         h = self.inp_drop(x)
+         for lidx, fc in enumerate(self.fc):
+             h = fc(h)
+         return h
+
+     def forward(self, x):
+         h = self.get_repr(x)
+         result = self.out(h)
+         return result
+
+ class SERConfig(PretrainedConfig):
+     model_type = "ser"
+
+     def __init__(
+         self,
+         num_classes: int = 1,
+         num_attention_heads = 16,
+         num_hidden_layers = 24,
+         hidden_size = 1024,
+         classifier_hidden_layers = 1,
+         classifier_dropout_prob = 0.5,
+         ssl_type = "microsoft/wavlm-large",
+         torch_dtype = "float32",
+         **kwargs,
+     ):
+         self.num_classes = num_classes
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.hidden_size = hidden_size
+         self.classifier_hidden_layers = classifier_hidden_layers
+         self.classifier_dropout_prob = classifier_dropout_prob
+         self.ssl_type = ssl_type
+         self.torch_dtype = torch_dtype
+         super().__init__(**kwargs)
+
+ class SERModel(PreTrainedModel):
+     config_class = SERConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
+         self.ssl_model.freeze_feature_encoder()
+
+         self.pool_model = AttentiveStatisticsPooling(config.hidden_size)
+
+         self.ser_model = EmotionRegression(config.hidden_size*2,
+                                            config.hidden_size,
+                                            config.classifier_hidden_layers,
+                                            config.num_classes,
+                                            dropout=config.classifier_dropout_prob)
+
+
+     def forward(self, x, mask):
+         ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state
+
+         ssl = self.pool_model(ssl, mask)
+
+         pred = self.ser_model(ssl)
+
+         return pred
+
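+ # A minimal local-usage sketch (an assumption for illustration, not part of the
+ # original file):
+ #     cfg = SERConfig(num_classes=1)
+ #     model = SERModel(cfg)                # downloads microsoft/wavlm-large
+ #     wav = torch.randn(1, 16000)          # 1 s of (already normalized) audio
+ #     mask = torch.ones(1, 16000)
+ #     pred = model(wav, mask)              # -> tensor of shape (1, 1): valence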
SER-Odyssey/SER-Odyssey-Baseline-WavLM-Valence/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ca8929ea564b56819ed846e96b1a472df11fa39d63f540108bab62c84b269b8
+ size 1274585617
SER-Odyssey/SER-WavLM-Multi-Attributes/.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tensorrt/trt10_ser_fp16.plan filter=lfs diff=lfs merge=lfs -text
+ tensorrt/trt8_ser_dyn_fp16.plan filter=lfs diff=lfs merge=lfs -text
SER-Odyssey/SER-WavLM-Multi-Attributes/README.md ADDED
@@ -0,0 +1,228 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: audio-classification
+ tags:
+ - pytorch
+ - wavlm
+ - msp-podcast
+ - emotion-recognition
+ - audio
+ - speech
+ - valence
+ - arousal
+ - dominance
+ - lucas
+ - speech-emotion-recognition
+ ---
+ The model is a recreation of [3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) for direct use in PyTorch, with an explicit class definition and forward method. It was recreated in the hope of allowing greater flexibility in controlling, training, and fine-tuning the model. The model was trained on the same [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) dataset as the original, but on a smaller subset that is evenly distributed across gender and emotion category, in the hope that this would improve the accuracy of the valence and arousal predictions.
+ This is therefore a multi-attribute model that predicts arousal, dominance, and valence. Unlike the original model, however, it keeps the dataset's original attribute score range of 0...7. Evaluations will be provided later on; for now, this repo exists so that others can test the model and judge its inference accuracy themselves, retrain it from scratch, modify it, etc. My best trained weights as of now are provided in this repo. The class definition for the model can be found in my [github](https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction#).
+
+ # Get class definition
+ ```
+ git clone https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction.git
+ ```
+
+ # Usage
+ ## Inference Testing
+ ```python
+ import torch
+ import torchaudio
+ from SER_Model_setup import SERModel
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ checkpoint_path = "<model.pt file>"
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+
+ # Create the model architecture and load weights
+ model = SERModel()
+ model.load_state_dict(checkpoint['model_state_dict'])
+ model.to(device)
+ model.eval()
+
+ audio_path = "<wav file>"
+ audio, sr = torchaudio.load(audio_path)
+
+ if sr != model.sample_rate:
+     resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
+     audio = resampler(audio)
+
+ if audio.shape[0] > 1:
+     audio = torch.mean(audio, dim=0, keepdim=True)
+
+ audio_len = audio.shape[-1]
+
+ # Create waveform tensor (shape: [1, audio_len])
+ waveform = torch.zeros(1, audio_len, dtype=torch.float32)
+ waveform[0, :audio_len] = audio
+
+ # Create mask as 2D tensor: shape [1, audio_len] with ones in valid region
+ mask = torch.ones(1, audio_len, dtype=torch.float32)
+
+ # Move waveform and mask to device
+ waveform = waveform.to(device)
+ mask = mask.to(device)
+
+ # Normalize waveform using model's mean and std
+ mean = model.mean.to(device)
+ std = model.std.to(device)
+ waveform = (waveform - mean) / (std + 1e-6)
+
+ with torch.no_grad():
+     predictions = model(waveform, mask)  # predictions shape: [1, 3]
+
+ # Extract predictions: [0,0] for arousal, [0,1] for valence, [0,2] for dominance
+ arousal = predictions[0, 0].item()
+ valence = predictions[0, 1].item()
+ dominance = predictions[0, 2].item()
+
+ print(f"Arousal: {arousal:.3f}")
+ print(f"Valence: {valence:.3f}")
+ print(f"Dominance: {dominance:.3f}")
+ ```
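+
+ Per the note above, the raw outputs follow the dataset's 0...7 attribute scale. If you prefer the 0...1 convention of the original baseline, a simple rescale (my assumption, not something the repo provides) is:
+
+ ```python
+ # assumption: map the 0...7 attribute scale onto 0...1
+ arousal_01 = arousal / 7.0
+ valence_01 = valence / 7.0
+ dominance_01 = dominance / 7.0
+ ```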
+ ## Batch inference
+ ```python
+ import os
+ import glob
+ import torch
+ import torchaudio
+ from SER_Model_setup import SERModel  # Adjust if your model code is elsewhere
+
+ def load_model_from_checkpoint(checkpoint_path, device='cpu'):
+     """
+     Loads the SERModel and weights from a checkpoint, moves to device, sets eval mode.
+     """
+     checkpoint = torch.load(checkpoint_path, map_location=device)
+
+     # Create the model architecture
+     model = SERModel()
+     model.load_state_dict(checkpoint['model_state_dict'])
+
+     model.to(device)
+     model.eval()
+     return model
+
+ def batch_inference(model, file_paths, device='cpu', normalize=True):
+     """
+     Perform true batch inference on multiple .wav files in one forward pass.
+
+     Args:
+         model (SERModel): The loaded SER model in eval mode
+         file_paths (list[str]): List of paths to .wav files
+         device (str or torch.device): 'cpu' or 'cuda'
+         normalize (bool): Whether to normalize waveforms (subtract mean, divide std)
+
+     Returns:
+         dict: {filename: {"arousal": float, "valence": float, "dominance": float}}
+     """
+
+     # ----------------------------------------
+     # 1) Load & store all waveforms in memory
+     # ----------------------------------------
+     waveforms_list = []
+     lengths = []
+     for fp in file_paths:
+         # Load audio
+         audio, sr = torchaudio.load(fp)
+
+         # Resample if needed
+         if sr != model.sample_rate:
+             resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
+             audio = resampler(audio)
+
+         # Convert stereo -> mono if needed
+         if audio.shape[0] > 1:
+             audio = torch.mean(audio, dim=0, keepdim=True)
+
+         # audio shape => [1, num_samples]
+         lengths.append(audio.shape[-1])
+         waveforms_list.append(audio)
+
+     # ----------------------------------------
+     # 2) Determine max length
+     # ----------------------------------------
+     max_len = max(lengths)
+
+     # ----------------------------------------
+     # 3) Pad each waveform to max length & build masks
+     # ----------------------------------------
+     # Note: the model expects [batch_size, num_samples] (as in the single-file
+     # example above), so the batch tensor is 2D, not [batch_size, 1, max_len]
+     batch_size = len(waveforms_list)
+     batched_waveforms = torch.zeros(batch_size, max_len, dtype=torch.float32)
+     masks = torch.zeros(batch_size, max_len, dtype=torch.float32)
+
+     for i, audio in enumerate(waveforms_list):
+         cur_len = audio.shape[-1]
+         batched_waveforms[i, :cur_len] = audio[0]
+         masks[i, :cur_len] = 1.0  # valid portion
+
+     # ----------------------------------------
+     # 4) Move batched data to device BEFORE normalization
+     # ----------------------------------------
+     batched_waveforms = batched_waveforms.to(device)
+     masks = masks.to(device)
+
+     # ----------------------------------------
+     # 5) Normalize if needed (model.mean, model.std)
+     # ----------------------------------------
+     if normalize:
+         # model.mean and model.std are buffers; ensure they're on the correct device
+         mean = model.mean.to(device)
+         std = model.std.to(device)
+         batched_waveforms = (batched_waveforms - mean) / (std + 1e-6)
+
+     # ----------------------------------------
+     # 6) Single forward pass
+     # ----------------------------------------
+     with torch.no_grad():
+         predictions = model(batched_waveforms, masks)
+         # predictions shape => [batch_size, 3]
+
+     # ----------------------------------------
+     # 7) Build result dict
+     # ----------------------------------------
+     results = {}
+     for i, fp in enumerate(file_paths):
+         arousal = predictions[i, 0].item()
+         valence = predictions[i, 1].item()
+         dominance = predictions[i, 2].item()
+         filename = os.path.basename(fp)
+         results[filename] = {
+             "arousal": arousal,
+             "valence": valence,
+             "dominance": dominance
+         }
+
+     return results
+
+ if __name__ == "__main__":
+     # -----------------------------------------
+     # Example usage
+     # -----------------------------------------
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     checkpoint_path = "<weights.pt>"
+     model = load_model_from_checkpoint(checkpoint_path, device=device)
+
+     # Suppose you have a folder of .wav files
+     wav_folder = "<directory containing .wav files>"
+     wav_paths = glob.glob(os.path.join(wav_folder, "*.wav"))
+
+     # Do a single pass of batch inference
+     all_results = batch_inference(model, wav_paths, device=device, normalize=True)
+
+     # Print results
+     for fname, preds in all_results.items():
+         print(f"{fname}: Arousal={preds['arousal']:.3f}, "
+               f"Valence={preds['valence']:.3f}, Dominance={preds['dominance']:.3f}")
+ ```
SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ReadMe ADDED
@@ -0,0 +1 @@
+ model in onnx format
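+
+ A minimal onnxruntime smoke test (a sketch: it assumes the exported graph takes the
+ same normalized-waveform and mask inputs, and the same arousal/valence/dominance
+ output ordering, as the PyTorch model; input names are read from the session
+ rather than guessed):
+
+     import numpy as np
+     import onnxruntime as ort
+
+     sess = ort.InferenceSession("ser_dyn.onnx")
+     names = [i.name for i in sess.get_inputs()]
+     wav = np.random.randn(1, 16000).astype(np.float32)   # 1 s of dummy audio
+     mask = np.ones((1, 16000), dtype=np.float32)
+     print(sess.run(None, dict(zip(names, [wav, mask])))[0])   # e.g. shape (1, 3)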
SER-Odyssey/SER-WavLM-Multi-Attributes/onnx/ser_dyn.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dad8465907c9dcfaa47628d7e8401a281396c17fe17b3c8b72071279cb6b2cac
+ size 1274295745
SER-Odyssey/SER-WavLM-Multi-Attributes/pytorch/best_weights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:809f5b3ef98835b5ca9dcf9d0efb4bd6cf0a9cc458cfb9443ae07ef71b44f670
+ size 1299851786
SER-Odyssey/SER-WavLM-Multi-Attributes/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/uyiosa/SER-WavLM-Multi-Attributes
SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/ReadMe ADDED
@@ -0,0 +1,2 @@
+ trt10 -- compiled with TensorRT version 10
+ trt8 -- compiled with TensorRT version 8
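+
+ A plan file can be smoke-tested with trtexec (assuming a TensorRT install that
+ matches the version the plan was compiled with), e.g.:
+
+     trtexec --loadEngine=trt10_ser_fp16.plan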
SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt10_ser_fp16.plan ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed792e1cd7a6e6f1d89413b5800da2b7328c40483d8532d8b0bc2e74444e0516
+ size 644044452
SER-Odyssey/SER-WavLM-Multi-Attributes/tensorrt/trt8_ser_dyn_fp16.plan ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6e157bb3cc07c0e808a93815a478874a092cd9f172abf83c1822ce9b9f1d55d
+ size 643712772
SER-Odyssey/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/3loi/models
WavLM. Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ca8836ebdf8236e610187738217d4c91c5ead13873472e476423f1561e9238e
+ size 929604
tiny-random-WavLMForAudioFrameClassification-ONNX/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
tiny-random-WavLMForAudioFrameClassification-ONNX/config.json ADDED
@@ -0,0 +1,88 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "hf-internal-testing/tiny-random-WavLMForAudioFrameClassification",
+   "activation_dropout": 0.1,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "WavLMForAudioFrameClassification"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 256,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": false,
+   "conv_dim": [
+     32,
+     32,
+     32
+   ],
+   "conv_kernel": [
+     8,
+     8,
+     8
+   ],
+   "conv_stride": [
+     4,
+     4,
+     4
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.0,
+   "final_dropout": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 16,
+   "initializer_range": 0.02,
+   "intermediate_size": 20,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "max_bucket_distance": 800,
+   "model_type": "wavlm",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 2,
+   "num_buckets": 320,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 2,
+   "num_conv_pos_embeddings": 16,
+   "num_ctc_classes": 80,
+   "num_feat_extract_layers": 3,
+   "num_hidden_layers": 4,
+   "num_negatives": 100,
+   "output_hidden_size": 16,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 256,
+   "tdnn_dilation": [
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     32,
+     32
+   ],
+   "tdnn_kernel": [
+     3,
+     3
+   ],
+   "transformers_version": "4.48.2",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32,
+   "xvector_output_dim": 32
+ }
tiny-random-WavLMForAudioFrameClassification-ONNX/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43d68f66c0eb42e09d03c533d705eabde0fd481635fdff874b9d94ae4445b550
+ size 276448