MrlolDev committed
Commit 1771007 · verified · 1 Parent(s): 013230e

Upload benchmark.py with huggingface_hub

Files changed (1)

benchmark.py +185 -177
benchmark.py CHANGED
@@ -1,177 +1,185 @@
- # benchmark.py - SER benchmarks on IEMOCAP
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import numpy as np
- import json
- import traceback
- from datasets import load_dataset
- from transformers import AutoProcessor, AutoModel
- from sklearn.metrics import f1_score, accuracy_score, recall_score
-
- EMOTIONS = ["neutral", "happy", "sad", "angry", "fear", "surprise"]
-
-
- class EmotionHead(nn.Module):
-     """MLP head mapping a pooled 1280-dim Voxtral feature to 6 emotion logits."""
-
-     def __init__(self):
-         super().__init__()
-         self.net = nn.Sequential(
-             nn.Linear(1280, 512),
-             nn.BatchNorm1d(512),
-             nn.ReLU(),
-             nn.Dropout(0.3),
-             nn.Linear(512, 256),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Dropout(0.3),
-             nn.Linear(256, 6),
-         )
-
-     def forward(self, x):
-         return self.net(x)
-
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
-
- print("Loading models...")
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- voxtral = (
-     AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, dtype=torch.bfloat16)
-     .to(device)
-     .eval()
- )
-
- emotion_model = EmotionHead().to(device)
- emotion_model.load_state_dict(
-     torch.load("emotion_head_best.pt", map_location=device, weights_only=False)
- )
- emotion_model.eval()
-
-
- def extract_and_predict(audio_array, sr, idx):
-     """Encode one clip with the frozen Voxtral audio tower and classify it."""
-     print(f"  [{idx}] calling processor...", flush=True)
-     inputs = processor(audio_array, return_tensors="pt")
-     print(f"  [{idx}] processor done, moving features to device...", flush=True)
-     feats = inputs["input_features"].to(device=device, dtype=torch.bfloat16)
-     print(f"  [{idx}] audio_tower...", flush=True)
-     with torch.no_grad():
-         # Mean-pool encoder hidden states over time -> (1, 1280)
-         hidden = voxtral.audio_tower(feats).last_hidden_state.mean(1).float()
-     print(f"  [{idx}] predicting...", flush=True)
-     probs = F.softmax(emotion_model(hidden), dim=1).squeeze(0)
-     return EMOTIONS[probs.argmax().item()]
-
-
- def compute_metrics(true_labels, pred_labels, classes):
-     if not true_labels:
-         return {"UA": 0, "WA": 0, "F1": 0, "WF1": 0}
-     # UA (unweighted accuracy) = macro-averaged recall over classes
-     ua = (
-         recall_score(
-             true_labels, pred_labels, average="macro", labels=classes, zero_division=0
-         )
-         * 100
-     )
-     # WA (weighted accuracy) = plain accuracy
-     wa = accuracy_score(true_labels, pred_labels) * 100
-     f1 = (
-         f1_score(
-             true_labels, pred_labels, average="macro", labels=classes, zero_division=0
-         )
-         * 100
-     )
-     wf1 = (
-         f1_score(
-             true_labels,
-             pred_labels,
-             average="weighted",
-             labels=classes,
-             zero_division=0,
-         )
-         * 100
-     )
-     return {
-         "UA": round(ua, 1),
-         "WA": round(wa, 1),
-         "F1": round(f1, 1),
-         "WF1": round(wf1, 1),
-     }
-
-
- # IEMOCAP 4-class map
- IEMOCAP_MAP = {
-     "hap": "happy",
-     "exc": "happy",
-     "sad": "sad",
-     "ang": "angry",
-     "neu": "neutral",
- }
-
- print("\n=== IEMOCAP ===")
- ds = load_dataset("AudioLLMs/iemocap_emotion_recognition", trust_remote_code=True)
- iemocap = ds["test"]
- print(f"Total samples: {len(iemocap)}")
-
- preds, trues = [], []
-
- for i, sample in enumerate(iemocap):
-     try:
-         # Get label from answer or instruction
-         answer = sample.get("answer", "").lower()
-         label = sample.get("label", "")
-
-         # Map label to one of the 4 IEMOCAP classes
-         if not label:
-             if "happy" in answer or "excited" in answer:
-                 mapped = "happy"
-             elif "sad" in answer:
-                 mapped = "sad"
-             elif "angry" in answer:
-                 mapped = "angry"
-             elif "neutral" in answer:
-                 mapped = "neutral"
-             else:
-                 continue
-         else:
-             mapped = IEMOCAP_MAP.get(str(label).lower())
-             if mapped is None:
-                 continue
-
-         # Get audio from context
-         context = sample.get("context", {})
-         if not context:
-             print(f"  error at {i}: no context")
-             continue
-         audio_array = context.get("array")
-         if audio_array is None:
-             print(f"  error at {i}: no array in context keys {list(context.keys())}")
-             continue
-
-         audio_array = np.array(audio_array, dtype=np.float32)
-         sr = context.get("sampling_rate", 16000)
-
-         pred = extract_and_predict(audio_array, sr, i)
-         preds.append(pred)
-         trues.append(mapped)
-
-         if i % 50 == 0:
-             print(f"  Processed {i}...")
-     except Exception as e:
-         print(f"  error at {i}: {e}")
-         traceback.print_exc()
-
- print(f"Processed: {len(preds)}/{len(iemocap)}")
-
- results = compute_metrics(trues, preds, ["neutral", "happy", "sad", "angry"])
- print(
-     f"  n={len(preds)} | UA={results['UA']} WA={results['WA']} F1={results['F1']} WF1={results['WF1']}"
- )
-
- print("\n=== Results ===")
- print(
-     f"UA: {results['UA']}, WA: {results['WA']}, F1: {results['F1']}, WF1: {results['WF1']}"
- )
-
- with open("benchmark_results.json", "w") as f:
-     json.dump(results, f, indent=2)
- print("\nSaved benchmark_results.json")
+ ---
+ license: apache-2.0
+ tags:
+ - audio
+ - speech
+ - emotion-recognition
+ - voxtral
+ - mistralai
+ datasets:
+ - MrlolDev/voxtral-emotion-speech
+ base_model: mistralai/Voxtral-Mini-4B-Realtime-2602
+ ---
+
+ # Voxtral Emotion Speech - Training Pipeline
+
+ **Dataset**: [MrlolDev/voxtral-emotion-speech](https://huggingface.co/datasets/MrlolDev/voxtral-emotion-speech)
+
+ **Model**: [MrlolDev/voxtral-emotion-speech](https://huggingface.co/MrlolDev/voxtral-emotion-speech)
+
+ ## What We Did
+
+ 1. Loaded audio from the dataset
+ 2. Extracted 1280-dim features from Voxtral encoder hidden states using mean pooling (see the sketch after this list)
+ 3. Trained a classification head (MLP: 1280 → 512 → 256 → 6) with class weights for imbalance
+ 4. Benchmarked against SenseVoice on RAVDESS emotion recognition
+ 5. Verified encoder freezing doesn't affect transcription WER on LibriSpeech
+
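Step 2 is the heart of the pipeline. Below is a minimal sketch of that extraction, reusing the `audio_tower` + mean-pooling path from the old `benchmark.py` in this diff; the model ID, processor call, and pooling axis come from that script, while the function name and wrapper are illustrative.

```python
import torch
from transformers import AutoProcessor, AutoModel

MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
encoder = (
    AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, dtype=torch.bfloat16)
    .to(device)
    .eval()
)

def extract_features(audio_array):
    # Featurize raw audio, run only the (frozen) audio encoder, and
    # mean-pool the hidden states over time -> one 1280-dim vector.
    inputs = processor(audio_array, return_tensors="pt")
    feats = inputs["input_features"].to(device=device, dtype=torch.bfloat16)
    with torch.no_grad():
        hidden = encoder.audio_tower(feats).last_hidden_state  # (1, T, 1280)
    return hidden.mean(dim=1).float().squeeze(0)  # (1280,)
```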
+ ## Emotions
+
+ - neutral
+ - happy
+ - sad
+ - angry
+ - fear
+ - surprise
+
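The integer labels used elsewhere (e.g. `label: int (0-5)` in `features.pkl` below) follow this list order, matching the `EMOTIONS` list in the old `benchmark.py`; a tiny helper mapping:

```python
EMOTIONS = ["neutral", "happy", "sad", "angry", "fear", "surprise"]

# int label <-> emotion name, e.g. 1 <-> "happy", 3 <-> "angry"
id2emotion = dict(enumerate(EMOTIONS))
emotion2id = {name: i for i, name in enumerate(EMOTIONS)}
```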
+ ## Scripts
+
+ ### 1. setup.sh
+
+ Installs dependencies using UV and logs into HuggingFace.
+
+ ```bash
+ bash setup.sh
+ ```
+
+ ### 2. extract_features.py
+
+ 1. Loads dataset from HuggingFace
+ 2. Loads Voxtral model (float16)
+ 3. Extracts 1280-dim features from encoder hidden states (mean pooling)
+ 4. Saves features to features.pkl
+ 5. Uploads features.pkl and README.md to model repo
+
+ ```bash
+ python extract_features.py
+ ```
+
+ Output: `features.pkl` - a list of records (see the sketch after this list) with keys:
+ - `features`: numpy array (1280,)
+ - `label`: int (0-5)
+ - `emotion`: string
+ - `split`: "train"/"validation"/"test"
+ - `sensevoice_score`: float
+
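A quick way to sanity-check the output, assuming the record layout above (the `pickle` round-trip is an illustration, not a documented interface):

```python
import pickle

with open("features.pkl", "rb") as f:
    records = pickle.load(f)

train = [r for r in records if r["split"] == "train"]
print(f"{len(records)} records total, {len(train)} in train")

r = records[0]
print(r["emotion"], r["label"], r["features"].shape)  # e.g. happy 1 (1280,)
```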
+ ### 3. train.py
+
+ 1. Loads features from features.pkl
+ 2. Splits 70/15/15 if no split in data
+ 3. Trains EmotionHead MLP:
+    - 1280 → 512 → 256 → 6
+    - BatchNorm + ReLU + Dropout(0.3)
+ 4. Uses class weights for imbalance
+ 5. Trains 150 epochs with AdamW + ReduceLROnPlateau (a condensed sketch follows the outputs list below)
+ 6. Saves best model by validation accuracy
+ 7. Uploads model weights and plots to model repo
+
+ ```bash
+ python train.py
+ ```
+
+ Outputs:
+ - `emotion_head_best.pt` - Best model weights
+ - `confusion_matrix.png` - Test confusion matrix
+ - `training_curve.png` - Loss curves
+
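A condensed sketch of the loop that steps 3-6 describe. Only the architecture, class weights, 150 epochs, AdamW, ReduceLROnPlateau, and best-by-validation-accuracy checkpointing come from the list above; `X_train`/`y_train`/`X_val`/`y_val` are hypothetical feature/label tensors, and the learning rate and patience are guesses.

```python
import torch
import torch.nn as nn

# Inverse-frequency class weights to counter label imbalance (step 4).
counts = torch.bincount(y_train, minlength=6).float()
weights = counts.sum() / (6 * counts.clamp(min=1))
criterion = nn.CrossEntropyLoss(weight=weights)

model = EmotionHead()  # same MLP as in the old benchmark.py above
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", patience=10)

best_acc = 0.0
for epoch in range(150):
    model.train()
    opt.zero_grad()
    loss = criterion(model(X_train), y_train)  # full-batch for brevity
    loss.backward()
    opt.step()

    model.eval()
    with torch.no_grad():
        acc = (model(X_val).argmax(1) == y_val).float().mean().item()
    sched.step(acc)  # plateau scheduler driven by validation accuracy
    if acc > best_acc:  # keep only the best checkpoint (step 6)
        best_acc = acc
        torch.save(model.state_dict(), "emotion_head_best.pt")
```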
+ ### 4. benchmark.py
+
+ Benchmarks the trained model:
+
+ **Bench 1: Emotion F1 vs SenseVoice**
+ - Uses RAVDESS test set
+ - Maps 8 RAVDESS emotions to 6 classes
+ - Compares against SenseVoice baseline
+
+ **Bench 2: Transcription WER** (WER sketch after this section)
+ - Uses LibriSpeech test-clean (100 samples)
+ - Verifies encoder freezing doesn't affect decoder
+
+ ```bash
+ python benchmark.py
+ ```
+
+ Output: `benchmark_results.json`
+
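For Bench 2, the WER comparison can be computed with a library such as `jiwer` (our assumption for illustration; the script's actual WER implementation isn't shown here):

```python
import jiwer

references = ["the quick brown fox", "hello world"]  # ground-truth transcripts
hypotheses = ["the quick brown fox", "hello word"]   # model transcriptions

# Corpus-level word error rate; an unchanged WER indicates that freezing
# the encoder did not degrade the decoder's transcriptions.
print(jiwer.wer(references, hypotheses))
```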
+ ---
+
+ ## Benchmark Results
+
+ ### How the Benchmark is Done
+
+ 1. **Load IEMOCAP test set** from [AudioLLMs/iemocap_emotion_recognition](https://huggingface.co/datasets/AudioLLMs/iemocap_emotion_recognition)
+ 2. **For each audio sample:**
+    - Extract 1280-dim features from Voxtral encoder using `audio_tower()`
+    - Mean pool over time dimension → (1280,)
+    - Pass through trained MLP classifier
+    - Get softmax probabilities
+    - Take argmax for prediction
+ 3. **Map predictions to 4 classes** (neutral, happy, sad, angry), excluding other emotions
+ 4. **Compute metrics** (sketched below):
+    - UA = Unweighted Accuracy (macro recall)
+    - WA = Weighted Accuracy (overall accuracy)
+    - F1 = macro F1
+    - WF1 = weighted F1
+
+ This matches the evaluation methodology from the [SenseVoice paper](https://arxiv.org/abs/2407.04051), Table 4.
+
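The four metrics map directly onto scikit-learn calls, exactly as `compute_metrics` in the old `benchmark.py` does; a minimal self-contained example:

```python
from sklearn.metrics import accuracy_score, f1_score, recall_score

classes = ["neutral", "happy", "sad", "angry"]
trues = ["happy", "sad", "neutral", "angry", "happy"]  # toy labels
preds = ["happy", "sad", "neutral", "happy", "sad"]

ua = recall_score(trues, preds, average="macro", labels=classes, zero_division=0)
wa = accuracy_score(trues, preds)
f1 = f1_score(trues, preds, average="macro", labels=classes, zero_division=0)
wf1 = f1_score(trues, preds, average="weighted", labels=classes, zero_division=0)
print(f"UA={ua:.1%}  WA={wa:.1%}  F1={f1:.1%}  WF1={wf1:.1%}")
```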
+ ### Training Curve (Synthetic Data Scaling)
+
+ | # Training Clips | UA% | WA% | F1% | WF1% |
+ |------------------|-----|-----|-----|------|
+ | 500 (11Labs synthetic) | 16.3 | 25.4 | 14.2 | 21.9 |
+
+ ### Final Benchmark Table
+
+ | Model | UA% | WA% | F1% | WF1% | Trained on IEMOCAP? |
+ |-------|-----|-----|-----|------|---------------------|
+ | **Ours (Voxtral encoder + MLP)** | 16.3 | 25.4 | 14.2 | 21.9 | ❌ 500 synthetic clips (11Labs) |
+ | [SenseVoice-S](https://huggingface.co/FunAudioLLM/SenseVoiceSmall) | 70.5 | 65.7 | 67.9 | 67.8 | ❌ zero-shot |
+ | [emotion2vec+ large](https://huggingface.co/emotion2vec/emotion2vec_plus_large) | ~80 | ~80 | - | - | ✅ IEMOCAP + more |
+
+ > **Note**: We processed 477/1004 IEMOCAP test samples (the 4-class subset: neutral, happy, sad, angry). The model was trained only on 500 synthetic ElevenLabs clips, so the low score is expected. Models marked ✅ were fine-tuned directly on IEMOCAP training data.
+
+ ## Running on RunPod
+
+ ### Pod Setup
+
+ - GPU: RTX 4090 (~$0.48/hr)
+ - Template: RunPod PyTorch 2.1
+ - Container Disk: 30GB
+
+ ### Execution Order
+
+ ```bash
+ # 1. Setup
+ bash setup.sh
+
+ # 2. Extract features (~20 min)
+ python extract_features.py
+
+ # 3. Train (~10 min)
+ python train.py
+
+ # 4. Benchmark (~20 min)
+ python benchmark.py
+
+ # 5. Download results
+ tar -czf results.tar.gz emotion_head_best.pt features.pkl \
+     confusion_matrix.png training_curve.png benchmark_results.json
+ ```
+
+ Then download `results.tar.gz` from the RunPod Files tab.
+
+ ## Model Architecture
+
+ ```
+ Voxtral Encoder (frozen)
+         ↓
+ Mean Pooling (1280 dims)
+         ↓
+ EmotionHead MLP
+   - Linear(1280, 512) + BatchNorm + ReLU + Dropout(0.3)
+   - Linear(512, 256) + BatchNorm + ReLU + Dropout(0.3)
+   - Linear(256, 6)
+ ```