MrlolDev committed
Commit 1771007 · verified · 1 Parent(s): 013230e

Upload benchmark.py with huggingface_hub

Files changed (1)

benchmark.py +185 -177
benchmark.py CHANGED
@@ -1,177 +1,185 @@
- # benchmark.py - SER benchmarks on IEMOCAP
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import numpy as np
- import json
- import traceback
- from datasets import load_dataset
- from transformers import AutoProcessor, AutoModel
- from sklearn.metrics import f1_score, accuracy_score, recall_score
-
- EMOTIONS = ["neutral", "happy", "sad", "angry", "fear", "surprise"]
-
-
- class EmotionHead(nn.Module):
-     """MLP head mapping a pooled 1280-dim Voxtral feature to 6 emotion logits."""
-
-     def __init__(self):
-         super().__init__()
-         self.net = nn.Sequential(
-             nn.Linear(1280, 512),
-             nn.BatchNorm1d(512),
-             nn.ReLU(),
-             nn.Dropout(0.3),
-             nn.Linear(512, 256),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Dropout(0.3),
-             nn.Linear(256, 6),
-         )
-
-     def forward(self, x):
-         return self.net(x)
-
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
-
- print("Loading models...")
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- voxtral = (
-     AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, dtype=torch.bfloat16)
-     .to(device)
-     .eval()
- )
-
- emotion_model = EmotionHead().to(device)
- emotion_model.load_state_dict(
-     torch.load("emotion_head_best.pt", map_location=device, weights_only=False)
- )
- emotion_model.eval()
-
-
- def extract_and_predict(audio_array, sr, idx):
-     """Encode one clip with the frozen Voxtral audio tower and classify it."""
-     print(f"  [{idx}] calling processor...", flush=True)
-     inputs = processor(audio_array, return_tensors="pt")
-     print(f"  [{idx}] processor done, moving features to device...", flush=True)
-     feats = inputs["input_features"].to(device=device, dtype=torch.bfloat16)
-     print(f"  [{idx}] audio_tower...", flush=True)
-     with torch.no_grad():
-         # Mean-pool encoder hidden states over time -> (1, 1280)
-         hidden = voxtral.audio_tower(feats).last_hidden_state.mean(1).float()
-     print(f"  [{idx}] predicting...", flush=True)
-     probs = F.softmax(emotion_model(hidden), dim=1).squeeze(0)
-     return EMOTIONS[probs.argmax().item()]
-
-
- def compute_metrics(true_labels, pred_labels, classes):
-     if not true_labels:
-         return {"UA": 0, "WA": 0, "F1": 0, "WF1": 0}
-     # UA (unweighted accuracy) = macro-averaged recall over classes
-     ua = (
-         recall_score(
-             true_labels, pred_labels, average="macro", labels=classes, zero_division=0
-         )
-         * 100
-     )
-     # WA (weighted accuracy) = plain accuracy
-     wa = accuracy_score(true_labels, pred_labels) * 100
-     f1 = (
-         f1_score(
-             true_labels, pred_labels, average="macro", labels=classes, zero_division=0
-         )
-         * 100
-     )
-     wf1 = (
-         f1_score(
-             true_labels,
-             pred_labels,
-             average="weighted",
-             labels=classes,
-             zero_division=0,
-         )
-         * 100
-     )
-     return {
-         "UA": round(ua, 1),
-         "WA": round(wa, 1),
-         "F1": round(f1, 1),
-         "WF1": round(wf1, 1),
-     }
-
-
- # IEMOCAP 4-class map
- IEMOCAP_MAP = {
-     "hap": "happy",
-     "exc": "happy",
-     "sad": "sad",
-     "ang": "angry",
-     "neu": "neutral",
- }
-
- print("\n=== IEMOCAP ===")
- ds = load_dataset("AudioLLMs/iemocap_emotion_recognition", trust_remote_code=True)
- iemocap = ds["test"]
- print(f"Total samples: {len(iemocap)}")
-
- preds, trues = [], []
-
- for i, sample in enumerate(iemocap):
-     try:
-         # Get label from answer or instruction
-         answer = sample.get("answer", "").lower()
-         label = sample.get("label", "")
-
-         # Map label to one of the 4 IEMOCAP classes
-         if not label:
-             if "happy" in answer or "excited" in answer:
-                 mapped = "happy"
-             elif "sad" in answer:
-                 mapped = "sad"
-             elif "angry" in answer:
-                 mapped = "angry"
-             elif "neutral" in answer:
-                 mapped = "neutral"
-             else:
-                 continue
-         else:
-             mapped = IEMOCAP_MAP.get(str(label).lower())
-             if mapped is None:
-                 continue
-
-         # Get audio from context
-         context = sample.get("context", {})
-         if not context:
-             print(f"  error at {i}: no context")
-             continue
-         audio_array = context.get("array")
-         if audio_array is None:
-             print(f"  error at {i}: no array in context keys {list(context.keys())}")
-             continue
-
-         audio_array = np.array(audio_array, dtype=np.float32)
-         sr = context.get("sampling_rate", 16000)
-
-         pred = extract_and_predict(audio_array, sr, i)
-         preds.append(pred)
-         trues.append(mapped)
-
-         if i % 50 == 0:
-             print(f"  Processed {i}...")
-     except Exception as e:
-         print(f"  error at {i}: {e}")
-         traceback.print_exc()
-
- print(f"Processed: {len(preds)}/{len(iemocap)}")
-
- results = compute_metrics(trues, preds, ["neutral", "happy", "sad", "angry"])
- print(
-     f"  n={len(preds)} | UA={results['UA']} WA={results['WA']} F1={results['F1']} WF1={results['WF1']}"
- )
-
- print("\n=== Results ===")
- print(
-     f"UA: {results['UA']}, WA: {results['WA']}, F1: {results['F1']}, WF1: {results['WF1']}"
- )
-
- with open("benchmark_results.json", "w") as f:
-     json.dump(results, f, indent=2)
- print("\nSaved benchmark_results.json")
+ ---
+ license: apache-2.0
+ tags:
+ - audio
+ - speech
+ - emotion-recognition
+ - voxtral
+ - mistralai
+ datasets:
+ - MrlolDev/voxtral-emotion-speech
+ base_model: mistralai/Voxtral-Mini-4B-Realtime-2602
+ ---
+
+ # Voxtral Emotion Speech - Training Pipeline
+
+ **Dataset**: [MrlolDev/voxtral-emotion-speech](https://huggingface.co/datasets/MrlolDev/voxtral-emotion-speech)
+
+ **Model**: [MrlolDev/voxtral-emotion-speech](https://huggingface.co/MrlolDev/voxtral-emotion-speech)
+
+ ## What We Did
+
+ 1. Loaded audio from the dataset
+ 2. Extracted 1280-dim features from Voxtral encoder hidden states using mean pooling (see the sketch after this list)
+ 3. Trained a classification head (MLP: 1280 → 512 → 256 → 6) with class weights for imbalance
+ 4. Benchmarked against SenseVoice on RAVDESS emotion recognition
+ 5. Verified encoder freezing doesn't affect transcription WER on LibriSpeech
+
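Step 2 is the heart of the pipeline. Below is a minimal sketch of that extraction, reusing the `audio_tower` + mean-pooling path from the old `benchmark.py` in this diff; the model ID, processor call, and pooling axis come from that script, while the function name and wrapper are illustrative.

```python
import torch
from transformers import AutoProcessor, AutoModel

MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
encoder = (
    AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, dtype=torch.bfloat16)
    .to(device)
    .eval()
)

def extract_features(audio_array):
    # Featurize raw audio, run only the (frozen) audio encoder, and
    # mean-pool the hidden states over time -> one 1280-dim vector.
    inputs = processor(audio_array, return_tensors="pt")
    feats = inputs["input_features"].to(device=device, dtype=torch.bfloat16)
    with torch.no_grad():
        hidden = encoder.audio_tower(feats).last_hidden_state  # (1, T, 1280)
    return hidden.mean(dim=1).float().squeeze(0)  # (1280,)
```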
+ ## Emotions
+
+ - neutral
+ - happy
+ - sad
+ - angry
+ - fear
+ - surprise
+
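The integer labels used elsewhere (e.g. `label: int (0-5)` in `features.pkl` below) follow this list order, matching the `EMOTIONS` list in the old `benchmark.py`; a tiny helper mapping:

```python
EMOTIONS = ["neutral", "happy", "sad", "angry", "fear", "surprise"]

# int label <-> emotion name, e.g. 1 <-> "happy", 3 <-> "angry"
id2emotion = dict(enumerate(EMOTIONS))
emotion2id = {name: i for i, name in enumerate(EMOTIONS)}
```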
+ ## Scripts
+
+ ### 1. setup.sh
+
+ Installs dependencies using UV and logs into HuggingFace.
+
+ ```bash
+ bash setup.sh
+ ```
+
+ ### 2. extract_features.py
+
+ 1. Loads dataset from HuggingFace
+ 2. Loads Voxtral model (float16)
+ 3. Extracts 1280-dim features from encoder hidden states (mean pooling)
+ 4. Saves features to features.pkl
+ 5. Uploads features.pkl and README.md to model repo
+
+ ```bash
+ python extract_features.py
+ ```
+
+ Output: `features.pkl` - a list of records (see the sketch after this list) with keys:
+ - `features`: numpy array (1280,)
+ - `label`: int (0-5)
+ - `emotion`: string
+ - `split`: "train"/"validation"/"test"
+ - `sensevoice_score`: float
+
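A quick way to sanity-check the output, assuming the record layout above (the `pickle` round-trip is an illustration, not a documented interface):

```python
import pickle

with open("features.pkl", "rb") as f:
    records = pickle.load(f)

train = [r for r in records if r["split"] == "train"]
print(f"{len(records)} records total, {len(train)} in train")

r = records[0]
print(r["emotion"], r["label"], r["features"].shape)  # e.g. happy 1 (1280,)
```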
+ ### 3. train.py
+
+ 1. Loads features from features.pkl
+ 2. Splits 70/15/15 if no split in data
+ 3. Trains EmotionHead MLP:
+    - 1280 → 512 → 256 → 6
+    - BatchNorm + ReLU + Dropout(0.3)
+ 4. Uses class weights for imbalance
+ 5. Trains 150 epochs with AdamW + ReduceLROnPlateau (a condensed sketch follows the outputs list below)
+ 6. Saves best model by validation accuracy
+ 7. Uploads model weights and plots to model repo
+
+ ```bash
+ python train.py
+ ```
+
+ Outputs:
+ - `emotion_head_best.pt` - Best model weights
+ - `confusion_matrix.png` - Test confusion matrix
+ - `training_curve.png` - Loss curves
+
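A condensed sketch of the loop that steps 3-6 describe. Only the architecture, class weights, 150 epochs, AdamW, ReduceLROnPlateau, and best-by-validation-accuracy checkpointing come from the list above; `X_train`/`y_train`/`X_val`/`y_val` are hypothetical feature/label tensors, and the learning rate and patience are guesses.

```python
import torch
import torch.nn as nn

# Inverse-frequency class weights to counter label imbalance (step 4).
counts = torch.bincount(y_train, minlength=6).float()
weights = counts.sum() / (6 * counts.clamp(min=1))
criterion = nn.CrossEntropyLoss(weight=weights)

model = EmotionHead()  # same MLP as in the old benchmark.py above
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", patience=10)

best_acc = 0.0
for epoch in range(150):
    model.train()
    opt.zero_grad()
    loss = criterion(model(X_train), y_train)  # full-batch for brevity
    loss.backward()
    opt.step()

    model.eval()
    with torch.no_grad():
        acc = (model(X_val).argmax(1) == y_val).float().mean().item()
    sched.step(acc)  # plateau scheduler driven by validation accuracy
    if acc > best_acc:  # keep only the best checkpoint (step 6)
        best_acc = acc
        torch.save(model.state_dict(), "emotion_head_best.pt")
```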
+ ### 4. benchmark.py
+
+ Benchmarks the trained model:
+
+ **Bench 1: Emotion F1 vs SenseVoice**
+ - Uses RAVDESS test set
+ - Maps 8 RAVDESS emotions to 6 classes
+ - Compares against SenseVoice baseline
+
+ **Bench 2: Transcription WER** (WER sketch after this section)
+ - Uses LibriSpeech test-clean (100 samples)
+ - Verifies encoder freezing doesn't affect decoder
+
+ ```bash
+ python benchmark.py
+ ```
+
+ Output: `benchmark_results.json`
+
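For Bench 2, the WER comparison can be computed with a library such as `jiwer` (our assumption for illustration; the script's actual WER implementation isn't shown here):

```python
import jiwer

references = ["the quick brown fox", "hello world"]  # ground-truth transcripts
hypotheses = ["the quick brown fox", "hello word"]   # model transcriptions

# Corpus-level word error rate; an unchanged WER indicates that freezing
# the encoder did not degrade the decoder's transcriptions.
print(jiwer.wer(references, hypotheses))
```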
+ ---
+
+ ## Benchmark Results
+
+ ### How the Benchmark is Done
+
+ 1. **Load IEMOCAP test set** from [AudioLLMs/iemocap_emotion_recognition](https://huggingface.co/datasets/AudioLLMs/iemocap_emotion_recognition)
+ 2. **For each audio sample:**
+    - Extract 1280-dim features from Voxtral encoder using `audio_tower()`
+    - Mean pool over time dimension → (1280,)
+    - Pass through trained MLP classifier
+    - Get softmax probabilities
+    - Take argmax for prediction
+ 3. **Map predictions to 4 classes** (neutral, happy, sad, angry), excluding other emotions
+ 4. **Compute metrics** (sketched below):
+    - UA = Unweighted Accuracy (macro recall)
+    - WA = Weighted Accuracy (overall accuracy)
+    - F1 = macro F1
+    - WF1 = weighted F1
+
+ This matches the evaluation methodology from the [SenseVoice paper](https://arxiv.org/abs/2407.04051), Table 4.
+
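The four metrics map directly onto scikit-learn calls, exactly as `compute_metrics` in the old `benchmark.py` does; a minimal self-contained example:

```python
from sklearn.metrics import accuracy_score, f1_score, recall_score

classes = ["neutral", "happy", "sad", "angry"]
trues = ["happy", "sad", "neutral", "angry", "happy"]  # toy labels
preds = ["happy", "sad", "neutral", "happy", "sad"]

ua = recall_score(trues, preds, average="macro", labels=classes, zero_division=0)
wa = accuracy_score(trues, preds)
f1 = f1_score(trues, preds, average="macro", labels=classes, zero_division=0)
wf1 = f1_score(trues, preds, average="weighted", labels=classes, zero_division=0)
print(f"UA={ua:.1%}  WA={wa:.1%}  F1={f1:.1%}  WF1={wf1:.1%}")
```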
+ ### Training Curve (Synthetic Data Scaling)
+
+ | # Training Clips | UA% | WA% | F1% | WF1% |
+ |------------------|-----|-----|-----|------|
+ | 500 (11Labs synthetic) | 16.3 | 25.4 | 14.2 | 21.9 |
+
+ ### Final Benchmark Table
+
+ | Model | UA% | WA% | F1% | WF1% | Trained on IEMOCAP? |
+ |-------|-----|-----|-----|------|---------------------|
+ | **Ours (Voxtral encoder + MLP)** | 16.3 | 25.4 | 14.2 | 21.9 | ❌ 500 synthetic clips (11Labs) |
+ | [SenseVoice-S](https://huggingface.co/FunAudioLLM/SenseVoiceSmall) | 70.5 | 65.7 | 67.9 | 67.8 | ❌ zero-shot |
+ | [emotion2vec+ large](https://huggingface.co/emotion2vec/emotion2vec_plus_large) | ~80 | ~80 | - | - | ✅ IEMOCAP + more |
+
+ > **Note**: We processed 477/1004 IEMOCAP test samples (the 4-class subset: neutral, happy, sad, angry). The model was trained only on 500 synthetic ElevenLabs clips, so the low score is expected. Models marked ✅ were fine-tuned directly on IEMOCAP training data.
+
+ ## Running on RunPod
+
+ ### Pod Setup
+
+ - GPU: RTX 4090 (~$0.48/hr)
+ - Template: RunPod PyTorch 2.1
+ - Container Disk: 30GB
+
+ ### Execution Order
+
+ ```bash
+ # 1. Setup
+ bash setup.sh
+
+ # 2. Extract features (~20 min)
+ python extract_features.py
+
+ # 3. Train (~10 min)
+ python train.py
+
+ # 4. Benchmark (~20 min)
+ python benchmark.py
+
+ # 5. Download results
+ tar -czf results.tar.gz emotion_head_best.pt features.pkl \
+     confusion_matrix.png training_curve.png benchmark_results.json
+ ```
+
+ Then download `results.tar.gz` from the RunPod Files tab.
+
+ ## Model Architecture
+
+ ```
+ Voxtral Encoder (frozen)
+         ↓
+ Mean Pooling (1280 dims)
+         ↓
+ EmotionHead MLP
+   - Linear(1280, 512) + BatchNorm + ReLU + Dropout(0.3)
+   - Linear(512, 256) + BatchNorm + ReLU + Dropout(0.3)
+   - Linear(256, 6)
+ ```