Upload SER models, predict script, and config

Browse files

Files changed (6) hide show

.gitattributes +2 -0
outputs/README.md +78 -0
outputs/config.json +30 -0
outputs/fusion_model.keras +3 -0
outputs/model1_cnn_bilstm_attn.keras +3 -0
outputs/predict.py +74 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+outputs/fusion_model.keras filter=lfs diff=lfs merge=lfs -text
+outputs/model1_cnn_bilstm_attn.keras filter=lfs diff=lfs merge=lfs -text

outputs/README.md ADDED Viewed

	@@ -0,0 +1,78 @@

+---
+tags:
+- audio-classification
+- speech-emotion-recognition
+- tensorflow
+- keras
+- emotion2vec
+language:
+- en
+license: apache-2.0
+metrics:
+- accuracy
+---
+# Speech Emotion Recognition (SER) System
+## Overview
+Production-quality Speech Emotion Recognition detecting **6 core emotions** from voice/audio:
+- **Angry** | **Disgust** | **Fear** | **Happy** | **Neutral** | **Sad**
+## Architecture
+**Fusion Model**: CNN + BiLSTM + Multi-Head Self-Attention (spectrogram features) + emotion2vec embeddings
+### Feature Pipeline
+| Feature | Dimensions |
+|---------|-----------|
+| Mel Spectrogram | 128 bands |
+| MFCC | 40 coefficients |
+| Zero Crossing Rate | 1 |
+| RMS Energy | 1 |
+| **Total** | **170 × 200 → (170, 200, 1)** |
+| emotion2vec embedding | 768-dim |
+### Training Data
+- **CREMA-D**: 7,442 clips, 91 actors (train/val/test split provided)
+- **RAVDESS**: 1,056 speech clips, 24 actors (70/15/15 split)
+- **Augmentation**: pitch shift, time stretch, Gaussian noise, SpecAugment
+## Results
+| Model | Val Accuracy | Test Accuracy |
+|-------|-------------|---------------|
+| **CNN+BiLSTM+Attention (Model 1)** | **56.0%** | **59.2%** |
+| Fusion (CNN + emotion2vec) | 53.2% | 54.9% |
+| Human baseline (audio-only) | - | 40.9% |
+**Best: Model 1 — 59.2% test accuracy (+18.3pp over human baseline)**
+## Quick Start
+```bash
+pip install tensorflow librosa numpy funasr modelscope
+```
+```python
+from predict import predict_emotion
+label, confidence, probs = predict_emotion("audio.wav", model_dir="./outputs")
+# Prints e.g. "Predicted Emotion: HAPPY" and "Confidence: 87.3%", followed by a per-class probability bar chart
+```
+## Download & Use Locally
+```bash
+# Clone the repo
+git lfs install
+git clone https://huggingface.co/SamOp224/speech-emotion-recognition
+cd speech-emotion-recognition
+# Run prediction
+python outputs/predict.py your_audio.wav outputs
+```
+## Files
+- `outputs/fusion_model.keras` — Fusion model (CNN + emotion2vec; the model loaded by `predict.py`)
+- `outputs/model1_cnn_bilstm_attn.keras` — CNN+BiLSTM+Attention standalone (Model 1, best test accuracy)
+- `outputs/predict.py` — Prediction script with visualization
+- `outputs/config.json` — Configuration and results

outputs/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "model_name": "Speech Emotion Recognition",
+  "architecture": "CNN + BiLSTM + Multi-Head Attention + emotion2vec Fusion",
+  "datasets": [
+    "CREMA-D (confit/cremad-parquet)",
+    "RAVDESS (xbgoose/ravdess)"
+  ],
+  "emotions": [
+    "angry",
+    "disgust",
+    "fear",
+    "happy",
+    "neutral",
+    "sad"
+  ],
+  "num_classes": 6,
+  "sample_rate": 16000,
+  "feature_dim": 170,
+  "max_len": 200,
+  "n_mels": 128,
+  "n_mfcc": 40,
+  "n_fft": 2048,
+  "hop_length": 512,
+  "model1_val_acc": 0.5604395866394043,
+  "model1_test_acc": 0.5916928052902222,
+  "fusion_val_acc": 0.5321820974349976,
+  "fusion_test_acc": 0.5485893487930298,
+  "best_model": "Model 1",
+  "best_test_acc": 0.5916928052902222
+}

outputs/fusion_model.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0693a744d8df8ad58caf9d6404a424ca39f3b0a3157c28556ec2eaea3a8856f0
+size 77311751

outputs/model1_cnn_bilstm_attn.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:838bb62a998802c45af8a230b604f407b469ff10836470443a5089da1c53048c
+size 75116347

outputs/predict.py ADDED Viewed

	@@ -0,0 +1,74 @@

+#!/usr/bin/env python3
+"""
+Speech Emotion Recognition - Prediction Script
+Usage: python predict.py <path_to_wav_file> [model_dir]
+"""
+import os, sys, numpy as np, librosa
+# Audio and feature settings used by extract_features() and predict_emotion().
+# The values are the same as those recorded in outputs/config.json.
+# Sampling rate passed to librosa.load() and to the feature extractors.
+SAMPLE_RATE = 16000
+# Number of time frames every feature matrix is zero-padded or cropped to.
+MAX_LEN = 200
+# Number of mel bands in the mel spectrogram.
+N_MELS = 128
+# Number of MFCC coefficients.
+N_MFCC = 40
+# FFT size for mel/MFCC; also reused as the frame length for ZCR and RMS.
+N_FFT = 2048
+# Hop length shared by all four feature extractors so their frames line up.
+HOP_LENGTH = 512
+# Class names; predict_emotion() indexes this list with the argmax of the model output.
+EMOTION_LABELS = ["angry", "disgust", "fear", "happy", "neutral", "sad"]
+def extract_features(wav, sr=SAMPLE_RATE, max_len=MAX_LEN):
+    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH)
+    mel_db = librosa.power_to_db(mel, ref=np.max)
+    mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
+    zcr = librosa.feature.zero_crossing_rate(wav, frame_length=N_FFT, hop_length=HOP_LENGTH)
+    rms = librosa.feature.rms(y=wav, frame_length=N_FFT, hop_length=HOP_LENGTH)
+    features = np.vstack([mel_db, mfcc, zcr, rms])
+    mean = features.mean(axis=1, keepdims=True)
+    std = features.std(axis=1, keepdims=True)
+    features = (features - mean) / (std + 1e-8)
+    T = features.shape[1]
+    if T < max_len:
+        features = np.pad(features, ((0,0),(0,max_len-T)), mode="constant")
+    else:
+        features = features[:, :max_len]
+    return features[:, :, np.newaxis].astype(np.float32)
+def extract_emotion2vec_embedding(wav_path):
+    try:
+        from funasr import AutoModel
+        model = AutoModel(model="iic/emotion2vec_base", hub="hf", disable_update=True)
+        res = model.generate(wav_path, output_dir=None, granularity="utterance", extract_embedding=True)
+        emb = np.array(res[0]["feats"]).flatten()[:768]
+        if len(emb) < 768:
+            emb = np.pad(emb, (0, 768-len(emb)))
+        return emb.astype(np.float32)
+    except Exception as e:
+        print(f"emotion2vec failed: {e}, using zeros")
+        return np.zeros(768, dtype=np.float32)
+def predict_emotion(file_path, model_dir="./outputs"):
+    import tensorflow as tf
+    wav, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+    spec = extract_features(wav)[np.newaxis]  # (1, 170, 200, 1)
+    e2v = extract_emotion2vec_embedding(file_path)[np.newaxis]  # (1, 768)
+    fusion = tf.keras.models.load_model(os.path.join(model_dir, "fusion_model.keras"))
+    probs = fusion.predict({"spec_input": spec, "e2v_input": e2v}, verbose=0)[0]
+    idx = np.argmax(probs)
+    label = EMOTION_LABELS[idx]
+    conf = probs[idx] * 100
+    print(f"\nPredicted Emotion: {label.upper()}")
+    print(f"Confidence: {conf:.1f}%\n")
+    bar_w = 40
+    for i in sorted(range(len(EMOTION_LABELS)), key=lambda i: -probs[i]):
+        bl = int(probs[i] * bar_w)
+        bar = "█" * bl + "░" * (bar_w - bl)
+        m = " ◄" if i == idx else ""
+        print(f"  {EMOTION_LABELS[i]:>8s} [{bar}] {probs[i]*100:5.1f}%{m}")
+    return label, conf, {EMOTION_LABELS[i]: float(probs[i]*100) for i in range(len(EMOTION_LABELS))}
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python predict.py <wav_file> [model_dir]")
+        sys.exit(1)
+    predict_emotion(sys.argv[1], sys.argv[2] if len(sys.argv)>2 else "./outputs")