SamOp224 committed on
Commit
6cd0752
·
verified ·
1 Parent(s): 6a80b3b

Upload SER models, predict script, and config

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ outputs/fusion_model.keras filter=lfs diff=lfs merge=lfs -text
37
+ outputs/model1_cnn_bilstm_attn.keras filter=lfs diff=lfs merge=lfs -text
outputs/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - audio-classification
4
+ - speech-emotion-recognition
5
+ - tensorflow
6
+ - keras
7
+ - emotion2vec
8
+ language:
9
+ - en
10
+ license: apache-2.0
11
+ metrics:
12
+ - accuracy
13
+ ---
14
+
15
+ # Speech Emotion Recognition (SER) System
16
+
17
+ ## Overview
18
+ Production-quality Speech Emotion Recognition detecting **6 core emotions** from voice/audio:
19
+ - **Angry** | **Disgust** | **Fear** | **Happy** | **Neutral** | **Sad**
20
+
21
+ ## Architecture
22
+ **Fusion Model**: CNN + BiLSTM + Multi-Head Self-Attention (spectrogram features) + emotion2vec embeddings
23
+
24
+ ### Feature Pipeline
25
+ | Feature | Dimensions |
26
+ |---------|-----------|
27
+ | Mel Spectrogram | 128 bands |
28
+ | MFCC | 40 coefficients |
29
+ | Zero Crossing Rate | 1 |
30
+ | RMS Energy | 1 |
31
+ | **Total** | **170 × 200 → (170, 200, 1)** |
32
+ | emotion2vec embedding | 768-dim |
33
+
34
+ ### Training Data
35
+ - **CREMA-D**: 7,442 clips, 91 actors (train/val/test split provided)
36
+ - **RAVDESS**: 1,056 speech clips, 24 actors (70/15/15 split)
37
+ - **Augmentation**: pitch shift, time stretch, Gaussian noise, SpecAugment
38
+
39
+ ## Results
40
+
41
+ | Model | Val Accuracy | Test Accuracy |
42
+ |-------|-------------|---------------|
43
+ | **CNN+BiLSTM+Attention (Model 1)** | **56.0%** | **59.2%** |
44
+ | Fusion (CNN + emotion2vec) | 53.2% | 54.9% |
45
+ | Human baseline (audio-only) | - | 40.9% |
46
+
47
+ **Best: Model 1 — 59.2% test accuracy (+18.3pp over human baseline)**
48
+
49
+ ## Quick Start
50
+
51
+ ```bash
52
+ pip install tensorflow librosa numpy funasr modelscope
53
+ ```
54
+
55
+ ```python
56
+ # Run from the repository root; predict.py lives in outputs/
+ from outputs.predict import predict_emotion
57
+
58
+ label, confidence, probs = predict_emotion("audio.wav", model_dir="./outputs")
59
+ # Prints: Predicted Emotion: HAPPY, Confidence: 87.3%
60
+ ```
61
+
62
+ ## Download & Use Locally
63
+
64
+ ```bash
65
+ # Clone the repo
66
+ git lfs install
67
+ git clone https://huggingface.co/SamOp224/speech-emotion-recognition
68
+ cd speech-emotion-recognition
69
+
70
+ # Run prediction
71
+ python outputs/predict.py your_audio.wav outputs
72
+ ```
73
+
74
+ ## Files
75
+ - `outputs/fusion_model.keras` — Fusion model (CNN + emotion2vec; loaded by `predict.py`)
76
+ - `outputs/model1_cnn_bilstm_attn.keras` — CNN+BiLSTM+Attention standalone (Model 1, best test accuracy)
77
+ - `outputs/predict.py` — Prediction script with visualization
78
+ - `outputs/config.json` — Configuration and results
outputs/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Speech Emotion Recognition",
3
+ "architecture": "CNN + BiLSTM + Multi-Head Attention + emotion2vec Fusion",
4
+ "datasets": [
5
+ "CREMA-D (confit/cremad-parquet)",
6
+ "RAVDESS (xbgoose/ravdess)"
7
+ ],
8
+ "emotions": [
9
+ "angry",
10
+ "disgust",
11
+ "fear",
12
+ "happy",
13
+ "neutral",
14
+ "sad"
15
+ ],
16
+ "num_classes": 6,
17
+ "sample_rate": 16000,
18
+ "feature_dim": 170,
19
+ "max_len": 200,
20
+ "n_mels": 128,
21
+ "n_mfcc": 40,
22
+ "n_fft": 2048,
23
+ "hop_length": 512,
24
+ "model1_val_acc": 0.5604395866394043,
25
+ "model1_test_acc": 0.5916928052902222,
26
+ "fusion_val_acc": 0.5321820974349976,
27
+ "fusion_test_acc": 0.5485893487930298,
28
+ "best_model": "Model 1",
29
+ "best_test_acc": 0.5916928052902222
30
+ }
outputs/fusion_model.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0693a744d8df8ad58caf9d6404a424ca39f3b0a3157c28556ec2eaea3a8856f0
3
+ size 77311751
outputs/model1_cnn_bilstm_attn.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:838bb62a998802c45af8a230b604f407b469ff10836470443a5089da1c53048c
3
+ size 75116347
outputs/predict.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Speech Emotion Recognition - Prediction Script
4
+ Usage: python predict.py <path_to_wav_file> [model_dir]
5
+ """
6
+ import os, sys, numpy as np, librosa
7
+
8
# Audio is resampled to this rate (Hz) when a file is loaded for prediction.
SAMPLE_RATE = 16000

# Number of time frames every feature matrix is padded or truncated to.
MAX_LEN = 200

# Settings forwarded to the librosa feature extractors.
N_MELS = 128
N_MFCC = 40
N_FFT = 2048
HOP_LENGTH = 512

# Class names, indexed by the position of each model output probability.
EMOTION_LABELS = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
]
15
+
16
def extract_features(wav, sr=SAMPLE_RATE, max_len=MAX_LEN):
    """Build the normalized, fixed-width feature map for one audio clip.

    Stacks the log-mel spectrogram, MFCC, zero-crossing-rate and RMS rows,
    standardizes every row over time, then zero-pads or truncates the time
    axis so the result always has ``max_len`` frames.

    Args:
        wav: Audio samples passed straight to the librosa feature functions.
        sr: Sample rate of ``wav``.
        max_len: Number of time frames in the returned array.

    Returns:
        A float32 array of shape ``(N_MELS + N_MFCC + 2, max_len, 1)``.
    """
    spectral_kwargs = {"n_fft": N_FFT, "hop_length": HOP_LENGTH}
    framing_kwargs = {"frame_length": N_FFT, "hop_length": HOP_LENGTH}

    mel_power = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_mels=N_MELS, **spectral_kwargs
    )
    feature_rows = [
        librosa.power_to_db(mel_power, ref=np.max),
        librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=N_MFCC, **spectral_kwargs),
        librosa.feature.zero_crossing_rate(wav, **framing_kwargs),
        librosa.feature.rms(y=wav, **framing_kwargs),
    ]
    stacked = np.vstack(feature_rows)

    # Standardize each feature row independently; the epsilon guards
    # against division by zero on constant rows.
    row_mean = stacked.mean(axis=1, keepdims=True)
    row_std = stacked.std(axis=1, keepdims=True)
    normalized = (stacked - row_mean) / (row_std + 1e-8)

    n_frames = normalized.shape[1]
    if n_frames >= max_len:
        fixed = normalized[:, :max_len]
    else:
        fixed = np.pad(
            normalized, ((0, 0), (0, max_len - n_frames)), mode="constant"
        )

    # Trailing singleton axis is the channel dimension.
    return np.expand_dims(fixed, axis=-1).astype(np.float32)
32
+
33
_E2V_EMBED_DIM = 768  # length of the embedding vector handed to callers
_E2V_MODEL_CACHE = {}  # keeps the loaded model so repeated calls reuse it


def _get_emotion2vec_model():
    """Return the emotion2vec model, constructing it only on first use."""
    if "model" not in _E2V_MODEL_CACHE:
        # Imported lazily so the module can be imported without funasr.
        from funasr import AutoModel
        _E2V_MODEL_CACHE["model"] = AutoModel(
            model="iic/emotion2vec_base", hub="hf", disable_update=True
        )
    return _E2V_MODEL_CACHE["model"]


def extract_emotion2vec_embedding(wav_path):
    """Return the utterance-level emotion2vec embedding for an audio file.

    The embedding is flattened and then truncated or zero-padded to exactly
    768 values. Extraction is best-effort: if funasr is missing, the model
    cannot be loaded, or inference fails, a message is printed and an
    all-zero vector is returned instead of raising.

    Args:
        wav_path: Path of the audio file passed to the model's ``generate``.

    Returns:
        A float32 array of shape ``(768,)``.
    """
    try:
        model = _get_emotion2vec_model()
        res = model.generate(
            wav_path,
            output_dir=None,
            granularity="utterance",
            extract_embedding=True,
        )
        emb = np.array(res[0]["feats"]).flatten()[:_E2V_EMBED_DIM]
        if len(emb) < _E2V_EMBED_DIM:
            emb = np.pad(emb, (0, _E2V_EMBED_DIM - len(emb)))
        return emb.astype(np.float32)
    except Exception as e:
        # Deliberate fallback: callers still get a vector of the right size.
        print(f"emotion2vec failed: {e}, using zeros")
        return np.zeros(_E2V_EMBED_DIM, dtype=np.float32)
45
+
46
def predict_emotion(file_path, model_dir="./outputs"):
    """Predict the emotion of one audio file with the fusion model.

    Loads the audio at ``SAMPLE_RATE``, builds the spectrogram features and
    the emotion2vec embedding, runs ``fusion_model.keras`` from ``model_dir``
    and prints the prediction together with a bar chart of all classes.

    Args:
        file_path: Path of the audio file to classify.
        model_dir: Directory that contains ``fusion_model.keras``.

    Returns:
        A tuple ``(label, confidence, percentages)`` where ``label`` is the
        predicted entry of ``EMOTION_LABELS``, ``confidence`` is its
        probability in percent as a built-in ``float``, and ``percentages``
        maps every label to its probability in percent.
    """
    # Imported lazily: only needed once a prediction is requested.
    import tensorflow as tf

    wav, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    spec = extract_features(wav)[np.newaxis]  # (1, 170, 200, 1)
    e2v = extract_emotion2vec_embedding(file_path)[np.newaxis]  # (1, 768)

    fusion = tf.keras.models.load_model(os.path.join(model_dir, "fusion_model.keras"))
    probs = fusion.predict({"spec_input": spec, "e2v_input": e2v}, verbose=0)[0]

    idx = int(np.argmax(probs))
    label = EMOTION_LABELS[idx]
    # Cast to a built-in float so the confidence matches the dict values
    # below and can be serialized (e.g. with json) like them.
    conf = float(probs[idx] * 100)

    print(f"\nPredicted Emotion: {label.upper()}")
    print(f"Confidence: {conf:.1f}%\n")
    bar_w = 40
    # Most probable class first; ties keep label order.
    for i in sorted(range(len(EMOTION_LABELS)), key=lambda k: -probs[k]):
        bl = int(probs[i] * bar_w)
        bar = "█" * bl + "░" * (bar_w - bl)
        m = " ◄" if i == idx else ""
        print(f" {EMOTION_LABELS[i]:>8s} [{bar}] {probs[i]*100:5.1f}%{m}")

    percentages = {
        name: float(probs[i] * 100) for i, name in enumerate(EMOTION_LABELS)
    }
    return label, conf, percentages
69
+
70
if __name__ == "__main__":
    # Command line: predict.py <wav_file> [model_dir]
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python predict.py <wav_file> [model_dir]")
        sys.exit(1)
    wav_file = cli_args[0]
    model_dir = cli_args[1] if len(cli_args) > 1 else "./outputs"
    predict_emotion(wav_file, model_dir)