Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +60 -0
- emotion_recognizer.py +133 -0
- label_map.json +1 -0
- model.keras +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
model.keras filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Audio Emotion Recognition
|
| 2 |
+
|
| 3 |
+
This model recognizes emotions in speech audio files. It can detect the following emotions:
|
| 4 |
+
- Angry
|
| 5 |
+
- Disgust
|
| 6 |
+
- Fear
|
| 7 |
+
- Happy
|
| 8 |
+
- Neutral
|
| 9 |
+
- Sad
|
| 10 |
+
- Surprise
|
| 11 |
+
|
| 12 |
+
## Model Description
|
| 13 |
+
|
| 14 |
+
This model uses a two-stage approach:
|
| 15 |
+
1. Audio feature extraction using YAMNet
|
| 16 |
+
2. Emotion classification using a custom neural network
|
| 17 |
+
|
| 18 |
+
## Usage
|
| 19 |
+
|
| 20 |
+
### Installation
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
pip install tensorflow librosa huggingface_hub tensorflow_hub
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Quick Start
|
| 27 |
+
|
| 28 |
+
```python
|
| 29 |
+
from huggingface_hub import snapshot_download
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
sys.path.append(snapshot_download(repo_id="vkushwahaa/audio-emotion-recognition"))
|
| 33 |
+
|
| 34 |
+
from emotion_recognizer import load_from_hf
|
| 35 |
+
|
| 36 |
+
# Load model
|
| 37 |
+
recognizer = load_from_hf()
|
| 38 |
+
|
| 39 |
+
# Predict emotion from audio file
|
| 40 |
+
result = recognizer.predict("path/to/audio.wav")
|
| 41 |
+
|
| 42 |
+
# Print results
|
| 43 |
+
print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Command Line Usage
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
python emotion_recognizer.py path/to/audio.wav
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Training
|
| 53 |
+
|
| 54 |
+
This model was trained on multiple datasets:
|
| 55 |
+
- CREMA-D
|
| 56 |
+
- RAVDESS
|
| 57 |
+
- SAVEE
|
| 58 |
+
- TESS
|
| 59 |
+
|
| 60 |
+
The model uses YAMNet embeddings as features and employs data augmentation techniques to improve robustness.
|
emotion_recognizer.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import librosa
|
| 6 |
+
import json
|
| 7 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
+
|
| 9 |
+
# Custom Focal Loss (needed for model loading)
|
| 10 |
+
@tf.keras.utils.register_keras_serializable(package="CustomLosses")
class SparseCategoricalFocalLoss(tf.keras.losses.Loss):
    """Sparse categorical focal loss (Lin et al., 2017).

    Down-weights well-classified examples so training focuses on hard ones.
    Expects integer class labels in ``y_true`` and softmax probabilities
    in ``y_pred``.
    """

    def __init__(self, gamma=2.0, alpha=0.25, name="sparse_focal_loss", **kwargs):
        super().__init__(name=name, **kwargs)
        self.gamma = gamma  # focusing parameter: larger -> more focus on hard examples
        self.alpha = alpha  # balancing weight applied to the loss

    def call(self, y_true, y_pred):
        # One-hot encode the sparse integer labels to match the prediction shape.
        y_true_one_hot = tf.one_hot(tf.cast(y_true, tf.int32), depth=tf.shape(y_pred)[-1])
        # Clip probabilities to avoid log(0) numerical issues.
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
        ce = -y_true_one_hot * tf.math.log(y_pred)
        # Modulating factor alpha * (1 - p)^gamma; only the true-class term
        # survives the sum because ce is zero everywhere else.
        weight = self.alpha * tf.pow(1.0 - y_pred, self.gamma)
        focal_loss = weight * ce
        return tf.reduce_sum(focal_loss, axis=-1)

    def get_config(self):
        # BUG FIX: without get_config, gamma/alpha were silently reset to
        # their defaults whenever the loss was serialized and re-loaded.
        config = super().get_config()
        config.update({"gamma": self.gamma, "alpha": self.alpha})
        return config
|
| 24 |
+
|
| 25 |
+
# YAMNet Wrapper
|
| 26 |
+
class YamnetWrapper(tf.keras.layers.Layer):
    """Keras layer wrapping the pretrained YAMNet model from TF-Hub.

    Maps a batch of raw waveforms to per-clip YAMNet outputs
    (output index 1 of the hub model, i.e. the embeddings).
    """

    def __init__(self, trainable=False, **kwargs):
        super().__init__(**kwargs)
        # Remember the flag so serialization round-trips faithfully.
        self.yamnet_trainable = trainable
        # Imported lazily so the module can be imported without
        # tensorflow_hub until this layer is actually constructed.
        import tensorflow_hub as hub
        self.yamnet = hub.KerasLayer("https://tfhub.dev/google/yamnet/1", trainable=trainable)

    @tf.function
    def call(self, waveforms):
        # YAMNet consumes one waveform at a time; vectorized_map applies it
        # across the batch dimension. Index [1] selects the embedding tensor.
        return tf.vectorized_map(lambda w: self.yamnet(w)[1], waveforms)

    def get_config(self):
        # BUG FIX: the base Layer.get_config reports its own `trainable`
        # attribute (True by default), so a save/load round-trip would
        # rebuild the hub layer with trainable=True even when this wrapper
        # was constructed with trainable=False. Override with the value
        # actually passed to __init__.
        config = super().get_config()
        config.update({"trainable": self.yamnet_trainable})
        return config
|
| 35 |
+
|
| 36 |
+
class EmotionRecognizer:
    """Predicts the emotion expressed in a speech audio file.

    Pipeline: load/normalize audio -> YAMNet embeddings -> classifier head.
    """

    def __init__(self, model_path="model.keras", label_map_path="label_map.json"):
        """Load the classifier and the index->emotion label map.

        Args:
            model_path: Path to the saved Keras classifier.
            label_map_path: Path to a JSON file mapping class indices
                (as strings) to emotion names.
        """
        # JSON object keys are strings; convert them back to int indices.
        with open(label_map_path, "r") as f:
            self.label_map = {int(k): v for k, v in json.load(f).items()}

        # The model was saved with custom objects, so they must be supplied here.
        self.model = tf.keras.models.load_model(model_path, custom_objects={
            'SparseCategoricalFocalLoss': SparseCategoricalFocalLoss,
            'YamnetWrapper': YamnetWrapper
        })

        # Standalone YAMNet instance used for feature extraction at inference time.
        self.yamnet = YamnetWrapper()

    def preprocess_audio(self, file_path, sample_rate=16000, duration=3.0):
        """Load an audio file and return a fixed-length, normalized waveform.

        Args:
            file_path: Path to the audio file.
            sample_rate: Target sample rate in Hz.
            duration: Clip length in seconds; shorter clips are zero-padded,
                longer ones truncated.

        Returns:
            A 1-D float numpy array of exactly ``sample_rate * duration``
            samples, or None if the file could not be processed.
        """
        samples = int(sample_rate * duration)

        try:
            # Load, resample, and peak-normalize the audio.
            y, sr = librosa.load(file_path, sr=sample_rate, duration=duration)
            y = librosa.util.normalize(y)

            # Pad short clips / truncate long ones to a fixed length.
            if len(y) < samples:
                y = np.pad(y, (0, samples - len(y)))
            else:
                y = y[:samples]

            return y
        except Exception as e:
            # Best-effort: report the failure and signal it with None
            # rather than raising, so callers can handle it gracefully.
            print(f"Error processing {file_path}: {e}")
            return None

    def predict(self, file_path):
        """Predict the emotion in ``file_path``.

        Returns:
            A dict with ``predicted_emotion``, ``confidence``, and
            ``all_scores`` on success, or a dict with an ``error`` key
            on failure.
        """
        audio = self.preprocess_audio(file_path)
        if audio is None:
            return {"error": f"Failed to process audio file: {file_path}"}

        # Extract YAMNet embeddings for the (batch of one) waveform.
        try:
            embedding = self.yamnet(tf.constant([audio], dtype=tf.float32)).numpy()
        except Exception as e:
            return {"error": f"Failed to extract embeddings: {e}"}

        prediction = self.model.predict(embedding)[0]

        # Cast to a plain int so the result dict is JSON-serializable
        # (np.argmax returns a numpy integer).
        pred_index = int(np.argmax(prediction))
        pred_emotion = self.label_map[pred_index]

        results = {
            "predicted_emotion": pred_emotion,
            "confidence": float(prediction[pred_index]),
            "all_scores": {self.label_map[i]: float(prediction[i]) for i in range(len(prediction))}
        }

        return results
|
| 99 |
+
|
| 100 |
+
def load_from_hf(repo_id="vkushwahaa/audio-emotion-recognition"):
    """Fetch the model artifacts from the Hugging Face Hub and build a recognizer.

    Args:
        repo_id: Hub repository containing ``model.keras`` and ``label_map.json``.

    Returns:
        A ready-to-use EmotionRecognizer.
    """
    # snapshot_download returns the local directory the repo was cached to.
    repo_dir = snapshot_download(repo_id=repo_id)
    return EmotionRecognizer(
        model_path=os.path.join(repo_dir, "model.keras"),
        label_map_path=os.path.join(repo_dir, "label_map.json"),
    )
|
| 112 |
+
|
| 113 |
+
# Example usage
|
| 114 |
+
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python emotion_recognizer.py <audio_file_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    # Load model from Hugging Face Hub and run inference on the given file.
    recognizer = load_from_hf()
    result = recognizer.predict(audio_path)

    # BUG FIX: predict() returns {"error": ...} on failure; the original
    # script then crashed with a KeyError on 'predicted_emotion' instead
    # of reporting the problem and exiting cleanly.
    if "error" in result:
        print(result["error"])
        sys.exit(1)

    # Print results
    print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
    print("\nAll emotion scores:")
    for emotion, score in result['all_scores'].items():
        print(f"  {emotion}: {score:.4f}")
|
label_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"0": "angry", "1": "disgust", "2": "fear", "3": "happy", "4": "neutral", "5": "sad", "6": "surprise"}
|
model.keras
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c4d6cebb2a3d28f3b2ebe4b904cc4102713e8d26b87c623d292173fe2f267f3
|
| 3 |
+
size 7194739
|