vkushwahaa committed
Commit 41cb454 · verified · 1 Parent(s): 460ca40

Upload folder using huggingface_hub

Files changed (5):
  1. .gitattributes +1 -0
  2. README.md +60 -0
  3. emotion_recognizer.py +133 -0
  4. label_map.json +1 -0
  5. model.keras +3 -0
.gitattributes ADDED
@@ -0,0 +1 @@
model.keras filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
# Audio Emotion Recognition

This model recognizes emotions in speech audio files. It detects seven emotions:
- Angry
- Disgust
- Fear
- Happy
- Neutral
- Sad
- Surprise

## Model Description

This model uses a two-stage approach:
1. Audio feature extraction with YAMNet, which yields a 1024-dimensional embedding per audio frame
2. Emotion classification with a custom neural network trained on those embeddings (see the sketch below)
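
For orientation, here is roughly what stage 1 looks like when YAMNet is called directly through `tensorflow_hub` (a minimal sketch; the silent placeholder waveform stands in for real audio):

```python
import numpy as np
import tensorflow_hub as hub

# Load YAMNet from TF Hub (the same handle this repo's YamnetWrapper uses)
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

# YAMNet expects a 1-D float32 waveform sampled at 16 kHz
waveform = np.zeros(16000 * 3, dtype=np.float32)  # placeholder: 3 s of silence

# YAMNet returns (scores, embeddings, log-mel spectrogram); the emotion
# classifier consumes the 1024-dim embeddings, one row per ~0.96 s frame
scores, embeddings, spectrogram = yamnet(waveform)
print(embeddings.shape)  # (num_frames, 1024)
```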

## Usage

### Installation

```bash
pip install tensorflow librosa huggingface_hub tensorflow_hub
```

### Quick Start

```python
import sys
from huggingface_hub import snapshot_download

# Download the repo and put emotion_recognizer.py on the import path
sys.path.append(snapshot_download(repo_id="vkushwahaa/audio-emotion-recognition"))

from emotion_recognizer import load_from_hf

# Load the model
recognizer = load_from_hf()

# Predict the emotion in an audio file
result = recognizer.predict("path/to/audio.wav")

print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
```
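
`predict` also returns a full per-emotion score map (and an `"error"` key instead if preprocessing fails):

```python
for emotion, score in result["all_scores"].items():
    print(f"  {emotion}: {score:.4f}")
```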

### Command Line Usage

```bash
python emotion_recognizer.py path/to/audio.wav
```
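
On a valid file the script prints the top prediction followed by all class scores, along these lines (values illustrative):

```
Predicted emotion: happy (confidence: 0.87)

All emotion scores:
  angry: 0.0213
  disgust: 0.0108
  fear: 0.0154
  happy: 0.8705
  neutral: 0.0422
  sad: 0.0241
  surprise: 0.0157
```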

## Training

This model was trained on four public speech emotion datasets:
- CREMA-D
- RAVDESS
- SAVEE
- TESS

The model uses YAMNet embeddings as features and applies data augmentation to the training audio to improve robustness.
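
The README does not say which augmentations were used; the sketch below shows common waveform-level choices for speech emotion work (additive noise, pitch shift, time stretch) via librosa, as an illustration rather than this repo's exact recipe:

```python
import numpy as np
import librosa

def augment(y: np.ndarray, sr: int = 16000) -> list:
    """Return augmented copies of a waveform (illustrative parameters)."""
    noisy = y + 0.005 * np.random.randn(len(y)).astype(np.float32)  # additive noise
    pitched = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)      # up 2 semitones
    stretched = librosa.effects.time_stretch(y, rate=0.9)           # 10% slower
    return [noisy, pitched, stretched]
```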
emotion_recognizer.py ADDED
@@ -0,0 +1,133 @@
import os
import json

import numpy as np
import tensorflow as tf
import librosa
from huggingface_hub import snapshot_download


# Custom focal loss (needed so tf.keras can deserialize the saved model)
@tf.keras.utils.register_keras_serializable(package="CustomLosses")
class SparseCategoricalFocalLoss(tf.keras.losses.Loss):
    def __init__(self, gamma=2.0, alpha=0.25, name="sparse_focal_loss", **kwargs):
        super().__init__(name=name, **kwargs)
        self.gamma = gamma
        self.alpha = alpha

    def call(self, y_true, y_pred):
        y_true_one_hot = tf.one_hot(tf.cast(y_true, tf.int32), depth=tf.shape(y_pred)[-1])
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
        ce = -y_true_one_hot * tf.math.log(y_pred)
        weight = self.alpha * tf.pow(1.0 - y_pred, self.gamma)
        focal_loss = weight * ce
        return tf.reduce_sum(focal_loss, axis=-1)

    def get_config(self):
        # Persist gamma/alpha so the loss round-trips through save/load
        config = super().get_config()
        config.update({"gamma": self.gamma, "alpha": self.alpha})
        return config


# Thin Keras wrapper around the TF Hub YAMNet model
class YamnetWrapper(tf.keras.layers.Layer):
    def __init__(self, trainable=False, **kwargs):
        super().__init__(**kwargs)
        import tensorflow_hub as hub
        self.yamnet = hub.KerasLayer("https://tfhub.dev/google/yamnet/1", trainable=trainable)

    @tf.function
    def call(self, waveforms):
        # YAMNet returns (scores, embeddings, spectrogram); keep the embeddings
        return tf.vectorized_map(lambda w: self.yamnet(w)[1], waveforms)


class EmotionRecognizer:
    def __init__(self, model_path="model.keras", label_map_path="label_map.json"):
        # Load label map (JSON keys are strings; convert to int indices)
        with open(label_map_path, "r") as f:
            self.label_map = {int(k): v for k, v in json.load(f).items()}

        # Load the classifier, registering the custom objects it was saved with
        self.model = tf.keras.models.load_model(model_path, custom_objects={
            "SparseCategoricalFocalLoss": SparseCategoricalFocalLoss,
            "YamnetWrapper": YamnetWrapper
        })

        # Separate YAMNet instance used to turn waveforms into embeddings
        self.yamnet = YamnetWrapper()

    def preprocess_audio(self, file_path, sample_rate=16000, duration=3.0):
        """Load an audio file, normalize it, and pad/trim to a fixed length."""
        samples = int(sample_rate * duration)

        try:
            y, sr = librosa.load(file_path, sr=sample_rate, duration=duration)
            y = librosa.util.normalize(y)

            # Ensure a consistent length: zero-pad short clips, trim long ones
            if len(y) < samples:
                y = np.pad(y, (0, samples - len(y)))
            else:
                y = y[:samples]

            return y
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None

    def predict(self, file_path):
        """Predict the emotion in an audio file."""
        audio = self.preprocess_audio(file_path)
        if audio is None:
            return {"error": f"Failed to process audio file: {file_path}"}

        # Extract YAMNet embeddings for the (batch of one) waveform
        try:
            embedding = self.yamnet(tf.constant([audio], dtype=tf.float32)).numpy()
        except Exception as e:
            return {"error": f"Failed to extract embeddings: {e}"}

        # Run the classifier and pick the highest-scoring class
        prediction = self.model.predict(embedding)[0]
        pred_index = np.argmax(prediction)
        pred_emotion = self.label_map[pred_index]

        return {
            "predicted_emotion": pred_emotion,
            "confidence": float(prediction[pred_index]),
            "all_scores": {self.label_map[i]: float(prediction[i]) for i in range(len(prediction))}
        }


def load_from_hf(repo_id="vkushwahaa/audio-emotion-recognition"):
    """Download the model from the Hugging Face Hub and build a recognizer."""
    model_path = snapshot_download(repo_id=repo_id)

    return EmotionRecognizer(
        model_path=os.path.join(model_path, "model.keras"),
        label_map_path=os.path.join(model_path, "label_map.json")
    )


# Example usage
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python emotion_recognizer.py <audio_file_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    # Load the model from the Hugging Face Hub and run a prediction
    recognizer = load_from_hf()
    result = recognizer.predict(audio_path)

    # Bail out cleanly if preprocessing or embedding extraction failed
    if "error" in result:
        print(result["error"])
        sys.exit(1)

    print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
    print("\nAll emotion scores:")
    for emotion, score in result["all_scores"].items():
        print(f"  {emotion}: {score:.4f}")
label_map.json ADDED
@@ -0,0 +1 @@
{"0": "angry", "1": "disgust", "2": "fear", "3": "happy", "4": "neutral", "5": "sad", "6": "surprise"}
model.keras ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c4d6cebb2a3d28f3b2ebe4b904cc4102713e8d26b87c623d292173fe2f267f3
size 7194739