Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +60 -0
- emotion_recognizer.py +133 -0
- label_map.json +1 -0
- model.keras +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
model.keras filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Audio Emotion Recognition
|
| 2 |
+
|
| 3 |
+
This model recognizes emotions in speech audio files. It can detect the following emotions:
|
| 4 |
+
- Angry
|
| 5 |
+
- Disgust
|
| 6 |
+
- Fear
|
| 7 |
+
- Happy
|
| 8 |
+
- Neutral
|
| 9 |
+
- Sad
|
| 10 |
+
- Surprise
|
| 11 |
+
|
| 12 |
+
## Model Description
|
| 13 |
+
|
| 14 |
+
This model uses a two-stage approach:
|
| 15 |
+
1. Audio feature extraction using YAMNet
|
| 16 |
+
2. Emotion classification using a custom neural network
|
| 17 |
+
|
| 18 |
+
## Usage
|
| 19 |
+
|
| 20 |
+
### Installation
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
pip install tensorflow librosa huggingface_hub tensorflow_hub
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Quick Start
|
| 27 |
+
|
| 28 |
+
```python
|
| 29 |
+
from huggingface_hub import snapshot_download
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
sys.path.append(snapshot_download(repo_id="vkushwahaa/audio-emotion-recognition"))
|
| 33 |
+
|
| 34 |
+
from emotion_recognizer import load_from_hf
|
| 35 |
+
|
| 36 |
+
# Load model
|
| 37 |
+
recognizer = load_from_hf()
|
| 38 |
+
|
| 39 |
+
# Predict emotion from audio file
|
| 40 |
+
result = recognizer.predict("path/to/audio.wav")
|
| 41 |
+
|
| 42 |
+
# Print results
|
| 43 |
+
print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Command Line Usage
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
python emotion_recognizer.py path/to/audio.wav
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Training
|
| 53 |
+
|
| 54 |
+
This model was trained on multiple datasets:
|
| 55 |
+
- CREMA-D
|
| 56 |
+
- RAVDESS
|
| 57 |
+
- SAVEE
|
| 58 |
+
- TESS
|
| 59 |
+
|
| 60 |
+
The model uses YAMNet embeddings as features and employs data augmentation techniques to improve robustness.
|
emotion_recognizer.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import librosa
|
| 6 |
+
import json
|
| 7 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
+
|
| 9 |
+
# Custom Focal Loss (needed for model loading)
|
| 10 |
+
@tf.keras.utils.register_keras_serializable(package="CustomLosses")
class SparseCategoricalFocalLoss(tf.keras.losses.Loss):
    """Sparse categorical focal loss (Lin et al., 2017).

    Down-weights well-classified examples so training focuses on hard ones.
    Expects integer class labels in ``y_true`` and softmax probabilities
    in ``y_pred``.
    """

    def __init__(self, gamma=2.0, alpha=0.25, name="sparse_focal_loss", **kwargs):
        super().__init__(name=name, **kwargs)
        self.gamma = gamma  # focusing parameter: larger -> more focus on hard examples
        self.alpha = alpha  # balancing weight applied to the loss

    def call(self, y_true, y_pred):
        # One-hot encode the sparse integer labels to match the prediction shape.
        y_true_one_hot = tf.one_hot(tf.cast(y_true, tf.int32), depth=tf.shape(y_pred)[-1])
        # Clip probabilities to avoid log(0) numerical issues.
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
        ce = -y_true_one_hot * tf.math.log(y_pred)
        # Modulating factor alpha * (1 - p)^gamma; only the true-class term
        # survives the sum because ce is zero everywhere else.
        weight = self.alpha * tf.pow(1.0 - y_pred, self.gamma)
        focal_loss = weight * ce
        return tf.reduce_sum(focal_loss, axis=-1)

    def get_config(self):
        # BUG FIX: without get_config, gamma/alpha were silently reset to
        # their defaults whenever the loss was serialized and re-loaded.
        config = super().get_config()
        config.update({"gamma": self.gamma, "alpha": self.alpha})
        return config
|
| 24 |
+
|
| 25 |
+
# YAMNet Wrapper
|
| 26 |
+
class YamnetWrapper(tf.keras.layers.Layer):
    """Keras layer wrapping the pretrained YAMNet model from TF-Hub.

    Maps a batch of raw waveforms to per-clip YAMNet outputs
    (output index 1 of the hub model, i.e. the embeddings).
    """

    def __init__(self, trainable=False, **kwargs):
        super().__init__(**kwargs)
        # Remember the flag so serialization round-trips faithfully.
        self.yamnet_trainable = trainable
        # Imported lazily so the module can be imported without
        # tensorflow_hub until this layer is actually constructed.
        import tensorflow_hub as hub
        self.yamnet = hub.KerasLayer("https://tfhub.dev/google/yamnet/1", trainable=trainable)

    @tf.function
    def call(self, waveforms):
        # YAMNet consumes one waveform at a time; vectorized_map applies it
        # across the batch dimension. Index [1] selects the embedding tensor.
        return tf.vectorized_map(lambda w: self.yamnet(w)[1], waveforms)

    def get_config(self):
        # BUG FIX: the base Layer.get_config reports its own `trainable`
        # attribute (True by default), so a save/load round-trip would
        # rebuild the hub layer with trainable=True even when this wrapper
        # was constructed with trainable=False. Override with the value
        # actually passed to __init__.
        config = super().get_config()
        config.update({"trainable": self.yamnet_trainable})
        return config
|
| 35 |
+
|
| 36 |
+
class EmotionRecognizer:
    """Predicts the emotion expressed in a speech audio file.

    Pipeline: load/normalize audio -> YAMNet embeddings -> classifier head.
    """

    def __init__(self, model_path="model.keras", label_map_path="label_map.json"):
        """Load the classifier and the index->emotion label map.

        Args:
            model_path: Path to the saved Keras classifier.
            label_map_path: Path to a JSON file mapping class indices
                (as strings) to emotion names.
        """
        # JSON object keys are strings; convert them back to int indices.
        with open(label_map_path, "r") as f:
            self.label_map = {int(k): v for k, v in json.load(f).items()}

        # The model was saved with custom objects, so they must be supplied here.
        self.model = tf.keras.models.load_model(model_path, custom_objects={
            'SparseCategoricalFocalLoss': SparseCategoricalFocalLoss,
            'YamnetWrapper': YamnetWrapper
        })

        # Standalone YAMNet instance used for feature extraction at inference time.
        self.yamnet = YamnetWrapper()

    def preprocess_audio(self, file_path, sample_rate=16000, duration=3.0):
        """Load an audio file and return a fixed-length, normalized waveform.

        Args:
            file_path: Path to the audio file.
            sample_rate: Target sample rate in Hz.
            duration: Clip length in seconds; shorter clips are zero-padded,
                longer ones truncated.

        Returns:
            A 1-D float numpy array of exactly ``sample_rate * duration``
            samples, or None if the file could not be processed.
        """
        samples = int(sample_rate * duration)

        try:
            # Load, resample, and peak-normalize the audio.
            y, sr = librosa.load(file_path, sr=sample_rate, duration=duration)
            y = librosa.util.normalize(y)

            # Pad short clips / truncate long ones to a fixed length.
            if len(y) < samples:
                y = np.pad(y, (0, samples - len(y)))
            else:
                y = y[:samples]

            return y
        except Exception as e:
            # Best-effort: report the failure and signal it with None
            # rather than raising, so callers can handle it gracefully.
            print(f"Error processing {file_path}: {e}")
            return None

    def predict(self, file_path):
        """Predict the emotion in ``file_path``.

        Returns:
            A dict with ``predicted_emotion``, ``confidence``, and
            ``all_scores`` on success, or a dict with an ``error`` key
            on failure.
        """
        audio = self.preprocess_audio(file_path)
        if audio is None:
            return {"error": f"Failed to process audio file: {file_path}"}

        # Extract YAMNet embeddings for the (batch of one) waveform.
        try:
            embedding = self.yamnet(tf.constant([audio], dtype=tf.float32)).numpy()
        except Exception as e:
            return {"error": f"Failed to extract embeddings: {e}"}

        prediction = self.model.predict(embedding)[0]

        # Cast to a plain int so the result dict is JSON-serializable
        # (np.argmax returns a numpy integer).
        pred_index = int(np.argmax(prediction))
        pred_emotion = self.label_map[pred_index]

        results = {
            "predicted_emotion": pred_emotion,
            "confidence": float(prediction[pred_index]),
            "all_scores": {self.label_map[i]: float(prediction[i]) for i in range(len(prediction))}
        }

        return results
|
| 99 |
+
|
| 100 |
+
def load_from_hf(repo_id="vkushwahaa/audio-emotion-recognition"):
    """Fetch the model artifacts from the Hugging Face Hub and build a recognizer.

    Args:
        repo_id: Hub repository containing ``model.keras`` and ``label_map.json``.

    Returns:
        A ready-to-use EmotionRecognizer.
    """
    # snapshot_download returns the local directory the repo was cached to.
    repo_dir = snapshot_download(repo_id=repo_id)
    return EmotionRecognizer(
        model_path=os.path.join(repo_dir, "model.keras"),
        label_map_path=os.path.join(repo_dir, "label_map.json"),
    )
|
| 112 |
+
|
| 113 |
+
# Example usage
|
| 114 |
+
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python emotion_recognizer.py <audio_file_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    # Load model from Hugging Face Hub and run inference on the given file.
    recognizer = load_from_hf()
    result = recognizer.predict(audio_path)

    # BUG FIX: predict() returns {"error": ...} on failure; the original
    # script then crashed with a KeyError on 'predicted_emotion' instead
    # of reporting the problem and exiting cleanly.
    if "error" in result:
        print(result["error"])
        sys.exit(1)

    # Print results
    print(f"Predicted emotion: {result['predicted_emotion']} (confidence: {result['confidence']:.2f})")
    print("\nAll emotion scores:")
    for emotion, score in result['all_scores'].items():
        print(f"  {emotion}: {score:.4f}")
|
label_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"0": "angry", "1": "disgust", "2": "fear", "3": "happy", "4": "neutral", "5": "sad", "6": "surprise"}
|
model.keras
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c4d6cebb2a3d28f3b2ebe4b904cc4102713e8d26b87c623d292173fe2f267f3
|
| 3 |
+
size 7194739
|