Update app.py
app.py
CHANGED
@@ -1,14 +1,40 @@
import gradio as gr
-import
+import torch
import librosa
import numpy as np
import os

-#
-
-
+# Define PyTorch model class (must match the structure used during conversion)
+class EmotionClassifier(torch.nn.Module):
+    def __init__(self, input_shape, num_classes):
+        super().__init__()
+        # Adjust this architecture to match your converted model
+        self.flatten = torch.nn.Flatten()
+        self.layers = torch.nn.Sequential(
+            torch.nn.Linear(input_shape, 128),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.3),
+            torch.nn.Linear(128, 64),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.3),
+            torch.nn.Linear(64, num_classes)
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        return self.layers(x)
+
+# Create model instance
+input_shape = 13 * 128 # n_mfcc * max_length
+num_classes = 7 # Number of emotions
+model = EmotionClassifier(input_shape, num_classes)

-#
+# Load the saved model weights
+model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
+model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+model.eval()
+
+# Define emotions
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
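Note on the model-loading block added above: the comment says EmotionClassifier "must match the structure used during conversion", and load_state_dict() is strict by default, so any mismatch in parameter names or shapes will raise as soon as the Space starts. A purely illustrative way to compare emotion_model.pt against the class in this diff (the expected names and shapes below are inferred from that class, not read from the actual checkpoint):

import torch

# Diagnostic sketch, not part of this commit: list what the checkpoint contains,
# assuming emotion_model.pt holds a plain state_dict saved with torch.save().
state = torch.load("emotion_model.pt", map_location="cpu")
for name, tensor in state.items():
    print(name, tuple(tensor.shape))

# For the EmotionClassifier above, the expected entries would be:
#   layers.0.weight (128, 1664)   # 1664 = 13 * 128 flattened MFCC values
#   layers.0.bias   (128,)
#   layers.3.weight (64, 128)
#   layers.3.bias   (64,)
#   layers.6.weight (7, 64)
#   layers.6.bias   (7,)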
@@ -32,30 +58,25 @@ def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
        return None

def predict_emotion(audio):
-    """Predict emotion from audio input
-
-    This function accepts both file path (when uploading) and audio array
-    (when recording via microphone) as input
-    """
+    """Predict emotion from audio input"""
    try:
-        #
+        # Process audio input
        if isinstance(audio, str): # File path
            features = extract_features(audio)
        else: # Audio array from microphone
-            #
+            # Handle microphone input
            if isinstance(audio, tuple):
                audio_array, sample_rate = audio
            else:
-                # If only audio array is provided, assume sample rate
                audio_array = audio
                sample_rate = 16000

        # Convert to mono if stereo
-        if len(audio_array.shape) > 1:
+        if len(np.array(audio_array).shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # Extract features
-        mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
+        mfccs = librosa.feature.mfcc(y=np.array(audio_array), sr=sample_rate, n_mfcc=13)

        # Pad or truncate to fixed length
        max_length = 128
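Aside on the feature path in the hunk above: librosa.feature.mfcc returns an (n_mfcc, n_frames) matrix, and the collapsed lines that follow pad or truncate it to max_length = 128 frames. A small self-contained illustration of the mono conversion and MFCC step, using a synthetic stereo tone as stand-in input (not project data):

import numpy as np
import librosa

sample_rate = 16000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
# Two sine tones standing in for a stereo microphone buffer
audio_array = np.stack([np.sin(2 * np.pi * 220 * t), np.sin(2 * np.pi * 440 * t)], axis=1)

# Same mono conversion as in predict_emotion
if len(np.array(audio_array).shape) > 1:
    audio_array = np.mean(audio_array, axis=1)

mfccs = librosa.feature.mfcc(y=np.array(audio_array), sr=sample_rate, n_mfcc=13)
print(mfccs.shape)  # (13, n_frames); predict_emotion then pads/truncates to 128 frames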
@@ -70,30 +91,34 @@ def predict_emotion(audio):
        if features is None:
            return {emotion: 0.0 for emotion in emotions}

-        #
-
+        # Flatten the features (adjust based on your model's input expectations)
+        features_flat = features.reshape(1, -1)
+
+        # Convert to PyTorch tensor
+        features_tensor = torch.tensor(features_flat, dtype=torch.float32)

-        #
-
+        # Get predictions
+        with torch.no_grad():
+            outputs = model(features_tensor)
+            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Format results
-        result = {emotion: float(
+        result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
        return result

    except Exception as e:
        print(f"Error in prediction: {e}")
-
+        import traceback
+        traceback.print_exc()
+        return {emotion: 1/len(emotions) for emotion in emotions}

-# Create Gradio interface
+# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
-    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
-    examples=[
-        ["example1.wav"], # Add example files here if you have them
-    ]
+    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
)

demo.launch()
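The new prediction block turns the model's raw logits into the {label: confidence} dictionary that gr.Label(num_top_classes=7) displays. A standalone illustration of that formatting step with made-up logits (no trained model involved):

import torch

emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
logits = torch.tensor([[0.2, 1.5, -0.3, 0.0, -1.0, -0.7, 0.4]])  # fake model output, shape (1, 7)

with torch.no_grad():
    probabilities = torch.nn.functional.softmax(logits, dim=1)

result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
print(result)                # per-emotion confidences
print(sum(result.values()))  # ~1.0; gr.Label shows these as ranked confidences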
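Since this update adds torch alongside the existing gradio, librosa, and numpy imports, all four packages need to be installed in the Space. If the repository does not already pin them, a minimal requirements.txt covering the imports in app.py would look something like this (the file itself is an assumption, not part of this commit):

gradio
torch
librosa
numpy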