import gradio as gr
import torch
import librosa
import numpy as np
import os

# Define PyTorch model class (must match the structure used during conversion)
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        # Adjust this architecture to match your converted model
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)

# Create model instance
input_shape = 13 * 128 # n_mfcc * max_length
num_classes = 7 # Number of emotions
model = EmotionClassifier(input_shape, num_classes)
# Load the saved model weights
model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()
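
# Optional sanity check (a minimal sketch; assumes the checkpoint in
# emotion_model.pt was trained on the same flattened 13x128 MFCC layout
# used below): one zero feature vector in, one logit per emotion out.
with torch.no_grad():
    assert model(torch.zeros(1, input_shape)).shape == (1, num_classes)
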
# Define emotions
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
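# NOTE: this ordering is assumed to match the label indices used at training
# time; if the checkpoint used a different order, predictions will be mislabeled.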

def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract MFCC features from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate to fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None
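
# Example (illustrative; "speech.wav" is a hypothetical file):
#   mfccs = extract_features("speech.wav")  # (13, 128) array, or None on failure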

def predict_emotion(audio):
    """Predict emotion probabilities from an audio file path or raw array."""
    try:
        # Process the audio input
        if isinstance(audio, str):  # File path (what gr.Audio(type="filepath") passes in)
            features = extract_features(audio)
        else:  # Raw audio array from the microphone
            if isinstance(audio, tuple):
                # Gradio delivers numpy audio as (sample_rate, data)
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000
            audio_array = np.asarray(audio_array, dtype=np.float32)
            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)
            # Extract features
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
            # Pad or truncate to fixed length
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs
        if features is None:
            return {emotion: 0.0 for emotion in emotions}
        # Flatten the features to match the model's (batch, 13 * 128) input
        features_flat = features.reshape(1, -1)
        # Convert to a PyTorch tensor
        features_tensor = torch.tensor(features_flat, dtype=torch.float32)
        # Get predictions
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
        # Format the results
        return {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
    except Exception as e:
        print(f"Error in prediction: {e}")
        import traceback
        traceback.print_exc()
        # Fall back to a uniform distribution so the UI still renders
        return {emotion: 1 / len(emotions) for emotion in emotions}
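
# Example (illustrative): predict_emotion("speech.wav") returns a dict mapping
# each of the seven labels to a probability, which gr.Label renders as ranked
# confidences.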

# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "and surprised emotions."
    ),
)

if __name__ == "__main__":
    demo.launch()