import gradio as gr
import torch
import librosa
import numpy as np
import os
# Define PyTorch model class (must match the structure used during conversion)
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        # Adjust this architecture to match your converted model
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes),
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)
# Create model instance
input_shape = 13 * 128 # n_mfcc * max_length
num_classes = 7 # Number of emotions
model = EmotionClassifier(input_shape, num_classes)
# Load the saved model weights
model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()
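# Optional sanity check (illustrative, not part of the original app): confirm the
# loaded weights accept the expected flattened (batch, 13 * 128) input and emit
# one logit per emotion. The shapes assume the n_mfcc/max_length values above.
with torch.no_grad():
    _logits = model(torch.zeros(1, input_shape))
    assert _logits.shape == (1, num_classes), f"unexpected output shape: {_logits.shape}"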
# Emotion labels; the order must match the class indices used during training
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract MFCC features from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate to a fixed number of frames
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None
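# Usage sketch (assumes a local file named 'sample.wav', which is not shipped
# with this app): the function returns an (n_mfcc, max_length) array.
#   feats = extract_features('sample.wav')
#   feats.shape  # -> (13, 128)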
def predict_emotion(audio):
    """Predict emotion from audio input."""
    try:
        # Process audio input
        if isinstance(audio, str):  # File path
            features = extract_features(audio)
        else:  # Raw audio from the microphone
            # Gradio delivers numpy-type audio as a (sample_rate, data) tuple
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000
            # librosa expects float audio; normalize int16-range PCM to [-1, 1]
            audio_array = np.asarray(audio_array, dtype=np.float32)
            if np.abs(audio_array).max() > 1.0:
                audio_array = audio_array / 32768.0
            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)
            # Extract MFCC features
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
            # Pad or truncate to a fixed number of frames
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs

        if features is None:
            return {emotion: 0.0 for emotion in emotions}

        # Flatten the features to match the model's (batch, n_mfcc * max_length) input
        features_flat = features.reshape(1, -1)
        # Convert to a PyTorch tensor
        features_tensor = torch.tensor(features_flat, dtype=torch.float32)

        # Get predictions
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Map each class probability to its emotion label
        result = {emotion: float(probabilities[0][i]) for i, emotion in enumerate(emotions)}
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        import traceback
        traceback.print_exc()
        # Fall back to a uniform distribution so the UI still renders
        return {emotion: 1 / len(emotions) for emotion in emotions}
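# Usage sketch (hypothetical path, illustrative values): calling the function
# directly yields a label -> probability dict suitable for gr.Label.
#   predict_emotion('sample.wav')
#   # -> {'neutral': 0.42, 'happy': 0.08, ...}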
# Create Gradio interface
demo = gr.Interface(
fn=predict_emotion,
inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
outputs=gr.Label(num_top_classes=7),
title="Speech Emotion Recognition",
description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
)
demo.launch()