import os
import tempfile
import traceback

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch

# Define your PyTorch model class to match the conversion
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()
        # Build the sequential model
        layers = []
        prev_size = input_features
        # Add hidden layers
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size
        # Add output layer
        layers.append(torch.nn.Linear(prev_size, num_classes))
        # Create the model
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

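# With the parameters used below (input_features=768, hidden_sizes=[256, 128, 64],
# num_classes=8), this expands to:
#   Linear(768, 256) -> ReLU -> Linear(256, 128) -> ReLU
#   -> Linear(128, 64) -> ReLU -> Linear(64, 8)
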
# Define emotions list - make sure this matches your model's output classes
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"] # Added "calm" as the 8th emotion based on your model
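# Note: this looks like the RAVDESS 8-class label set; if the model was trained
# on RAVDESS, it is worth verifying the index order matches the training
# encoding (canonically: neutral, calm, happy, sad, angry, fearful, disgust,
# surprised).
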
# Load the PyTorch model
try:
    print("Loading PyTorch model...")
    # Parameters determined from the Keras model
    input_features = 768           # From the Keras model's first layer weights
    hidden_sizes = [256, 128, 64]  # From the Keras model architecture
    num_classes = 8                # From the Keras model's output layer
    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None

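# load_state_dict() above assumes emotion_model.pt was saved as a state dict
# (i.e. torch.save(model.state_dict(), ...)); if the whole module was pickled
# instead, loading would need model = torch.load(model_path, ...) instead.
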
def extract_features(audio_path, sample_rate=16000):
    """Extract features from an audio file that match what the model expects"""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # The model expects a 768-dimensional input, so combine MFCC, chroma,
        # mel, and spectral-contrast statistics into one rich feature vector
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)
        # Extract chroma features
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)
        # Extract mel spectrogram
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)
        # Extract spectral contrast
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)
        # Combine all features
        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],  # Cap mel bands at 200 to keep the total under 768
            contrast_mean, contrast_var
        ])
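        # Feature count with librosa defaults: 20 + 20 MFCC stats, 12 + 12
        # chroma, 128 + 128 mel (n_mels defaults to 128, so the [:200] slice
        # is a no-op), and 7 + 7 spectral contrast = 334 values; the padding
        # below brings this up to the 768 the model expects.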
        # Ensure we have exactly 768 features
        if len(features) < 768:
            # Pad with zeros if needed
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            # Truncate if too many
            features = features[:768]
        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None

def predict_emotion(audio):
    """Predict emotion from audio input"""
    if model is None:
        # Model failed to load: fall back to a uniform distribution
        return {emotion: 1 / len(emotions) for emotion in emotions}
    try:
        print(f"Processing audio input: {type(audio)}")
        # Process audio based on input type
        if isinstance(audio, str):  # File path (what type="filepath" delivers)
            features = extract_features(audio)
        else:  # Raw audio array from the microphone
            # Save it to a temporary file first
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                if isinstance(audio, tuple):
                    # Gradio passes numpy audio as (sample_rate, data)
                    sample_rate, audio_array = audio
                else:
                    audio_array = audio
                    sample_rate = 16000
                sf.write(temp_file.name, audio_array, sample_rate)
            features = extract_features(temp_file.name)
            # Clean up
            os.remove(temp_file.name)
        if features is None:
            return {emotion: 1 / len(emotions) for emotion in emotions}
        # Convert features to a PyTorch tensor with a batch dimension
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
        # Make prediction
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
        # Format result as {emotion: probability}
        result = {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1 / len(emotions) for emotion in emotions}

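# A quick sanity check without the UI (a sketch; the 440 Hz tone is an
# arbitrary test signal, and with no model loaded this just prints the
# uniform fallback distribution):
#
#   tone = 0.5 * np.sin(2 * np.pi * 440 * np.linspace(0, 1, 16000))
#   sf.write("test_tone.wav", tone, 16000)
#   print(predict_emotion("test_tone.wav"))
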
# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),  # Updated to match the 8 emotions
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions.",
)

demo.launch()
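
# On Hugging Face Spaces the bare launch() above is sufficient; for a shareable
# link when running locally, demo.launch(share=True) is one option.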