import os
import tempfile
import traceback

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch


class EmotionClassifier(torch.nn.Module):
    """Simple MLP classifier: input features -> hidden layers with ReLU -> logits."""

    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()

        layers = []
        prev_size = input_features

        # Stack a Linear + ReLU block for each hidden size.
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size

        # Final projection to one logit per emotion class.
        layers.append(torch.nn.Linear(prev_size, num_classes))

        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


# Class labels, in the order the model was trained on.
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]


try:
    print("Loading PyTorch model...")

    # Architecture must match the checkpoint: 768 -> 256 -> 128 -> 64 -> 8.
    input_features = 768
    hidden_sizes = [256, 128, 64]
    num_classes = 8

    model = EmotionClassifier(input_features, hidden_sizes, num_classes)

    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None
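# Note: the load above assumes emotion_model.pt holds a state_dict saved via
# torch.save(model.state_dict(), ...) for the exact architecture defined here;
# a checkpoint saved as a full pickled model would instead be loaded with
# torch.load alone.
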
def extract_features(audio_path, sample_rate=16000):
    """Extract a fixed-length feature vector matching the model's 768-dim input."""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)

        # MFCCs: mean and variance over time (20 coefficients each).
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)

        # Chroma: 12 pitch-class bins, mean and variance.
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)

        # Mel spectrogram: mean and variance per mel band.
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)

        # Spectral contrast: mean and variance per band.
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)

        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],
            contrast_mean, contrast_var
        ])

        # Zero-pad or truncate to the model's fixed input size of 768.
        if len(features) < 768:
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            features = features[:768]

        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None
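# Dimension check: with librosa defaults this yields 20+20 (MFCC) + 12+12
# (chroma) + 128+128 (mel; melspectrogram defaults to n_mels=128, so the
# [:200] slice is a no-op) + 7+7 (contrast) = 334 values, zero-padded to 768.
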
def predict_emotion(audio):
    """Predict emotion probabilities from audio input."""
    if model is None:
        # Model failed to load; fall back to a uniform distribution.
        return {emotion: 1 / len(emotions) for emotion in emotions}

    try:
        print(f"Processing audio input: {type(audio)}")

        if isinstance(audio, str):
            # type="filepath" passes a path to the recorded/uploaded file.
            features = extract_features(audio)
        else:
            # Fallback for raw-array input: write it to a temporary WAV first.
            if isinstance(audio, tuple):
                # Gradio's numpy audio format is (sample_rate, data).
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            # Create a closed temp file so it can be reopened by name on any OS.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_path = temp_file.name
            sf.write(temp_path, audio_array, sample_rate)
            features = extract_features(temp_path)
            os.remove(temp_path)

        if features is None:
            return {emotion: 1 / len(emotions) for emotion in emotions}

        # Add a batch dimension: shape (1, 768).
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        result = {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1 / len(emotions) for emotion in emotions}
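# With type="filepath" below, Gradio hands predict_emotion a path string, so
# the raw-array branch above is only a defensive fallback.
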
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions."
)

if __name__ == "__main__":
    demo.launch()
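# Dependencies inferred from the imports: gradio, torch, librosa, numpy,
# soundfile. Assuming this file is saved as app.py next to emotion_model.pt,
# `python app.py` starts the UI at the local URL Gradio prints.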