import gradio as gr
import torch
import librosa
import numpy as np
import os
import traceback


# Define the PyTorch model class (must match the architecture used during conversion)
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        # Adjust this architecture to match your converted model
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes),
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)


# Create the model instance
input_shape = 13 * 128  # n_mfcc * max_length
num_classes = 7  # number of emotions
model = EmotionClassifier(input_shape, num_classes)

# Load the saved model weights
model_path = os.path.join(os.path.dirname(__file__), "emotion_model.pt")
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
model.eval()

# Emotion labels (order must match the labels used during training)
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]


def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract MFCC features from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate to a fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode="constant")
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None


def predict_emotion(audio):
    """Predict emotion probabilities from audio input."""
    try:
        if isinstance(audio, str):
            # File path (what gr.Audio delivers with type="filepath")
            features = extract_features(audio)
        else:
            # Raw microphone input: Gradio's numpy audio arrives as (sample_rate, data)
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            audio_array = np.asarray(audio_array)
            if audio_array.dtype.kind == "i":
                # Gradio delivers integer PCM; scale to [-1, 1] floats for librosa
                audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
            else:
                audio_array = audio_array.astype(np.float32)

            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)

            # Extract MFCCs directly from the array
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)

            # Pad or truncate to a fixed length
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode="constant")
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs

        if features is None:
            return {emotion: 0.0 for emotion in emotions}

        # Flatten the features to the model's expected (batch, n_mfcc * max_length) shape
        features_flat = features.reshape(1, -1)
        features_tensor = torch.tensor(features_flat, dtype=torch.float32)

        # Get predictions
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Map each emotion label to its predicted probability
        return {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        # Fall back to a uniform distribution so the UI still renders
        return {emotion: 1 / len(emotions) for emotion in emotions}


# Create the Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "and surprised emotions."
    ),
)

demo.launch()
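
# Optional local smoke test (a hedged sketch, not part of the app itself):
# "sample.wav" is a hypothetical test clip placed next to this script, and
# demo.launch() above blocks, so comment it out before running this check.
# It should print a dict mapping each emotion label to a probability.
#
#     from pprint import pprint
#     pprint(predict_emotion("sample.wav"))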