Spaces:

HaryaniAnjali
/

Audio_File_Emotion_Classification

Sleeping

App Files Files Community

Audio_File_Emotion_Classification / app.py

HaryaniAnjali

Update app.py

6069c51 verified about 1 year ago

raw

history blame

4.44 kB

	import gradio as gr
	import torch
	import librosa
	import numpy as np
	import os

	# Define PyTorch model class (must match the structure used during conversion)
	class EmotionClassifier(torch.nn.Module):
	    """Feed-forward classifier over flattened feature vectors.

	    Each sample is flattened and passed through two hidden layers
	    (128 and 64 units, each followed by ReLU and dropout with p=0.3)
	    and a final linear layer that emits one logit per class.

	    NOTE: the attribute names (`flatten`, `layers`) and the position of
	    each layer inside `layers` determine the state-dict keys, so they
	    have to stay as they are for a saved checkpoint to load.
	    """

	    def __init__(self, input_shape, num_classes):
	        """Assemble the layer stack.

	        Args:
	            input_shape: Number of features per sample after flattening.
	            num_classes: Number of logits produced by the last layer.
	        """
	        super().__init__()
	        nn = torch.nn
	        self.flatten = nn.Flatten()

	        # Hidden blocks are created in order so parameter initialisation
	        # consumes the RNG exactly as a hand-written Sequential would.
	        stack = []
	        in_features = input_shape
	        for width in (128, 64):
	            stack += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(0.3)]
	            in_features = width
	        stack.append(nn.Linear(in_features, num_classes))

	        self.layers = nn.Sequential(*stack)

	    def forward(self, x):
	        """Return the raw (pre-softmax) class logits for batch `x`."""
	        flat = self.flatten(x)
	        return self.layers(flat)

	# Create model instance
	input_shape = 13 * 128  # n_mfcc * max_length
	num_classes = 7  # Number of emotions
	model = EmotionClassifier(input_shape, num_classes)

	# Load the saved model weights.
	# The path is anchored to this file's absolute location so loading does not
	# depend on the current working directory.
	model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'emotion_model.pt')
	# weights_only=True limits unpickling to tensors and plain containers, so a
	# tampered checkpoint cannot execute arbitrary code while being loaded.
	state_dict = torch.load(model_path, map_location=torch.device('cpu'), weights_only=True)
	model.load_state_dict(state_dict)
	model.eval()  # inference mode: dropout layers become no-ops

	# Define emotions. Predictions are reported by position, so entry i labels
	# output index i of the model.
	emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

	def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
	    """Load an audio file and return a fixed-width MFCC matrix.

	    The file is read with `sr=sample_rate`, `n_mfcc` coefficients are
	    requested from librosa, and axis 1 of the result is zero-padded on the
	    right or cut so that it holds exactly `max_length` columns.

	    Args:
	        audio_path: Path of the audio file to read.
	        sample_rate: Sampling rate handed to `librosa.load`.
	        n_mfcc: Number of MFCC coefficients to compute.
	        max_length: Required size of axis 1 in the returned array.

	    Returns:
	        The MFCC array (presumably shaped (n_mfcc, max_length)), or None
	        if anything in the pipeline raised; the error is printed.
	    """
	    try:
	        signal, rate = librosa.load(audio_path, sr=sample_rate)
	        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

	        n_frames = coeffs.shape[1]
	        if n_frames >= max_length:
	            # Long enough already: keep only the leading frames.
	            return coeffs[:, :max_length]

	        # Too short: append zero-valued frames on the right.
	        padding = ((0, 0), (0, max_length - n_frames))
	        return np.pad(coeffs, pad_width=padding, mode='constant')
	    except Exception as e:
	        print(f"Error in feature extraction: {e}")
	        return None

	def _fit_to_length(mfccs, max_length=128):
	    """Zero-pad or truncate axis 1 of `mfccs` to exactly `max_length` columns."""
	    shortfall = max_length - mfccs.shape[1]
	    if shortfall > 0:
	        return np.pad(mfccs, pad_width=((0, 0), (0, shortfall)), mode='constant')
	    return mfccs[:, :max_length]


	def _features_from_array(audio, target_sr=16000, n_mfcc=13, max_length=128):
	    """Compute fixed-width MFCCs from in-memory audio.

	    `audio` is either a bare sample array (assumed to already be at
	    `target_sr`) or a 2-tuple pairing a sample array with its sample rate.
	    Gradio's type="numpy" delivers (sample_rate, data); the reversed
	    (data, sample_rate) order is accepted as well.
	    """
	    if isinstance(audio, tuple):
	        first, second = audio
	        # The scalar member is the sample rate, whichever position it is in.
	        if np.ndim(first) == 0:
	            sample_rate, samples = first, second
	        else:
	            samples, sample_rate = first, second
	    else:
	        samples, sample_rate = audio, target_sr

	    samples = np.asarray(samples)
	    if np.issubdtype(samples.dtype, np.integer):
	        # librosa rejects integer input; scale integer PCM to roughly [-1, 1].
	        info = np.iinfo(samples.dtype)
	        samples = samples.astype(np.float32) / max(abs(info.min), info.max)
	    else:
	        samples = samples.astype(np.float32)

	    # Multi-channel input is laid out (samples, channels): average to mono.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)

	    # Use the same rate as the file-path branch so both inputs yield
	    # comparable features.
	    if sample_rate != target_sr:
	        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_sr)

	    mfccs = librosa.feature.mfcc(y=samples, sr=target_sr, n_mfcc=n_mfcc)
	    return _fit_to_length(mfccs, max_length)


	def predict_emotion(audio):
	    """Predict emotion probabilities for one audio input.

	    Args:
	        audio: A file path (str), a raw sample array, a
	            (sample_rate, data) tuple, or None when nothing was provided.

	    Returns:
	        A dict mapping every label in `emotions` to a float score:
	        the softmax probabilities on success, all zeros when there is no
	        audio or no features could be extracted, and a uniform
	        distribution when prediction itself raised (error and traceback
	        are printed).
	    """
	    try:
	        # No recording/upload: nothing to classify, so skip the pipeline
	        # instead of failing deep inside feature extraction.
	        if audio is None:
	            return {emotion: 0.0 for emotion in emotions}

	        if isinstance(audio, str):  # File path
	            features = extract_features(audio)
	        else:  # In-memory audio (e.g. microphone array)
	            features = _features_from_array(audio)

	        if features is None:
	            return {emotion: 0.0 for emotion in emotions}

	        # The model expects one flat feature vector per sample.
	        batch = torch.tensor(features.reshape(1, -1), dtype=torch.float32)

	        with torch.no_grad():
	            logits = model(batch)
	            probabilities = torch.nn.functional.softmax(logits, dim=1)

	        scores = probabilities[0].tolist()
	        return {emotion: scores[i] for i, emotion in enumerate(emotions)}

	    except Exception as e:
	        print(f"Error in prediction: {e}")
	        import traceback
	        traceback.print_exc()
	        return {emotion: 1/len(emotions) for emotion in emotions}

	# Build the Gradio UI: one audio input (microphone recording or uploaded
	# file, handed to the callback as a file path) and a label widget showing
	# the top seven class scores.
	audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
	emotion_output = gr.Label(num_top_classes=7)

	demo = gr.Interface(
	    fn=predict_emotion,
	    inputs=audio_input,
	    outputs=emotion_output,
	    title="Speech Emotion Recognition",
	    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions.",
	)

	# Start serving the app.
	demo.launch()