import os
import tempfile
import traceback

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch


class EmotionClassifier(torch.nn.Module):
    """Simple MLP classifier: input features -> hidden layers with ReLU -> logits."""

    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()

        layers = []
        prev_size = input_features

        # Stack a Linear + ReLU block for each hidden size.
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size

        # Final projection to one logit per emotion class.
        layers.append(torch.nn.Linear(prev_size, num_classes))

        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


# Class labels, in the order the model was trained on.
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]


try:
    print("Loading PyTorch model...")

    # Architecture must match the checkpoint: 768 -> 256 -> 128 -> 64 -> 8.
    input_features = 768
    hidden_sizes = [256, 128, 64]
    num_classes = 8

    model = EmotionClassifier(input_features, hidden_sizes, num_classes)

    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None
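# Note: the load above assumes emotion_model.pt holds a state_dict saved via
# torch.save(model.state_dict(), ...) for the exact architecture defined here;
# a checkpoint saved as a full pickled model would instead be loaded with
# torch.load alone.
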
def extract_features(audio_path, sample_rate=16000):
    """Extract a fixed-length feature vector matching the model's 768-dim input."""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)

        # MFCCs: mean and variance over time (20 coefficients each).
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)

        # Chroma: 12 pitch-class bins, mean and variance.
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)

        # Mel spectrogram: mean and variance per mel band.
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)

        # Spectral contrast: mean and variance per band.
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)

        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],
            contrast_mean, contrast_var
        ])

        # Zero-pad or truncate to the model's fixed input size of 768.
        if len(features) < 768:
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            features = features[:768]

        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None
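# Dimension check: with librosa defaults this yields 20+20 (MFCC) + 12+12
# (chroma) + 128+128 (mel; melspectrogram defaults to n_mels=128, so the
# [:200] slice is a no-op) + 7+7 (contrast) = 334 values, zero-padded to 768.
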
def predict_emotion(audio):
    """Predict emotion probabilities from audio input."""
    if model is None:
        # Model failed to load; fall back to a uniform distribution.
        return {emotion: 1 / len(emotions) for emotion in emotions}

    try:
        print(f"Processing audio input: {type(audio)}")

        if isinstance(audio, str):
            # type="filepath" passes a path to the recorded/uploaded file.
            features = extract_features(audio)
        else:
            # Fallback for raw-array input: write it to a temporary WAV first.
            if isinstance(audio, tuple):
                # Gradio's numpy audio format is (sample_rate, data).
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            # Create a closed temp file so it can be reopened by name on any OS.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_path = temp_file.name
            sf.write(temp_path, audio_array, sample_rate)
            features = extract_features(temp_path)
            os.remove(temp_path)

        if features is None:
            return {emotion: 1 / len(emotions) for emotion in emotions}

        # Add a batch dimension: shape (1, 768).
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        result = {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1 / len(emotions) for emotion in emotions}
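# With type="filepath" below, Gradio hands predict_emotion a path string, so
# the raw-array branch above is only a defensive fallback.
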
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions."
)

if __name__ == "__main__":
    demo.launch()
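# Dependencies inferred from the imports: gradio, torch, librosa, numpy,
# soundfile. Assuming this file is saved as app.py next to emotion_model.pt,
# `python app.py` starts the UI at the local URL Gradio prints.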