import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import os

# Load the trained model from the bundled .h5 file
model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
model = tf.keras.models.load_model(model_path)

# Emotion labels, in the order the model's output layer was trained on
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
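
# Assumed model contract (inferred from the code below, not verified against
# the checkpoint itself): inputs are MFCC matrices of shape (batch, 13, 128),
# outputs are (batch, 7) softmax probabilities aligned with `emotions`.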
def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract an MFCC feature matrix of shape (n_mfcc, max_length) from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate the time axis to a fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None
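
# Quick shape sanity check (a sketch; "clip.wav" is a hypothetical file):
#
#   feats = extract_features("clip.wav")
#   print(feats.shape)  # -> (13, 128): 13 MFCC coefficients x 128 time frames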
def predict_emotion(audio):
    """Predict emotion probabilities from audio input.

    Accepts a file path (the Gradio component below uses type="filepath"
    for both upload and microphone) or a raw audio array, so the function
    keeps working if the input type is ever switched to "numpy".
    """
    try:
        if isinstance(audio, str):  # File path
            features = extract_features(audio)
        else:  # Raw audio array
            # Gradio's numpy audio format is a (sample_rate, data) tuple
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                # Bare array: assume the model's training sample rate
                audio_array = audio
                sample_rate = 16000
            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)
            # Gradio delivers int16 PCM; librosa expects floats in [-1, 1]
            if np.issubdtype(audio_array.dtype, np.integer):
                audio_array = audio_array.astype(np.float32) / 32768.0
            # Extract MFCCs and pad/truncate to the same fixed length
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs

        if features is None:
            return {emotion: 0.0 for emotion in emotions}

        # Add a batch dimension: (13, 128) -> (1, 13, 128)
        features = np.expand_dims(features, axis=0)
        predictions = model.predict(features)
        # Map each output probability to its emotion label
        return {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
    except Exception as e:
        print(f"Error in prediction: {e}")
        return {emotion: 0.0 for emotion in emotions}
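
# Example of the returned mapping (illustrative values only):
#
#   predict_emotion("clip.wav")
#   # -> {"neutral": 0.05, "happy": 0.82, "sad": 0.01, "angry": 0.03,
#   #     "fearful": 0.04, "disgust": 0.02, "surprised": 0.03}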
# Create the Gradio interface with both file upload and microphone input
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "and surprised emotions."
    ),
    examples=[
        ["example1.wav"],  # Add example files here if you have them
    ],
)

demo.launch()
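
# To run locally (sketch): `python app.py`, then open the printed local URL.
# On Hugging Face Spaces, this file is executed on startup and demo.launch()
# starts the web server for you.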