import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import os

# Load the trained model from the bundled .h5 file
model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
model = tf.keras.models.load_model(model_path)

# Emotion labels, in the order the model's output layer was trained on
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
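
# Assumed model contract (inferred from the code below, not verified against
# the checkpoint itself): inputs are MFCC matrices of shape (batch, 13, 128),
# outputs are (batch, 7) softmax probabilities aligned with `emotions`.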
def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract an MFCC feature matrix of shape (n_mfcc, max_length) from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate the time axis to a fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None
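
# Quick shape sanity check (a sketch; "clip.wav" is a hypothetical file):
#
#   feats = extract_features("clip.wav")
#   print(feats.shape)  # -> (13, 128): 13 MFCC coefficients x 128 time frames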
def predict_emotion(audio):
    """Predict emotion probabilities from audio input.

    Accepts a file path (the Gradio component below uses type="filepath"
    for both upload and microphone) or a raw audio array, so the function
    keeps working if the input type is ever switched to "numpy".
    """
    try:
        if isinstance(audio, str):  # File path
            features = extract_features(audio)
        else:  # Raw audio array
            # Gradio's numpy audio format is a (sample_rate, data) tuple
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                # Bare array: assume the model's training sample rate
                audio_array = audio
                sample_rate = 16000
            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)
            # Gradio delivers int16 PCM; librosa expects floats in [-1, 1]
            if np.issubdtype(audio_array.dtype, np.integer):
                audio_array = audio_array.astype(np.float32) / 32768.0
            # Extract MFCCs and pad/truncate to the same fixed length
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs

        if features is None:
            return {emotion: 0.0 for emotion in emotions}

        # Add a batch dimension: (13, 128) -> (1, 13, 128)
        features = np.expand_dims(features, axis=0)
        predictions = model.predict(features)
        # Map each output probability to its emotion label
        return {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
    except Exception as e:
        print(f"Error in prediction: {e}")
        return {emotion: 0.0 for emotion in emotions}
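
# Example of the returned mapping (illustrative values only):
#
#   predict_emotion("clip.wav")
#   # -> {"neutral": 0.05, "happy": 0.82, "sad": 0.01, "angry": 0.03,
#   #     "fearful": 0.04, "disgust": 0.02, "surprised": 0.03}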
# Create the Gradio interface with both file upload and microphone input
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "and surprised emotions."
    ),
    examples=[
        ["example1.wav"],  # Add example files here if you have them
    ],
)

demo.launch()
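
# To run locally (sketch): `python app.py`, then open the printed local URL.
# On Hugging Face Spaces, this file is executed on startup and demo.launch()
# starts the web server for you.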