| import gradio as gr |
| import tensorflow as tf |
| import librosa |
| import numpy as np |
| import os |
|
|
| |
# Load the pre-trained Keras emotion classifier shipped alongside this script.
# Resolving relative to __file__ makes the load independent of the CWD.
model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
model = tf.keras.models.load_model(model_path)


# Emotion labels in the index order of the model's output layer.
# NOTE(review): this order must match the label encoding used at training
# time — confirm against the training script.
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
|
|
def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Load an audio file and return a fixed-size MFCC feature matrix.

    The file is resampled to ``sample_rate``, converted to ``n_mfcc``
    MFCC coefficients, then padded with zeros (or truncated) along the
    time axis so the result is always shaped ``(n_mfcc, max_length)``.
    Returns ``None`` if loading or feature extraction fails.
    """
    try:
        signal, sr = librosa.load(audio_path, sr=sample_rate)
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

        # Force a fixed number of time frames for the model input.
        n_frames = mfccs.shape[1]
        if n_frames < max_length:
            mfccs = np.pad(mfccs, ((0, 0), (0, max_length - n_frames)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        # Best-effort: callers treat None as "extraction failed".
        print(f"Error in feature extraction: {e}")
        return None
|
|
def predict_emotion(audio):
    """Predict the emotion expressed in an audio clip.

    Accepts either:
      * a file path (str) — what the Gradio interface below sends, since
        it is configured with ``type="filepath"``; or
      * raw audio as a numpy array, or a ``(sample_rate, data)`` tuple —
        the format ``gr.Audio`` delivers when configured with
        ``type="numpy"``.

    Returns a dict mapping each label in ``emotions`` to a float
    confidence; on any failure it returns all zeros so ``gr.Label``
    still renders.
    """
    try:
        if isinstance(audio, str):
            # Uploaded/recorded file on disk.
            features = extract_features(audio)
        else:
            if isinstance(audio, tuple):
                # BUG FIX: gr.Audio with type="numpy" yields
                # (sample_rate, data) — sample rate FIRST. The previous
                # unpacking order treated the int rate as the waveform,
                # which raised inside the try and silently produced zeros.
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000  # assume model's training rate — TODO confirm

            audio_array = np.asarray(audio_array)

            # Gradio delivers integer PCM (e.g. int16); librosa expects
            # float waveforms scaled to [-1, 1].
            if np.issubdtype(audio_array.dtype, np.integer):
                audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
            else:
                audio_array = audio_array.astype(np.float32)

            # Stereo (samples, channels) -> mono.
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)

            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)

            # Pad/truncate to a fixed frame count — keep in sync with
            # extract_features (n_mfcc=13, max_length=128).
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]

            features = mfccs

        if features is None:
            # Feature extraction failed; degrade gracefully.
            return {emotion: 0.0 for emotion in emotions}

        # Add the batch dimension the model expects: (1, n_mfcc, frames).
        features = np.expand_dims(features, axis=0)

        predictions = model.predict(features)

        # Map output units to labels positionally.
        result = {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
        return result

    except Exception as e:
        # Broad catch is deliberate: the UI should always get a label dict.
        print(f"Error in prediction: {e}")
        return {emotion: 0.0 for emotion in emotions}
|
|
| |
# Build the Gradio UI: audio in (microphone or file upload), emotion
# label distribution out. type="filepath" means predict_emotion always
# receives a path string from this interface; its array branches only
# matter if the input type is ever changed to "numpy".
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),  # show all 7 emotion scores
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions.",
    examples=[
        ["example1.wav"],  # NOTE(review): assumes this file ships next to the script — confirm
    ]
)


# Launches at import time (no __main__ guard) — fine for a Spaces/app
# entry point, but importing this module will start the server.
demo.launch()