import gradio as gr
import numpy as np
import librosa
import tensorflow as tf
# Load trained models
model_female = tf.keras.models.load_model("emotion_recognition_female.h5")
model_male = tf.keras.models.load_model("emotion_recognition_male.h5")
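# Both models are assumed to expect input of shape (batch, 352, 15), i.e. the
# (frames, features) matrix produced by extract_features below plus a batch dim.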
# Constants
FRAME_LENGTH = 2048  # analysis window size in samples
HOP_LENGTH = 512     # step between consecutive frames in samples
emotion_to_code = {
    'neutral': 0, 'happy': 1, 'sad': 2,
    'angry': 3, 'fear': 4, 'disgust': 5
}
code_to_emotion = {v: k for k, v in emotion_to_code.items()}
# Preprocess audio: load, trim leading/trailing silence, and fix the length
# to 180000 samples (~8.16 s at 22050 Hz) so the feature matrix has a constant shape.
def preprocess_audio(path):
    try:
        y, sr = librosa.load(path, sr=22050)
        y, _ = librosa.effects.trim(y, top_db=25)
        if len(y) > 180000:
            y = y[:180000]
        else:
            y = np.pad(y, (0, 180000 - len(y)))
        return y, sr
    except Exception:
        # Fallback: if the file cannot be decoded, return low-amplitude noise
        # so the app still responds instead of crashing.
        sr = 22050
        y = np.random.randn(180000) * 0.1
        return y, sr
# Extract per-frame features: ZCR + RMS + 13 MFCCs = 15 features per frame
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=HOP_LENGTH)
    features = np.vstack([zcr, rms, mfcc])  # shape (15, 352)
    features = features.T                   # shape (352, 15): (time frames, features)
    return features
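# Sanity check (illustrative; "example.wav" is a placeholder path): with a
# fixed 180000-sample input and hop_length=512, librosa's centered framing
# yields 1 + 180000 // 512 = 352 frames, hence the (352, 15) shape above.
#   y, sr = preprocess_audio("example.wav")
#   assert extract_features(y, sr).shape == (352, 15)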
# Predict function: route to the gender-specific model and return the full
# class-probability dict so gr.Label can display the top classes.
def predict(audio_file, gender):
    y, sr = preprocess_audio(audio_file)
    features = extract_features(y, sr)
    features = np.expand_dims(features, axis=0)  # add batch dim -> (1, 352, 15)
    if gender == "female":
        preds = model_female.predict(features)
    else:
        preds = model_male.predict(features)
    probs = preds[0]
    return {code_to_emotion[i]: float(p) for i, p in enumerate(probs)}
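# Illustrative direct call (the path below is a placeholder, not a file
# shipped with this Space):
#   print(predict("sample.wav", "female"))  # -> dict of emotion -> probability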
# Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(choices=["female", "male"], value="female", label="Select Gender")
    ],
    outputs=gr.Label(num_top_classes=3, label="Predicted Emotion"),
    title="🎙️ Speech Emotion Recognition Demo",
    description="Upload an audio file and select gender to predict emotion."
)
demo.launch()
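# Note: Gradio prints a local URL when this script is run with `python app.py`;
# on Hugging Face Spaces, app.py is executed automatically and launch() serves the app.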