|
|
# Environment / import preamble.
# TF_ENABLE_ONEDNN_OPTS must be set BEFORE TensorFlow is imported, which is
# why `os` is imported and the variable set ahead of the other imports.
import os

# Disable oneDNN custom ops so TF numeric results are reproducible and the
# startup banner about floating-point round-off differences is suppressed.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import gradio as gr
import tensorflow as tf
from tensorflow import keras
import librosa
import numpy as np
import warnings

# Silence librosa/TensorFlow deprecation warnings in the console output.
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
# Path to the trained Keras model. NOTE(review): despite "weights" in the
# filename, it is loaded with load_model(), so it is presumably a full saved
# model (architecture + weights) — confirm against the training script.
MODEL_PATH = 'best_model1_weights.h5'

try:
    model = keras.models.load_model(MODEL_PATH)
    print("✅ Model loaded!")
except Exception as e:
    # Keep the app alive even if loading fails; downstream code must cope
    # with `model` being None.
    print(f"❌ Load error: {e}")
    model = None

# Class labels in the model's output-index order (7 classes, alphabetical).
# NOTE(review): assumes the model was trained with labels in exactly this
# order — verify against the label encoder used during training.
EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
|
|
|
|
|
def extract_features(data, sr):
    """Build a 1-D acoustic feature vector from a mono audio signal.

    Concatenates four frame-level features, each averaged over time:
    zero-crossing rate (1), RMS energy (1), 40 MFCCs, and the mel
    spectrogram (default 128 bands).

    Args:
        data: 1-D audio samples as returned by librosa.load.
        sr: sample rate of `data`.

    Returns:
        1-D numpy array of per-feature time averages.
    """
    frame_features = [
        librosa.feature.zero_crossing_rate(y=data),
        librosa.feature.rms(y=data),
        librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40),
        librosa.feature.melspectrogram(y=data, sr=sr),
    ]
    # Each matrix is (n_features, n_frames); averaging over axis 1 collapses
    # the time dimension, then the pieces are stacked end to end.
    return np.hstack([matrix.mean(axis=1) for matrix in frame_features])
|
|
|
|
|
def preprocess_audio(audio_path):
    """Load an audio file and convert it into the model's input tensor.

    Args:
        audio_path: filesystem path to the recorded/uploaded audio.

    Returns:
        np.ndarray of shape (1, 2376, 1) on success,
        the sentinel string "SILENT" if the clip is near-silent,
        or None if loading/feature extraction fails.
    """
    try:
        # 2.5 s window starting 0.6 s in.
        # NOTE(review): presumably matches the crop used at training time
        # (RAVDESS-style) — confirm against the training pipeline.
        data, sr = librosa.load(audio_path, duration=2.5, offset=0.6, res_type='kaiser_fast')

        # RMS-energy gate: treat near-silent recordings as a special case so
        # the model is never asked to classify background noise.
        if np.sqrt(np.mean(data**2)) < 0.002:
            return "SILENT"

        # Base vector is ~170 dims (1 ZCR + 1 RMS + 40 MFCC + 128 mel);
        # stacked three times to approximate the trained input layout.
        base = extract_features(data, sr)
        features = np.concatenate((base, base, base))

        # Pad with zeros or truncate to the model's fixed length of 2376.
        if len(features) < 2376:
            features = np.pad(features, (0, 2376 - len(features)), 'constant')
        else:
            features = features[:2376]

        # Z-score normalize; guard against a zero-variance vector.
        std = np.std(features)
        if std > 0:
            features = (features - np.mean(features)) / std

        return features.reshape(1, 2376, 1)
    except Exception as e:
        # Surface the failure instead of swallowing it silently (the caller
        # still receives None and shows a generic error in the UI).
        print(f"❌ Preprocessing error: {e}")
        return None
|
|
|
|
|
def predict_emotion(audio_filepath):
    """Gradio handler: classify the emotion in a recorded/uploaded clip.

    Args:
        audio_filepath: path from the gr.Audio component, or None.

    Returns:
        A 3-tuple matching the Gradio outputs:
        (emotion label str, confidence str, {emotion: probability} dict).
    """
    if audio_filepath is None:
        return "No audio", "0%", {}
    if model is None:
        # Model failed to load at startup — fail gracefully instead of
        # raising AttributeError on model.predict below.
        return "Model not loaded", "0%", {}

    feat = preprocess_audio(audio_filepath)
    # BUGFIX: was `feat is "SILENT"` — identity comparison with a string
    # literal is interning-dependent (SyntaxWarning on CPython >= 3.8).
    # An isinstance check also avoids elementwise ndarray == str comparison.
    if isinstance(feat, str) and feat == "SILENT":
        return "NEUTRAL (Silence)", "100%", {"neutral": 1.0}
    if feat is None:
        return "Error", "0%", {}

    preds = model.predict(feat, verbose=0)[0]
    idx = int(np.argmax(preds))

    # Full probability distribution for the gr.Label bar chart.
    prob_dict = {EMOTIONS[i]: float(preds[i]) for i in range(len(EMOTIONS))}

    return EMOTIONS[idx].upper(), f"{preds[idx]*100:.2f}%", prob_dict
|
|
|
|
|
|
|
|
# --- Gradio UI layout ----------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ Speech Emotion Recognition (Fixed Labels)")
    with gr.Row():
        # Input: microphone recording or file upload, delivered to the
        # handler as a temp-file path (type="filepath").
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        with gr.Column():
            # Outputs: top emotion, its confidence, and the full
            # 7-class probability distribution as a bar chart.
            emotion_out = gr.Textbox(label="Detected Emotion")
            conf_out = gr.Textbox(label="Confidence")
            label_chart = gr.Label(label="All Probabilities", num_top_classes=7)

    btn = gr.Button("Analyze", variant="primary")
    # Wire the button to the classifier; output order must match the
    # 3-tuple returned by predict_emotion.
    btn.click(predict_emotion, inputs=audio_input, outputs=[emotion_out, conf_out, label_chart])
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): ssr_mode=False presumably works around Gradio
    # server-side-rendering issues in the deployment environment — confirm.
    demo.launch(ssr_mode=False)