import os

# Must be set before TensorFlow is imported to take effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import warnings

import gradio as gr
import librosa
import numpy as np
import tensorflow as tf
from tensorflow import keras

warnings.filterwarnings('ignore')

# --- 1. MODEL LOADING ---
MODEL_PATH = 'best_model1_weights.h5'
try:
    model = keras.models.load_model(MODEL_PATH)
    print("✅ Model loaded!")
except Exception as e:
    print(f"❌ Load error: {e}")
    model = None  # predict_emotion checks for this and degrades gracefully

# --- 2. THE CORRECT ALPHABETICAL ORDER ---
# This is the order LabelEncoder uses by default
EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Sentinel returned by preprocess_audio() when the clip is effectively silent.
SILENT = "SILENT"

# Feature-vector length the model was trained on (base features stacked x3,
# then padded/truncated) — TODO confirm against the training pipeline.
FEATURE_LEN = 2376


def extract_features(data, sr):
    """Return a 1-D feature vector: time-averaged ZCR, RMS, 40 MFCCs and
    mel-spectrogram bins, concatenated."""
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
    return np.hstack((zcr, rms, mfcc, mel))


def preprocess_audio(audio_path):
    """Load an audio file and build the model input of shape (1, FEATURE_LEN, 1).

    Returns:
        SILENT   -- if the clip's overall RMS is below the silence threshold,
        None     -- on any loading/processing error,
        ndarray  -- otherwise, standardized features shaped (1, FEATURE_LEN, 1).
    """
    try:
        data, sr = librosa.load(audio_path, duration=2.5, offset=0.6,
                                res_type='kaiser_fast')

        # Silence check: near-zero RMS means no usable speech in the clip.
        if np.sqrt(np.mean(data ** 2)) < 0.002:
            return SILENT

        # Stack the base features 3x to approach the model's expected length,
        # then pad/truncate to exactly FEATURE_LEN.
        base = extract_features(data, sr)
        features = np.concatenate((base, base, base))
        if len(features) < FEATURE_LEN:
            features = np.pad(features, (0, FEATURE_LEN - len(features)),
                              'constant')
        else:
            features = features[:FEATURE_LEN]

        # Standardize (zero mean, unit variance); guard against zero std.
        std = np.std(features)
        if std > 0:
            features = (features - np.mean(features)) / std

        return features.reshape(1, FEATURE_LEN, 1)
    except Exception as e:
        # BUG FIX: the original discarded the exception silently; surface it
        # so preprocessing failures are diagnosable from the console.
        print(f"⚠️ Preprocessing error: {e}")
        return None


def predict_emotion(audio_filepath):
    """Classify one audio file.

    Returns a 3-tuple: (emotion label, confidence string, per-class prob dict)
    matching the three Gradio output components.
    """
    if audio_filepath is None:
        return "No audio", "0%", {}
    if model is None:
        # BUG FIX: the loader sets model = None on failure, but the original
        # called model.predict unconditionally (AttributeError at runtime).
        return "Model not loaded", "0%", {}

    feat = preprocess_audio(audio_filepath)
    # BUG FIX: original used `feat is "SILENT"` — identity comparison with a
    # string literal (SyntaxWarning; relies on CPython interning). Also avoids
    # accidental elementwise `==` on the ndarray branch via the isinstance guard.
    if isinstance(feat, str) and feat == SILENT:
        return "NEUTRAL (Silence)", "100%", {"neutral": 1.0}
    if feat is None:
        return "Error", "0%", {}

    preds = model.predict(feat, verbose=0)[0]
    idx = int(np.argmax(preds))

    # Map probabilities to the ALPHABETICAL list
    prob_dict = {EMOTIONS[i]: float(preds[i]) for i in range(len(EMOTIONS))}
    return EMOTIONS[idx].upper(), f"{preds[idx] * 100:.2f}%", prob_dict


# --- 3. INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ Speech Emotion Recognition (Fixed Labels)")
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"],
                               type="filepath")
        with gr.Column():
            emotion_out = gr.Textbox(label="Detected Emotion")
            conf_out = gr.Textbox(label="Confidence")
            label_chart = gr.Label(label="All Probabilities",
                                   num_top_classes=7)
    btn = gr.Button("Analyze", variant="primary")
    btn.click(predict_emotion, inputs=audio_input,
              outputs=[emotion_out, conf_out, label_chart])

if __name__ == "__main__":
    demo.launch(ssr_mode=False)