Update app.py

app.py CHANGED

@@ -1,5 +1,4 @@
 import os
-# Fix for Hugging Face/Gradio threading issues
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 
 import gradio as gr
@@ -12,23 +11,19 @@ import warnings
 warnings.filterwarnings('ignore')
 
 # --- 1. MODEL LOADING ---
-# Ensure your file is named 'best_model1_weights.h5' in your repository
 MODEL_PATH = 'best_model1_weights.h5'
-
 try:
-    # Loading the complete model to avoid architecture mismatch (9 vs 13 layers)
     model = keras.models.load_model(MODEL_PATH)
+    print("✅ Model loaded!")
 except Exception as e:
+    print(f"❌ Load error: {e}")
     model = None
 
+# --- 2. THE CORRECT ALPHABETICAL ORDER ---
+# This is the order LabelEncoder uses by default
 EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
 
 def extract_features(data, sr):
-    """Extracts features in the exact order and mean-aggregation used in Kaggle."""
     zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
     rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
     mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
@@ -37,86 +32,57 @@ def extract_features(data, sr):
 
 def preprocess_audio(audio_path):
     try:
-        # Load exactly 2.5s with an offset (Kaggle notebook standard)
-        # res_type='kaiser_fast' ensures speed and consistency
         data, sr = librosa.load(audio_path, duration=2.5, offset=0.6, res_type='kaiser_fast')
 
-        if rms_val < 0.005:
+        # Silence check
+        if np.sqrt(np.mean(data**2)) < 0.002:
             return "SILENT"
 
-        features = np.concatenate((base_feat, base_feat, base_feat))
+        # Stacking features to reach 2376
+        base = extract_features(data, sr)
+        features = np.concatenate((base, base, base))
 
-        # Pad or Truncate to exactly 2376
         if len(features) < 2376:
             features = np.pad(features, (0, 2376 - len(features)), 'constant')
         else:
             features = features[:2376]
 
-        # This is the most important step to prevent "Angry" bias
+        # Standardize
         std = np.std(features)
         if std > 0:
            features = (features - np.mean(features)) / std
 
        return features.reshape(1, 2376, 1)
     except Exception as e:
-        print(f"Preprocessing Error: {e}")
         return None
 
-# --- 3. PREDICTION FUNCTION ---
 def predict_emotion(audio_filepath):
-    if model is None:
-        return "Model not found", "0%", {}
-    if audio_filepath is None:
-        return "Please record audio", "0%", {}
-
-    processed_data = preprocess_audio(audio_filepath)
-
-    if processed_data is "SILENT":
-        return "Silence Detected", "100%", {"neutral": 1.0}
-
-    preds = model.predict(processed_data, verbose=0)[0]
-
-    # Identify top prediction
+    if audio_filepath is None: return "No audio", "0%", {}
+
+    feat = preprocess_audio(audio_filepath)
+    if feat == "SILENT": return "NEUTRAL (Silence)", "100%", {"neutral": 1.0}
+    if feat is None: return "Error", "0%", {}
+
+    preds = model.predict(feat, verbose=0)[0]
     idx = np.argmax(preds)
-    confidence = preds[idx]
 
-    # Map
+    # Map probabilities to the ALPHABETICAL list
    prob_dict = {EMOTIONS[i]: float(preds[i]) for i in range(len(EMOTIONS))}
 
-    return EMOTIONS[idx].upper(), f"{confidence*100:.2f}%", prob_dict
+    return EMOTIONS[idx].upper(), f"{preds[idx]*100:.2f}%", prob_dict
 
+# --- 3. INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎙️ Speech Emotion Recognition")
-    gr.Markdown("Speak for at least 3 seconds. The model will analyze the last 2.5 seconds.")
-
+    gr.Markdown("## 🎙️ Speech Emotion Recognition (Fixed Labels)")
     with gr.Row():
+        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
         with gr.Column():
-        with gr.Column():
-            with gr.Row():
-                emotion_out = gr.Textbox(label="Detected Emotion")
-                conf_out = gr.Textbox(label="Confidence Level")
-            label_chart = gr.Label(label="Confidence Distribution", num_top_classes=7)
+            emotion_out = gr.Textbox(label="Detected Emotion")
+            conf_out = gr.Textbox(label="Confidence")
+            label_chart = gr.Label(label="All Probabilities", num_top_classes=7)
 
-        inputs=audio_input,
-        outputs=[emotion_out, conf_out, label_chart]
-    )
+    btn = gr.Button("Analyze", variant="primary")
+    btn.click(predict_emotion, inputs=audio_input, outputs=[emotion_out, conf_out, label_chart])
 
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", ssr_mode=False)
+    demo.launch(ssr_mode=False)
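The core fix in this commit is the label order. The new comment above EMOTIONS says it is the order LabelEncoder uses by default, which is easy to confirm: scikit-learn's LabelEncoder stores classes_ in sorted order, so softmax index i maps to the i-th name alphabetically. A minimal check (the unsorted input list below is illustrative, not taken from the training code):

from sklearn.preprocessing import LabelEncoder

# Fit on the seven emotion names in arbitrary order...
le = LabelEncoder()
le.fit(['sad', 'happy', 'angry', 'surprise', 'neutral', 'fear', 'disgust'])

# ...and classes_ comes back sorted, matching EMOTIONS in app.py.
print(list(le.classes_))  # ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']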
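The diff never shows the return line of extract_features, so the exact feature stack is an assumption; if it simply hstacks the three arrays shown (1 ZCR mean + 1 RMS mean + 40 MFCC means), each clip yields 42 values, tripling gives 126, and the pad step stretches that to the model's fixed 2376-length input. A sketch under that assumption:

import numpy as np
import librosa

# Stand-in for a real recording: 2.5 s of noise at librosa's default rate.
sr = 22050
data = np.random.randn(int(sr * 2.5)).astype(np.float32)

zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)       # shape (1,)
rms = np.mean(librosa.feature.rms(y=data).T, axis=0)                      # shape (1,)
mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)  # shape (40,)

base = np.hstack((zcr, rms, mfcc))             # 42 values (assumed return of extract_features)
features = np.concatenate((base, base, base))  # 126 values
features = np.pad(features, (0, 2376 - len(features)), 'constant')
print(features.reshape(1, 2376, 1).shape)      # (1, 2376, 1)

Under that assumption the input is mostly padding zeros: only the first 126 positions carry signal, which is worth keeping in mind when judging model behavior.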
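One behavioral note: the commit removes the old "if model is None" guard, so a failed load now surfaces as an exception inside predict_emotion rather than a friendly message. A hypothetical local smoke test (the probe.wav filename and the soundfile dependency are assumptions, not part of the repo):

import numpy as np
import soundfile as sf

# Write a quiet 3 s, 220 Hz tone; predict_emotion reads 2.5 s starting 0.6 s in.
sr = 22050
t = np.linspace(0, 3.0, int(sr * 3.0), endpoint=False)
sf.write('probe.wav', 0.1 * np.sin(2 * np.pi * 220 * t), sr)

label, confidence, probs = predict_emotion('probe.wav')
print(label, confidence, probs)

At 0.1 amplitude the tone's RMS (about 0.07) clears the 0.002 silence threshold, so this exercises the full model path rather than the early "SILENT" return.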