notuser77 committed · verified
Commit 743ff29 · Parent: 1c6458c

Update app.py

Files changed (1):
  app.py (+27 -61)
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-# Fix for Hugging Face/Gradio threading issues
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 
 import gradio as gr
@@ -12,23 +11,19 @@ import warnings
 warnings.filterwarnings('ignore')
 
 # --- 1. MODEL LOADING ---
-# Ensure your file is named 'best_model1_weights.h5' in your repository
 MODEL_PATH = 'best_model1_weights.h5'
-
 try:
-    # Loading the complete model to avoid architecture mismatch (9 vs 13 layers)
     model = keras.models.load_model(MODEL_PATH)
-    print("✅ Model loaded successfully!")
+    print("✅ Model loaded!")
 except Exception as e:
-    print(f"❌ Error loading model: {e}")
+    print(f"❌ Load error: {e}")
     model = None
 
-# --- 2. PREPROCESSING LOGIC ---
-# Standard Alphabetical labels for 7-class RAVDESS/TESS models
+# --- 2. THE CORRECT ALPHABETICAL ORDER ---
+# This is the order LabelEncoder uses by default
 EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
 
 def extract_features(data, sr):
-    """Extracts features in the exact order and mean-aggregation used in Kaggle."""
     zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
     rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
     mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
@@ -37,86 +32,57 @@ def extract_features(data, sr):
 
 def preprocess_audio(audio_path):
     try:
-        # Load exactly 2.5s with an offset (Kaggle notebook standard)
-        # res_type='kaiser_fast' ensures speed and consistency
         data, sr = librosa.load(audio_path, duration=2.5, offset=0.6, res_type='kaiser_fast')
-
-        # SILENCE CHECK: Prevents "99% Angry" on no sound
-        rms_val = np.sqrt(np.mean(data**2))
-        if rms_val < 0.005:
+
+        # Silence check
+        if np.sqrt(np.mean(data**2)) < 0.002:
             return "SILENT"
 
-        # Stack features 3 times to hit the required 2376 input dimension
-        # (This mimics the Original + Noise + Pitch shift stacking)
-        base_feat = extract_features(data, sr)
-        features = np.concatenate((base_feat, base_feat, base_feat))
+        # Stacking features to reach 2376
+        base = extract_features(data, sr)
+        features = np.concatenate((base, base, base))
 
-        # Pad or Truncate to exactly 2376
         if len(features) < 2376:
             features = np.pad(features, (0, 2376 - len(features)), 'constant')
         else:
            features = features[:2376]
 
-        # Standardization (Zero Mean, Unit Variance)
-        # This is the most important step to prevent "Angry" bias
+        # Standardize
        std = np.std(features)
        if std > 0:
            features = (features - np.mean(features)) / std
 
        return features.reshape(1, 2376, 1)
    except Exception as e:
-        print(f"Preprocessing Error: {e}")
        return None
 
-# --- 3. PREDICTION FUNCTION ---
 def predict_emotion(audio_filepath):
-    if model is None:
-        return "Model not found", "0%", {}
-    if audio_filepath is None:
-        return "Please record audio", "0%", {}
-
-    processed_data = preprocess_audio(audio_filepath)
-
-    if processed_data is "SILENT":
-        return "Silence Detected", "100%", {"neutral": 1.0}
+    if audio_filepath is None: return "No audio", "0%", {}
 
-    if processed_data is None:
-        return "Error in audio processing", "0%", {}
+    feat = preprocess_audio(audio_filepath)
+    if isinstance(feat, str) and feat == "SILENT": return "NEUTRAL (Silence)", "100%", {"neutral": 1.0}
+    if feat is None: return "Error", "0%", {}
 
-    # Perform inference
-    preds = model.predict(processed_data, verbose=0)[0]
-
-    # Identify top prediction
+    preds = model.predict(feat, verbose=0)[0]
     idx = np.argmax(preds)
-    confidence = preds[idx]
 
-    # Map all probabilities to emotions
+    # Map probabilities to the ALPHABETICAL list
    prob_dict = {EMOTIONS[i]: float(preds[i]) for i in range(len(EMOTIONS))}
 
-    return EMOTIONS[idx].upper(), f"{confidence*100:.2f}%", prob_dict
+    return EMOTIONS[idx].upper(), f"{preds[idx]*100:.2f}%", prob_dict
 
-# --- 4. GRADIO INTERFACE ---
+# --- 3. INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎙️ Speech Emotion Recognition")
-    gr.Markdown("Speak for at least 3 seconds. The model will analyze the last 2.5 seconds.")
-
+    gr.Markdown("## 🎙️ Speech Emotion Recognition (Fixed Labels)")
     with gr.Row():
+        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
         with gr.Column():
-            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
-            analyze_btn = gr.Button("Analyze Emotion", variant="primary")
-
-        with gr.Column():
-            with gr.Row():
-                emotion_out = gr.Textbox(label="Detected Emotion")
-                conf_out = gr.Textbox(label="Confidence Level")
-            label_chart = gr.Label(label="Confidence Distribution", num_top_classes=7)
+            emotion_out = gr.Textbox(label="Detected Emotion")
+            conf_out = gr.Textbox(label="Confidence")
+            label_chart = gr.Label(label="All Probabilities", num_top_classes=7)
 
-    analyze_btn.click(
-        fn=predict_emotion,
-        inputs=audio_input,
-        outputs=[emotion_out, conf_out, label_chart]
-    )
+    btn = gr.Button("Analyze", variant="primary")
+    btn.click(predict_emotion, inputs=audio_input, outputs=[emotion_out, conf_out, label_chart])
 
 if __name__ == "__main__":
-    # ssr_mode=False is CRITICAL to stop the _DictWrapper crash on HF
-    demo.launch(server_name="0.0.0.0", ssr_mode=False)
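
A note on the label ordering the new comments call out: scikit-learn's LabelEncoder sorts class names alphabetically, so a model trained on its encoded targets emits probabilities in exactly the EMOTIONS order above. A minimal check, assuming the training notebook did use LabelEncoder as the new comment states:

    from sklearn.preprocessing import LabelEncoder

    # Fit on labels in arbitrary order; classes_ always comes back sorted.
    enc = LabelEncoder().fit(['sad', 'happy', 'angry', 'fear', 'neutral', 'surprise', 'disgust'])
    print(list(enc.classes_))
    # ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']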
 
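The silence gate is a plain RMS threshold, with the cutoff lowered from 0.005 to 0.002 in this commit. A standalone sketch of how the gate behaves; the signal levels here are illustrative, not from the repository:

    import numpy as np

    def is_silence(data: np.ndarray, threshold: float = 0.002) -> bool:
        # Root-mean-square amplitude of the clip against a fixed cutoff.
        return float(np.sqrt(np.mean(data ** 2))) < threshold

    rng = np.random.default_rng(0)
    print(is_silence(np.zeros(40_000)))                      # True: digital silence
    print(is_silence(0.0005 * rng.standard_normal(40_000)))  # True: below the noise floor
    print(is_silence(0.05 * rng.standard_normal(40_000)))    # False: audible signal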
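
One subtlety in predict_emotion: preprocess_audio can return the string sentinel "SILENT", None, or an ndarray, so the sentinel test should be a type-aware equality check rather than `is` (identity against a string literal is implementation-dependent, and CPython 3.8+ warns about it); hence the isinstance guard in the diff above. The pattern in isolation, with a hypothetical helper name:

    import numpy as np

    def is_silent_sentinel(result) -> bool:
        # Check the type first so ndarray results never reach string equality.
        return isinstance(result, str) and result == "SILENT"

    print(is_silent_sentinel("SILENT"))                # True
    print(is_silent_sentinel(None))                    # False
    print(is_silent_sentinel(np.zeros((1, 2376, 1))))  # False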