notuser77 committed on
Commit
f32b656
·
verified ·
1 Parent(s): 5d55225

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -25
app.py CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
8
  import huggingface_hub
9
  from speechbrain.inference.classifiers import EncoderClassifier
10
 
11
- # --- 1. PRE-LOAD SETUP (Monkey Patch as before) ---
12
  orig_download = huggingface_hub.hf_hub_download
13
  def patched_download(*args, **kwargs):
14
  if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
@@ -25,7 +25,6 @@ huggingface_hub.hf_hub_download = patched_download
25
  warnings.filterwarnings("ignore")
26
 
27
  # --- 2. LOAD MODELS ---
28
- # Using your specific SVM file
29
  SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
30
  print(f"Loading SVM: {SVM_PATH}")
31
  svm_model = joblib.load(SVM_PATH)
@@ -36,45 +35,54 @@ feature_extractor = EncoderClassifier.from_hparams(
36
  savedir="pretrained_models/spkrec-ecapa-voxceleb"
37
  )
38
 
39
- # --- 3. DEFINE INFERENCE ---
40
- # RAVDESS Standard Mapping (1-indexed in many datasets)
41
  EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
42
 
 
43
  def predict_emotion(audio_path):
44
- if audio_path is None: return "Please upload audio."
45
 
46
- # CRITICAL: Use SpeechBrain's loader.
47
- # It automatically handles resampling to 16kHz and mono conversion.
48
  signal = feature_extractor.load_audio(audio_path)
49
 
50
- # Extract Embeddings
 
 
 
 
 
51
  with torch.no_grad():
52
- # unsqueeze(0) adds the batch dimension [1, time]
53
  embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
54
  embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
55
 
56
- # MATCH FEATURE NAMES: Your SVM was trained with named features
57
- # '0_speechbrain_embedding' through '191_speechbrain_embedding'
58
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
59
  df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
60
 
61
- # Predict
62
- if hasattr(svm_model, "predict_proba"):
63
- probas = svm_model.predict_proba(df_embeddings)[0]
64
- # Map probabilities to emotion names for Gradio Label
65
- return {EMOTIONS[i]: float(probas[i]) for i in range(len(EMOTIONS))}
66
- else:
67
- pred_idx = int(svm_model.predict(df_embeddings)[0])
68
- # If your SVM uses 1-8 labels, subtract 1; if 0-7, keep as is.
69
- # Most RAVDESS SVMs use 0-7 for programming ease.
70
- return EMOTIONS[pred_idx]
 
 
 
 
 
71
 
72
- # --- 4. INTERFACE ---
73
  demo = gr.Interface(
74
  fn=predict_emotion,
75
- inputs=gr.Audio(type="filepath", label="Speech Input"),
76
- outputs=gr.Label(num_top_classes=3),
77
- title="Speech Emotion Classifier (Fixed Resampling)"
 
78
  )
79
 
80
  if __name__ == "__main__":
 
8
  import huggingface_hub
9
  from speechbrain.inference.classifiers import EncoderClassifier
10
 
11
+ # --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
12
  orig_download = huggingface_hub.hf_hub_download
13
  def patched_download(*args, **kwargs):
14
  if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
 
25
  warnings.filterwarnings("ignore")
26
 
27
  # --- 2. LOAD MODELS ---
 
28
  SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
29
  print(f"Loading SVM: {SVM_PATH}")
30
  svm_model = joblib.load(SVM_PATH)
 
35
  savedir="pretrained_models/spkrec-ecapa-voxceleb"
36
  )
37
 
38
+ # Standard RAVDESS mapping
 
39
  EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
40
 
41
+ # --- 3. INFERENCE LOGIC ---
42
  def predict_emotion(audio_path):
43
+ if audio_path is None: return "Please upload an audio file."
44
 
45
+ # A. LOAD & PREPROCESS (Fixes the Bias)
 
46
  signal = feature_extractor.load_audio(audio_path)
47
 
48
+ # 1. Normalize Volume (Crucial for SVM stability)
49
+ # This prevents 'out-of-bounds' values that cause the Disgust/Surprised bias
50
+ if signal.abs().max() > 0:
51
+ signal = signal / signal.abs().max()
52
+
53
+ # 2. Extract Embeddings
54
  with torch.no_grad():
 
55
  embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
56
  embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
57
 
58
+ # B. PREPARE DATASET FORMAT
59
+ # Ensure column names match what the SVM was trained on
60
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
61
  df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
62
 
63
+ # C. PREDICT & HANDLE OUTPUT (Fixes the ValueError)
64
+ prediction = svm_model.predict(df_embeddings)[0]
65
+
66
+ # If the model returns a string ('calm'), return it directly
67
+ if isinstance(prediction, str):
68
+ return prediction.capitalize()
69
+
70
+ # If it returns a number, map it to the EMOTIONS list
71
+ try:
72
+ idx = int(prediction)
73
+ # Handle 1-based indexing (1-8) or 0-based (0-7)
74
+ if 1 <= idx <= 8: return EMOTIONS[idx-1].capitalize()
75
+ return EMOTIONS[idx].capitalize()
76
+ except:
77
+ return str(prediction)
78
 
79
# --- 4. GRADIO INTERFACE ---
# Wire the classifier into a single-input / single-output Gradio demo.
audio_input = gr.Audio(type="filepath", label="Record or Upload Audio")
emotion_output = gr.Textbox(label="Predicted Emotion")
demo = gr.Interface(
    fn=predict_emotion,
    inputs=audio_input,
    outputs=emotion_output,
    title="Speech Emotion Recognition",
    description=(
        "Optimized for RAVDESS SVM. If accuracy is low, try to speak "
        "closer to the mic and minimize background noise."
    ),
)
87
 
88
  if __name__ == "__main__":