notuser77 committed on
Commit
5d55225
·
verified ·
1 Parent(s): 3c7ab19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -46
app.py CHANGED
@@ -3,85 +3,78 @@ import joblib
3
  import pandas as pd
4
  import numpy as np
5
  import torch
6
- import torchaudio
7
- import warnings # <--- This fixes the NameError
8
  import gradio as gr
9
  import huggingface_hub
10
  from speechbrain.inference.classifiers import EncoderClassifier
11
 
12
- # 1. ROBUST MONKEY PATCH
13
- # This fixes the 'use_auth_token' vs 'token' error and the 'NoneType' crash
14
  orig_download = huggingface_hub.hf_hub_download
15
-
16
  def patched_download(*args, **kwargs):
17
- if 'use_auth_token' in kwargs:
18
- kwargs['token'] = kwargs.pop('use_auth_token')
19
-
20
  fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
21
-
22
- try:
23
- return orig_download(*args, **kwargs)
24
  except Exception as e:
25
- # If SpeechBrain looks for 'custom.py' and it's missing (404),
26
- # return a dummy file path instead of None to prevent a crash.
27
- if fname == "custom.py" and ("404" in str(e) or "Not Found" in str(e)):
28
  dummy_path = os.path.abspath("dummy_custom.py")
29
  if not os.path.exists(dummy_path):
30
- with open(dummy_path, "w") as f:
31
- f.write("# Dummy file for compatibility\n")
32
  return dummy_path
33
  raise e
34
-
35
  huggingface_hub.hf_hub_download = patched_download
36
  warnings.filterwarnings("ignore")
37
 
38
- # 2. LOAD MODELS
39
- # Load your SVM Classifier (trying both possible filenames)
40
- MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
41
- if not os.path.exists(MODEL_PATH):
42
- MODEL_PATH = 'svm_model.joblib'
43
-
44
- print(f"Loading SVM classifier: {MODEL_PATH}")
45
- svm_model = joblib.load(MODEL_PATH)
46
 
47
- # Load SpeechBrain Feature Extractor
48
- print("Loading SpeechBrain ECAPA feature extractor...")
49
  feature_extractor = EncoderClassifier.from_hparams(
50
  source="speechbrain/spkrec-ecapa-voxceleb",
51
  savedir="pretrained_models/spkrec-ecapa-voxceleb"
52
  )
53
 
54
- # 3. DEFINE INFERENCE
 
55
  EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
56
 
57
  def predict_emotion(audio_path):
58
- if audio_path is None:
59
- return "Please upload an audio file."
60
 
61
- # Load and Preprocess Audio
62
- signal, fs = torchaudio.load(audio_path)
 
63
 
64
- # Extract ECAPA-TDNN Embeddings
65
  with torch.no_grad():
66
- embeddings = feature_extractor.encode_batch(signal)
67
- # Convert to numpy and flatten (SVM expects 1D array of 192 features)
68
  embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
69
 
70
- # Predict with SVM
71
- prediction = svm_model.predict(embeddings)[0]
 
 
72
 
73
- # Return mapped label if numeric, otherwise return string
74
- if isinstance(prediction, (int, np.integer)):
75
- return EMOTIONS[prediction]
76
- return prediction
 
 
 
 
 
 
77
 
78
- # 4. GRADIO INTERFACE
79
  demo = gr.Interface(
80
  fn=predict_emotion,
81
- inputs=gr.Audio(type="filepath", label="Upload Speech (WAV)"),
82
- outputs=gr.Label(label="Detected Emotion"),
83
- title="Speech Emotion Recognition (RAVDESS)",
84
- description="This app uses SpeechBrain ECAPA-TDNN embeddings and a pre-trained SVM to classify emotions."
85
  )
86
 
87
  if __name__ == "__main__":
 
3
  import pandas as pd
4
  import numpy as np
5
  import torch
6
+ import warnings
 
7
  import gradio as gr
8
  import huggingface_hub
9
  from speechbrain.inference.classifiers import EncoderClassifier
10
 
11
+ # --- 1. PRE-LOAD SETUP (Monkey Patch as before) ---
 
12
  orig_download = huggingface_hub.hf_hub_download
 
13
  def patched_download(*args, **kwargs):
14
+ if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
 
 
15
  fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
16
+ try: return orig_download(*args, **kwargs)
 
 
17
  except Exception as e:
18
+ if fname == "custom.py":
 
 
19
  dummy_path = os.path.abspath("dummy_custom.py")
20
  if not os.path.exists(dummy_path):
21
+ with open(dummy_path, "w") as f: f.write("# Dummy\n")
 
22
  return dummy_path
23
  raise e
 
24
  huggingface_hub.hf_hub_download = patched_download
25
  warnings.filterwarnings("ignore")
26
 
27
+ # --- 2. LOAD MODELS ---
28
+ # Using your specific SVM file
29
+ SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
30
+ print(f"Loading SVM: {SVM_PATH}")
31
+ svm_model = joblib.load(SVM_PATH)
 
 
 
32
 
33
+ print("Loading SpeechBrain Feature Extractor...")
 
34
  feature_extractor = EncoderClassifier.from_hparams(
35
  source="speechbrain/spkrec-ecapa-voxceleb",
36
  savedir="pretrained_models/spkrec-ecapa-voxceleb"
37
  )
38
 
39
+ # --- 3. DEFINE INFERENCE ---
40
+ # RAVDESS Standard Mapping (1-indexed in many datasets)
41
  EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
42
 
43
  def predict_emotion(audio_path):
44
+ if audio_path is None: return "Please upload audio."
 
45
 
46
+ # CRITICAL: Use SpeechBrain's loader.
47
+ # It automatically handles resampling to 16kHz and mono conversion.
48
+ signal = feature_extractor.load_audio(audio_path)
49
 
50
+ # Extract Embeddings
51
  with torch.no_grad():
52
+ # unsqueeze(0) adds the batch dimension [1, time]
53
+ embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
54
  embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
55
 
56
+ # MATCH FEATURE NAMES: Your SVM was trained with named features
57
+ # '0_speechbrain_embedding' through '191_speechbrain_embedding'
58
+ feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
59
+ df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
60
 
61
+ # Predict
62
+ if hasattr(svm_model, "predict_proba"):
63
+ probas = svm_model.predict_proba(df_embeddings)[0]
64
+ # Map probabilities to emotion names for Gradio Label
65
+ return {EMOTIONS[i]: float(probas[i]) for i in range(len(EMOTIONS))}
66
+ else:
67
+ pred_idx = int(svm_model.predict(df_embeddings)[0])
68
+ # If your SVM uses 1-8 labels, subtract 1; if 0-7, keep as is.
69
+ # Most RAVDESS SVMs use 0-7 for programming ease.
70
+ return EMOTIONS[pred_idx]
71
 
72
+ # --- 4. INTERFACE ---
73
  demo = gr.Interface(
74
  fn=predict_emotion,
75
+ inputs=gr.Audio(type="filepath", label="Speech Input"),
76
+ outputs=gr.Label(num_top_classes=3),
77
+ title="Speech Emotion Classifier (Fixed Resampling)"
 
78
  )
79
 
80
  if __name__ == "__main__":