notuser77 committed on
Commit
2b4f79e
·
verified ·
1 Parent(s): d83c4e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -24
app.py CHANGED
@@ -7,76 +7,91 @@ import torchaudio
7
  import warnings
8
  import gradio as gr
9
 
10
- # --- STEP 1: THE MONKEY PATCH (Fixes the TypeError) ---
 
11
  import huggingface_hub
12
  orig_download = huggingface_hub.hf_hub_download
 
13
  def patched_download(*args, **kwargs):
 
14
  if 'use_auth_token' in kwargs:
15
  kwargs['token'] = kwargs.pop('use_auth_token')
16
- return orig_download(*args, **kwargs)
 
 
 
 
 
 
 
 
 
 
17
  huggingface_hub.hf_hub_download = patched_download
18
 
19
- # Import SpeechBrain after the patch
20
  from speechbrain.inference.speaker import EncoderClassifier
21
 
22
  # Suppress scikit-learn version warnings
23
- warnings.filterwarnings("ignore", category=UserWarning)
24
 
25
  # --- STEP 2: LOAD MODELS ---
26
- # We check both filenames you uploaded
27
- MODEL_PATH = 'svm_model.joblib'
28
  if not os.path.exists(MODEL_PATH):
29
- MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
30
 
31
- print(f"Loading classifier from: {MODEL_PATH}")
32
  model = joblib.load(MODEL_PATH)
33
 
34
- # Load the ECAPA-TDNN feature extractor
35
- print("Loading SpeechBrain feature extractor...")
36
- feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
 
 
 
37
 
38
  # --- STEP 3: PREDICTION LOGIC ---
39
  def predict_emotion(audio_path):
40
  if audio_path is None:
41
  return "Please upload an audio file."
42
 
43
- # Load audio
44
  signal, fs = torchaudio.load(audio_path)
45
-
46
- # Preprocess (16kHz mono)
47
  if fs != 16000:
48
  resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
49
  signal = resampler(signal)
 
 
50
  if signal.shape[0] > 1:
51
  signal = torch.mean(signal, dim=0, keepdim=True)
52
 
53
- # Extract 192-D Embeddings
54
  with torch.no_grad():
55
  embeddings = feature_extractor.encode_batch(signal)
56
  embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
57
 
58
- # Create DataFrame with exact column names the SVM expects
59
- # (0_speechbrain_embedding, 1_speechbrain_embedding, etc.)
60
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
61
  X = pd.DataFrame(embeddings, columns=feature_names)
62
 
63
- # Predict
64
  try:
65
- # Get probabilities for the Label output
66
  probs = model.predict_proba(X)[0]
67
- return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
68
  except Exception:
69
- # Fallback to direct prediction
70
  prediction = model.predict(X)[0]
71
  return str(prediction)
72
 
73
- # --- STEP 4: GRADIO UI ---
74
  demo = gr.Interface(
75
  fn=predict_emotion,
76
- inputs=gr.Audio(type="filepath", label="Upload Voice Clip"),
77
  outputs=gr.Label(label="Detected Emotion"),
78
  title="RAVDESS Emotion Classifier",
79
- description="This app uses ECAPA-TDNN embeddings from SpeechBrain and an SVM classifier to detect emotions in speech.",
80
  allow_flagging="never"
81
  )
82
 
 
7
  import warnings
8
  import gradio as gr
9
 
10
# --- STEP 1: THE ROBUST MONKEY PATCH ---
# SpeechBrain still calls hf_hub_download(use_auth_token=...), which newer
# huggingface_hub releases reject, and it also probes for an optional
# 'custom.py' that this model repo does not ship. Patch the downloader
# BEFORE SpeechBrain is imported so both problems are absorbed.
import huggingface_hub

orig_download = huggingface_hub.hf_hub_download


def patched_download(*args, **kwargs):
    """Wrapper around hf_hub_download.

    Renames the legacy 'use_auth_token' kwarg to 'token', and returns
    None (instead of crashing) when the optional 'custom.py' file is
    reported missing by the Hub. All other errors propagate unchanged.
    """
    # 1. Fix the 'use_auth_token' renaming issue
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')

    try:
        return orig_download(*args, **kwargs)
    except Exception as e:
        # 2. Fix the 'custom.py' 404 crash.
        # hf_hub_download(repo_id, filename, ...) takes the filename as the
        # second positional argument, so check kwargs first, then args[1].
        fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
        if fname == "custom.py" and ("404" in str(e) or "Not Found" in str(e)):
            return None
        # Bare re-raise preserves the original traceback (idiomatic;
        # `raise e` would needlessly re-anchor it here).
        raise


huggingface_hub.hf_hub_download = patched_download
31
 
32
# Import SpeechBrain AFTER the patch so it picks up the wrapped downloader
from speechbrain.inference.speaker import EncoderClassifier

# Suppress all warnings (e.g. scikit-learn version-mismatch UserWarnings)
warnings.filterwarnings("ignore")

# --- STEP 2: LOAD MODELS ---
# Prefer the full CV 8-class pickle; fall back to the generic joblib dump.
MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
if not os.path.exists(MODEL_PATH):
    MODEL_PATH = 'svm_model.joblib'
if not os.path.exists(MODEL_PATH):
    # Fail fast with a clear message instead of an opaque joblib error.
    raise FileNotFoundError(
        "No SVM model file found: expected "
        "'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl' "
        "or 'svm_model.joblib' in the working directory."
    )

print(f"Loading SVM classifier: {MODEL_PATH}")
model = joblib.load(MODEL_PATH)

print("Loading SpeechBrain ECAPA feature extractor...")
# The monkey patch above lets this succeed despite the repo's missing custom.py.
feature_extractor = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/ecapa"
)
53
 
54
# --- STEP 3: PREDICTION LOGIC ---
def predict_emotion(audio_path):
    """Classify the emotion expressed in an uploaded audio clip.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path to the clip (Gradio passes None when nothing
        was uploaded).

    Returns
    -------
    dict or str
        {emotion: confidence} when the SVM exposes predict_proba,
        otherwise the single predicted label as a string.
    """
    if audio_path is None:
        return "Please upload an audio file."

    # Load audio and resample to 16kHz (ECAPA requirement)
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Mono conversion: average multi-channel audio down to one channel
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    # Extract 192-D ECAPA-TDNN embeddings (inference only, no gradients)
    with torch.no_grad():
        embeddings = feature_extractor.encode_batch(signal)
        embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)

    # The SVM was trained on a DataFrame with these exact column names
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    X = pd.DataFrame(embeddings, columns=feature_names)

    try:
        # A {emotion: confidence} dict drives the gr.Label bar display
        probs = model.predict_proba(X)[0]
        return {str(cls): float(p) for cls, p in zip(model.classes_, probs)}
    except Exception:
        # Fallback if probability estimates were not enabled during training
        return str(model.predict(X)[0])
87
 
88
# --- STEP 4: GRADIO INTERFACE ---
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
    outputs=gr.Label(label="Detected Emotion"),
    title="RAVDESS Emotion Classifier",
    description="Classifies emotions using ECAPA-TDNN speaker embeddings and a Support Vector Machine.",
    allow_flagging="never"
)

if __name__ == "__main__":
    # Explicit launch so the app also runs locally via `python app.py`
    # (Hugging Face Spaces imports `demo` and serves it either way).
    demo.launch()
97