notuser77 committed on
Commit
a2fc1ad
·
verified ·
1 Parent(s): 97ba441

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -32
app.py CHANGED
@@ -1,77 +1,83 @@
1
- import gradio as gr
2
- import torch
3
- import torchaudio
4
  import joblib
5
  import pandas as pd
6
  import numpy as np
7
- import os
 
8
  import warnings
 
 
 
 
 
 
 
 
 
 
 
 
9
  from speechbrain.inference.speaker import EncoderClassifier
10
 
11
- # Ignore the scikit-learn version warning (1.5.2 vs 1.7.x)
12
- warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
13
 
14
- # 1. Load your SVM model
15
- # We try both names you provided to be safe
16
  MODEL_PATH = 'svm_model.joblib'
17
  if not os.path.exists(MODEL_PATH):
18
  MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
19
 
20
- print(f"Loading model from: {MODEL_PATH}")
21
  model = joblib.load(MODEL_PATH)
22
 
23
- # 2. Load the SpeechBrain ECAPA-TDNN feature extractor
24
- # NOTE: The pinned huggingface-hub==0.24.0 in requirements.txt fixes the TypeError
25
  feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
26
 
 
27
  def predict_emotion(audio_path):
28
  if audio_path is None:
29
  return "Please upload an audio file."
30
 
31
- # 3. Load and Preprocess Audio
32
  signal, fs = torchaudio.load(audio_path)
33
 
34
- # Resample to 16kHz (ECAPA requirement)
35
  if fs != 16000:
36
  resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
37
  signal = resampler(signal)
38
-
39
- # Convert to mono
40
  if signal.shape[0] > 1:
41
  signal = torch.mean(signal, dim=0, keepdim=True)
42
 
43
- # 4. Feature Extraction (192-D Embeddings)
44
  with torch.no_grad():
45
  embeddings = feature_extractor.encode_batch(signal)
46
  embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
47
 
48
- # 5. Prediction
49
- # Create DataFrame with exact feature names the SVM expects
50
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
51
  X = pd.DataFrame(embeddings, columns=feature_names)
52
 
 
53
  try:
54
- # Get probability scores for each class
55
  probs = model.predict_proba(X)[0]
56
- # model.classes_ contains the emotion names
57
  return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
58
- except AttributeError:
59
- # Fallback if probability=False was used during training
60
  prediction = model.predict(X)[0]
61
  return str(prediction)
62
 
63
- # 6. Gradio Interface
64
- description = (
65
- "Extracts ECAPA-TDNN embeddings via SpeechBrain and classifies them using an SVM. "
66
- "Best results with 3-5 second speech clips."
67
- )
68
-
69
  demo = gr.Interface(
70
  fn=predict_emotion,
71
- inputs=gr.Audio(type="filepath", label="Input Audio"),
72
- outputs=gr.Label(label="Emotion Confidence"),
73
- title="Speech Emotion Recognition",
74
- description=description
 
75
  )
76
 
77
  if __name__ == "__main__":
 
1
+ import os
 
 
2
  import joblib
3
  import pandas as pd
4
  import numpy as np
5
+ import torch
6
+ import torchaudio
7
  import warnings
8
+ import gradio as gr
9
+
10
# --- STEP 1: THE MONKEY PATCH (Fixes the TypeError) ---
# SpeechBrain still passes the legacy `use_auth_token` keyword, which newer
# huggingface_hub releases reject; rename it to `token` before delegating.
import huggingface_hub

# Keep a handle on the real downloader so the wrapper can delegate to it.
orig_download = huggingface_hub.hf_hub_download

_MISSING = object()  # sentinel: distinguishes "key absent" from an explicit None


def patched_download(*args, **kwargs):
    """Delegate to the original hf_hub_download, renaming the deprecated
    `use_auth_token` keyword argument to `token` when it is present."""
    legacy_token = kwargs.pop('use_auth_token', _MISSING)
    if legacy_token is not _MISSING:
        kwargs['token'] = legacy_token
    return orig_download(*args, **kwargs)


huggingface_hub.hf_hub_download = patched_download
18
+
19
+ # Import SpeechBrain after the patch
20
  from speechbrain.inference.speaker import EncoderClassifier
21
 
22
# Silence UserWarning noise (notably scikit-learn's version-mismatch warning
# raised when unpickling the SVM).
# NOTE(review): this filter is broader than sklearn alone — it hides every
# UserWarning from every module; confirm that is intended.
warnings.filterwarnings("ignore", category=UserWarning)

# --- STEP 2: LOAD MODELS ---
# Prefer the .joblib artifact; fall back to the .pkl filename if it is absent.
_PRIMARY_MODEL = 'svm_model.joblib'
_FALLBACK_MODEL = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
MODEL_PATH = _PRIMARY_MODEL if os.path.exists(_PRIMARY_MODEL) else _FALLBACK_MODEL

print(f"Loading classifier from: {MODEL_PATH}")
model = joblib.load(MODEL_PATH)

# ECAPA-TDNN speaker-embedding extractor from the SpeechBrain hub checkpoint.
print("Loading SpeechBrain feature extractor...")
feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
37
 
38
# --- STEP 3: PREDICTION LOGIC ---
def predict_emotion(audio_path):
    """Classify the emotion in a speech recording.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path supplied by the Gradio ``Audio`` component
        (``type="filepath"``); ``None`` when no file was uploaded.

    Returns
    -------
    dict or str
        Mapping of emotion label -> probability when the SVM exposes
        ``predict_proba``; otherwise the single predicted label as a string.
    """
    if audio_path is None:
        return "Please upload an audio file."

    # Load audio
    signal, fs = torchaudio.load(audio_path)

    # Preprocess: resample to the 16 kHz rate the ECAPA extractor expects
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Downmix multi-channel audio to mono
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    # Extract 192-D embeddings (no gradients needed at inference time)
    with torch.no_grad():
        embeddings = feature_extractor.encode_batch(signal)
        embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)

    # Rebuild the exact column names the SVM was trained with
    # (0_speechbrain_embedding, 1_speechbrain_embedding, etc.)
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    X = pd.DataFrame(embeddings, columns=feature_names)

    # Predict
    try:
        # Probabilities drive the Gradio Label confidence display
        probs = model.predict_proba(X)[0]
        return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
    except AttributeError:
        # An SVM trained with probability=False has no predict_proba; fall
        # back to the hard prediction. Narrowed from `except Exception` so
        # genuine errors (feature-name mismatch, bad shapes) surface instead
        # of being masked and then re-raised confusingly by model.predict.
        prediction = model.predict(X)[0]
        return str(prediction)
72
 
73
# --- STEP 4: GRADIO UI ---
# Wire the classifier into a simple upload-and-label interface.
_ui_config = dict(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Voice Clip"),
    outputs=gr.Label(label="Detected Emotion"),
    title="RAVDESS Emotion Classifier",
    description="This app uses ECAPA-TDNN embeddings from SpeechBrain and an SVM classifier to detect emotions in speech.",
    allow_flagging="never",
)
demo = gr.Interface(**_ui_config)
82
 
83
  if __name__ == "__main__":