notuser77 committed on
Commit
8944133
·
verified ·
1 Parent(s): 4a71fb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -16
app.py CHANGED
@@ -5,10 +5,14 @@ import joblib
5
  import pandas as pd
6
  import numpy as np
7
  import os
 
8
  from speechbrain.inference.speaker import EncoderClassifier
9
 
10
- # 1. Load the SVM model
11
- # We check for both possible filenames you uploaded
 
 
 
12
  MODEL_PATH = 'svm_model.joblib'
13
  if not os.path.exists(MODEL_PATH):
14
  MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
@@ -17,53 +21,57 @@ print(f"Loading model from: {MODEL_PATH}")
17
  model = joblib.load(MODEL_PATH)
18
 
19
  # 2. Load the SpeechBrain ECAPA-TDNN feature extractor
20
- # This downloads the pre-trained weights from Hugging Face
21
  feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
22
 
23
  def predict_emotion(audio_path):
24
  if audio_path is None:
25
  return "Please upload an audio file."
26
 
27
- # 3. Load Audio
28
  signal, fs = torchaudio.load(audio_path)
29
 
30
- # 4. Preprocess: Match the 16kHz requirement of ECAPA-TDNN
31
  if fs != 16000:
32
  resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
33
  signal = resampler(signal)
34
 
35
- # Convert stereo to mono
36
  if signal.shape[0] > 1:
37
  signal = torch.mean(signal, dim=0, keepdim=True)
38
 
39
- # 5. Extract 192-D Embeddings
40
  with torch.no_grad():
41
  embeddings = feature_extractor.encode_batch(signal)
42
- # Squeeze and convert to numpy
43
  embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
44
 
45
- # 6. Create Dataframe with exact feature names model expects
 
46
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
47
  X = pd.DataFrame(embeddings, columns=feature_names)
48
 
49
- # 7. Predict
50
  try:
51
  # Get probability scores for each class
52
  probs = model.predict_proba(X)[0]
53
- # Return a dictionary of {Emotion: Probability} for Gradio's Label component
54
  return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
55
  except AttributeError:
56
- # If the model wasn't trained with probability=True, just return the top label
57
  prediction = model.predict(X)[0]
58
  return str(prediction)
59
 
60
- # 8. Build Interface
 
 
 
 
 
61
  demo = gr.Interface(
62
  fn=predict_emotion,
63
- inputs=gr.Audio(type="filepath", label="Upload or Record Speech"),
64
  outputs=gr.Label(label="Emotion Confidence"),
65
- title="RAVDESS Speech Emotion Classifier",
66
- description="This app uses ECAPA-TDNN embeddings and a Support Vector Machine to classify emotions in speech."
67
  )
68
 
69
  if __name__ == "__main__":
 
5
  import pandas as pd
6
  import numpy as np
7
  import os
8
+ import warnings
9
  from speechbrain.inference.speaker import EncoderClassifier
10
 
11
+ # Ignore the scikit-learn version warning (1.5.2 vs 1.7.x)
12
+ warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
13
+
14
+ # 1. Load your SVM model
15
+ # We try both names you provided to be safe
16
  MODEL_PATH = 'svm_model.joblib'
17
  if not os.path.exists(MODEL_PATH):
18
  MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 
21
  model = joblib.load(MODEL_PATH)
22
 
23
  # 2. Load the SpeechBrain ECAPA-TDNN feature extractor
24
+ # NOTE: The pinned huggingface-hub==0.24.0 in requirements.txt fixes the TypeError
25
  feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
26
 
27
  def predict_emotion(audio_path):
28
  if audio_path is None:
29
  return "Please upload an audio file."
30
 
31
+ # 3. Load and Preprocess Audio
32
  signal, fs = torchaudio.load(audio_path)
33
 
34
+ # Resample to 16kHz (ECAPA requirement)
35
  if fs != 16000:
36
  resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
37
  signal = resampler(signal)
38
 
39
+ # Convert to mono
40
  if signal.shape[0] > 1:
41
  signal = torch.mean(signal, dim=0, keepdim=True)
42
 
43
+ # 4. Feature Extraction (192-D Embeddings)
44
  with torch.no_grad():
45
  embeddings = feature_extractor.encode_batch(signal)
 
46
  embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
47
 
48
+ # 5. Prediction
49
+ # Create DataFrame with exact feature names the SVM expects
50
  feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
51
  X = pd.DataFrame(embeddings, columns=feature_names)
52
 
 
53
  try:
54
  # Get probability scores for each class
55
  probs = model.predict_proba(X)[0]
56
+ # model.classes_ contains the emotion names
57
  return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
58
  except AttributeError:
59
+ # Fallback if probability=False was used during training
60
  prediction = model.predict(X)[0]
61
  return str(prediction)
62
 
63
+ # 6. Gradio Interface
64
+ description = (
65
+ "Extracts ECAPA-TDNN embeddings via SpeechBrain and classifies them using an SVM. "
66
+ "Best results with 3-5 second speech clips."
67
+ )
68
+
69
  demo = gr.Interface(
70
  fn=predict_emotion,
71
+ inputs=gr.Audio(type="filepath", label="Input Audio"),
72
  outputs=gr.Label(label="Emotion Confidence"),
73
+ title="Speech Emotion Recognition",
74
+ description=description
75
  )
76
 
77
  if __name__ == "__main__":