Spaces:

notuser77
/

ravdess

Sleeping

App Files Files Community

notuser77 committed on Dec 20, 2025

Commit

2b4f79e

verified ·

1 Parent(s): d83c4e4

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -24

app.py CHANGED Viewed

@@ -7,76 +7,91 @@ import torchaudio
 import warnings
 import gradio as gr
-# --- STEP 1: THE MONKEY PATCH (Fixes the TypeError) ---
 import huggingface_hub
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
     if 'use_auth_token' in kwargs:
         kwargs['token'] = kwargs.pop('use_auth_token')
-    return orig_download(*args, **kwargs)
 huggingface_hub.hf_hub_download = patched_download
-# Import SpeechBrain after the patch
 from speechbrain.inference.speaker import EncoderClassifier
 # Suppress scikit-learn version warnings
-warnings.filterwarnings("ignore", category=UserWarning)
 # --- STEP 2: LOAD MODELS ---
-# We check both filenames you uploaded
-MODEL_PATH = 'svm_model.joblib'
 if not os.path.exists(MODEL_PATH):
-    MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
-print(f"Loading classifier from: {MODEL_PATH}")
 model = joblib.load(MODEL_PATH)
-# Load the ECAPA-TDNN feature extractor
-print("Loading SpeechBrain feature extractor...")
-feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
 # --- STEP 3: PREDICTION LOGIC ---
 def predict_emotion(audio_path):
     if audio_path is None:
         return "Please upload an audio file."
-    # Load audio
     signal, fs = torchaudio.load(audio_path)
-    # Preprocess (16kHz mono)
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
-    # Extract 192-D Embeddings
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
-    # Create DataFrame with exact column names the SVM expects
-    # (0_speechbrain_embedding, 1_speechbrain_embedding, etc.)
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
-    # Predict
     try:
-        # Get probabilities for the Label output
         probs = model.predict_proba(X)[0]
-        return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
     except Exception:
-        # Fallback to direct prediction
         prediction = model.predict(X)[0]
         return str(prediction)
-# --- STEP 4: GRADIO UI ---
 demo = gr.Interface(
     fn=predict_emotion,
-    inputs=gr.Audio(type="filepath", label="Upload Voice Clip"),
     outputs=gr.Label(label="Detected Emotion"),
     title="RAVDESS Emotion Classifier",
-    description="This app uses ECAPA-TDNN embeddings from SpeechBrain and an SVM classifier to detect emotions in speech.",
     allow_flagging="never"
 )

 import warnings
 import gradio as gr
+# --- STEP 1: THE ROBUST MONKEY PATCH ---
+# This fixes both the 'use_auth_token' error and the 'custom.py' 404 crash
 import huggingface_hub
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
     """Wrap ``huggingface_hub.hf_hub_download`` with two compatibility shims.

     * A legacy ``use_auth_token`` keyword is forwarded as ``token``.
     * When the request for the optional ``custom.py`` file fails with a
       404 / "Not Found" error, ``None`` is returned instead of raising.

     Every other error raised by the original function propagates unchanged.
     """
     # 1. Fix the 'use_auth_token' renaming issue
     if 'use_auth_token' in kwargs:
         kwargs['token'] = kwargs.pop('use_auth_token')
     try:
         return orig_download(*args, **kwargs)
     except Exception as e:
         # 2. Fix the 'custom.py' 404 crash
         # The filename may arrive by keyword or as the second positional argument.
         fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
         # Prefer the HTTP status code when the exception carries a response;
         # keep the message check for errors that do not.
         status = getattr(getattr(e, 'response', None), 'status_code', None)
         not_found = status == 404 or "404" in str(e) or "Not Found" in str(e)
         if fname == "custom.py" and not_found:
             return None
         # A bare raise re-raises the active exception with its traceback intact.
         raise
 huggingface_hub.hf_hub_download = patched_download
+# Import SpeechBrain AFTER the patch
 from speechbrain.inference.speaker import EncoderClassifier
 # Suppress scikit-learn version warnings
+warnings.filterwarnings("ignore")
 # --- STEP 2: LOAD MODELS ---
+# We check for the specific filenames you uploaded
+MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 if not os.path.exists(MODEL_PATH):
+    MODEL_PATH = 'svm_model.joblib'
+print(f"Loading SVM classifier: {MODEL_PATH}")
 model = joblib.load(MODEL_PATH)
+print("Loading SpeechBrain ECAPA feature extractor...")
+# This will now successfully skip the missing custom.py
+feature_extractor = EncoderClassifier.from_hparams(
+    source="speechbrain/spkrec-ecapa-voxceleb",
+    savedir="pretrained_models/ecapa"
+)
 # --- STEP 3: PREDICTION LOGIC ---
 def predict_emotion(audio_path):
     """Classify the emotion expressed in an audio file.

     Args:
         audio_path: Path of the audio file to analyse, or ``None`` when
             nothing was uploaded.

     Returns:
         A ``{label: probability}`` dict when ``model.predict_proba`` succeeds;
         otherwise the predicted label as a string. A short message string is
         returned when there is no usable audio.
     """
     if audio_path is None:
         return "Please upload an audio file."
     signal, fs = torchaudio.load(audio_path)
     # A file with zero samples cannot be embedded; report it instead of
     # failing inside the feature extractor.
     if signal.numel() == 0:
         return "The uploaded audio file is empty."
     # Resample to 16kHz (ECAPA requirement)
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
     # Mono conversion: average the channels, keeping the channel dimension
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
     # Extract the embedding and flatten it to a single row
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
     # Create DataFrame with the 192 feature names expected by the SVM
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
     try:
         probs = model.predict_proba(X)[0]
     except Exception:
         # Fallback if probability was not enabled during training
         return str(model.predict(X)[0])
     # Labels are stringified so the result is a plain {str: float} mapping
     return {str(label): float(prob) for label, prob in zip(model.classes_, probs)}
+# --- STEP 4: GRADIO INTERFACE ---
 # Single-function Gradio UI: the uploaded clip is handed to predict_emotion
 # as a file path and the result is rendered in a Label component.
 demo = gr.Interface(
     fn=predict_emotion,
+    inputs=gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
     outputs=gr.Label(label="Detected Emotion"),
     title="RAVDESS Emotion Classifier",
+    description="Classifies emotions using ECAPA-TDNN speaker embeddings and a Support Vector Machine.",
     allow_flagging="never"
 )