# app.py — RAVDESS speech-emotion-recognition Space (author: notuser77, commit 512e413)
import os
import warnings

import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier
# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
orig_download = huggingface_hub.hf_hub_download

def patched_download(*args, **kwargs):
    """Compatibility wrapper around ``hf_hub_download``.

    Renames the deprecated ``use_auth_token`` kwarg to ``token`` and, when
    the Hub has no ``custom.py`` for a model (SpeechBrain always requests
    one), supplies a local dummy file instead of failing.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Filename may arrive positionally (2nd arg) or as a keyword.
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
    try:
        return orig_download(*args, **kwargs)
    except Exception:
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        # Bare `raise` preserves the original traceback ("raise e" would
        # restart the chain from here).
        raise

huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")
# --- 2. LOAD MODELS ---
# Pickled scikit-learn SVM trained on 192-dim ECAPA embeddings
# (8 RAVDESS classes — see EMOTIONS below). Must ship alongside this file.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)
print("Loading SpeechBrain Feature Extractor...")
# ECAPA-TDNN speaker-embedding extractor; weights are downloaded from the
# Hub on first run and cached under pretrained_models/.
feature_extractor = EncoderClassifier.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
savedir="pretrained_models/spkrec-ecapa-voxceleb"
)
# Standard RAVDESS mapping
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict one of the 8 RAVDESS emotions from an audio file.

    Args:
        audio_path: Path to an audio file (any sample rate/channels), or
            None when Gradio submits an empty input.

    Returns:
        The predicted emotion label (str), or a short error message.
    """
    if audio_path is None:
        return "No audio provided"
    # 1. Load and resample to 16 kHz (the rate the ECAPA model expects).
    signal, fs = torchaudio.load(audio_path)
    # Downmix stereo/multi-channel to mono; a multi-channel tensor would
    # otherwise reach encode_batch with an extra dimension.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)
    # 2. Trim silence (VAD): dead air biases the model toward 'calm'.
    signal_np = signal.squeeze(0).numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    if trimmed_signal.size == 0:
        # Entirely-silent clip: nothing left to classify.
        return "No speech detected in audio"
    signal = torch.from_numpy(trimmed_signal)
    # 3. Extract the 192-dim speaker embedding; input shaped [batch, time].
    with torch.no_grad():
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
    # 4. L2 normalization (ECAPA embeddings are cosine-based), making the
    # vector volume-invariant.
    embeddings = F.normalize(embeddings, p=2, dim=2)
    # Reshape for the SVM: (1, 192).
    embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
    # 5. Predict — column names must match those used at SVM training time.
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    prediction = svm_model.predict(df_embeddings)[0]
    return prediction
# --- 4. GRADIO INTERFACE ---
demo = gr.Interface(
fn=predict_emotion,
# type="filepath" means predict_emotion receives a path string, not raw audio.
inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
outputs=gr.Textbox(label="Predicted Emotion"),
title="Speech Emotion Recognition",
description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)
# Launch only when run directly (HF Spaces executes this module as __main__).
if __name__ == "__main__":
demo.launch()