# HuBERT Emotion Recognition — Gradio demo (Hugging Face Space)
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification

# Pretrained HuBERT checkpoint fine-tuned for emotion recognition
# (SUPERB ER task: neutral / happy / angry / sad).
model_id = "superb/hubert-base-superb-er"

# Load once at import time so every prediction reuses the same weights.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
model = HubertForSequenceClassification.from_pretrained(model_id)
def predict_emotion(audio):
    """Classify the primary emotion in an audio clip.

    Args:
        audio: Filesystem path to the uploaded/recorded clip (Gradio
            passes the path of a temporary file), or None when the user
            submitted nothing.

    Returns:
        dict mapping emotion display names ("Neutral", "Happy", "Angry",
        "Sad") to softmax probabilities, suitable for gr.Label; or a
        help string when no audio was supplied.
    """
    if audio is None:
        return "Please upload an audio file."
    # Load and resample to the 16 kHz rate the model expects.
    speech, sr = librosa.load(audio, sr=16000)
    # Preprocess into padded tensors for the model.
    inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Convert logits to class probabilities.
    probs = torch.nn.functional.softmax(logits, dim=-1)
    # Read the label order from the checkpoint config instead of
    # hard-coding it, so a different id-to-label ordering in the model
    # cannot silently mislabel the output. Short config labels
    # (neu/hap/ang/sad) are mapped to the display names used before.
    display = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}
    id2label = model.config.id2label
    return {
        display.get(id2label[i], id2label[i]): float(probs[0][i])
        for i in range(probs.shape[-1])
    }
# Assemble the web UI: one audio input feeding the classifier, rendered
# as a label widget showing per-class probabilities.
audio_input = gr.Audio(type="filepath", label="Upload Audio or Record")
emotion_output = gr.Label(label="Detected Emotion")

demo = gr.Interface(
    fn=predict_emotion,
    inputs=audio_input,
    outputs=emotion_output,
    title="HuBERT Emotion Recognition",
    description=(
        "Upload an audio clip to detect the primary emotion. This model "
        "(hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, "
        "and Sad classifications."
    ),
    examples=[],  # You can add paths to example .wav files here
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()