import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification
# Load model and processor
# superb/hubert-base-superb-er: HuBERT base fine-tuned on the SUPERB
# emotion-recognition task (4 classes: neu / hap / ang / sad).
model_id = "superb/hubert-base-superb-er"
# Feature extractor handles padding/normalization of raw 16 kHz waveforms.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
# Loaded once at module import so every request reuses the same weights.
model = HubertForSequenceClassification.from_pretrained(model_id)
def predict_emotion(audio):
    """Classify the emotion expressed in an audio clip.

    Parameters
    ----------
    audio : str | None
        Filesystem path to the uploaded/recorded clip (Gradio supplies a
        temp-file path because the input uses ``type="filepath"``), or
        ``None`` when nothing was provided.

    Returns
    -------
    dict[str, float] | str
        Mapping of human-readable emotion label -> probability (consumed
        by the ``gr.Label`` output), or an instruction string when no
        audio was given.
    """
    if audio is None:
        return "Please upload an audio file."

    # The model was trained on 16 kHz speech; librosa resamples on load
    # (and downmixes to mono by default).
    speech, sr = librosa.load(audio, sr=16000)

    # Preprocess the raw waveform into model-ready tensors.
    inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to a probability distribution over emotion classes.
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Derive the index -> label mapping from the checkpoint's own config
    # instead of hard-coding the order; falls back to the raw id2label
    # value if the checkpoint exposes an unexpected label id.
    display_names = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}
    results = {}
    for idx in range(probs.shape[-1]):
        raw_label = model.config.id2label[idx]
        results[display_names.get(raw_label, raw_label)] = float(probs[0][idx])
    return results
# Assemble the web UI: one audel-io input feeding the classifier, rendered
# as a probability label.
audio_input = gr.Audio(type="filepath", label="Upload Audio or Record")
label_output = gr.Label(label="Detected Emotion")

demo = gr.Interface(
    fn=predict_emotion,
    inputs=audio_input,
    outputs=label_output,
    title="HuBERT Emotion Recognition",
    description=(
        "Upload an audio clip to detect the primary emotion. This model "
        "(hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, "
        "and Sad classifications."
    ),
    examples=[],  # You can add paths to example .wav files here
    theme="soft",
)
# Launch the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()