# Hugging Face Space "Space5" — app.py (commit cf64064 by Raemih)
# Gradio demo: HuBERT-based speech emotion recognition.
import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification
# Load model and processor
# Pretrained SUPERB emotion-recognition checkpoint from the HF Hub;
# downloaded (and cached) once at import time, before the UI starts.
model_id = "superb/hubert-base-superb-er"
# Converts raw waveforms into the padded/normalized tensors the model expects.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
# HuBERT backbone with a sequence-classification head producing emotion logits.
model = HubertForSequenceClassification.from_pretrained(model_id)
def predict_emotion(audio):
    """Classify the dominant emotion in an audio clip.

    Parameters
    ----------
    audio : str | None
        Path to the uploaded/recorded audio file (Gradio passes a temp-file
        path with ``type="filepath"``), or ``None`` if nothing was provided.

    Returns
    -------
    dict[str, float] | str
        Mapping of emotion label -> probability (consumed by ``gr.Label``),
        or a human-readable error message when no audio was supplied.
    """
    if audio is None:
        return "Please upload an audio file."

    # Load and resample to the 16 kHz rate the model was trained on.
    speech, _sr = librosa.load(audio, sr=16000)

    # Preprocess raw waveform into model-ready tensors.
    inputs = feature_extractor(
        speech, sampling_rate=16000, return_tensors="pt", padding=True
    )

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to a probability distribution over emotions.
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Read label order from the checkpoint config instead of hard-coding it,
    # so a label-order change in the model cannot silently mislabel results.
    # (superb/hubert-base-superb-er: 0 neu, 1 hap, 2 ang, 3 sad.)
    pretty = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}
    return {
        pretty.get(model.config.id2label[i], model.config.id2label[i]): float(probs[0][i])
        for i in range(probs.shape[-1])
    }
# Gradio UI wiring: audio input -> predict_emotion -> probability label.
_audio_input = gr.Audio(type="filepath", label="Upload Audio or Record")
_emotion_output = gr.Label(label="Detected Emotion")

demo = gr.Interface(
    fn=predict_emotion,
    inputs=_audio_input,
    outputs=_emotion_output,
    title="HuBERT Emotion Recognition",
    description="Upload an audio clip to detect the primary emotion. This model (hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, and Sad classifications.",
    examples=[],  # paths to example .wav files can be listed here
    theme="soft",
)
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()