File size: 1,626 Bytes
650adba
e23e048
 
cf64064
650adba
cf64064
 
 
 
e23e048
cf64064
 
 
cee5ba4
cf64064
 
cee5ba4
e23e048
cf64064
 
e23e048
cf64064
e23e048
cf64064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e23e048
cf64064
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification

# Load model and processor
# NOTE: runs at import time — downloads the checkpoint from the Hugging Face
# Hub on first use and keeps both objects as module-level globals shared by
# predict_emotion below.
model_id = "superb/hubert-base-superb-er"
# Feature extractor handles raw-waveform normalization/padding for HuBERT.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
# Sequence-classification head fine-tuned for emotion recognition (SUPERB ER task).
model = HubertForSequenceClassification.from_pretrained(model_id)

def predict_emotion(audio):
    """Classify the emotion expressed in an audio clip.

    Args:
        audio: Filesystem path to the uploaded/recorded clip (Gradio passes a
            temp-file path because the input uses ``type="filepath"``), or
            ``None`` when no audio was provided.

    Returns:
        dict[str, float]: Display label -> probability, suitable for
        ``gr.Label``; or a plain message string when ``audio`` is ``None``.
    """
    if audio is None:
        return "Please upload an audio file."

    # Load and resample to the 16 kHz rate the HuBERT checkpoint expects.
    speech, _ = librosa.load(audio, sr=16000)

    # Preprocess raw waveform into model-ready tensors.
    inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to a probability distribution over emotion classes.
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    # Derive class names from the checkpoint's own config instead of a
    # hard-coded, order-dependent list, so a checkpoint with a different
    # label ordering cannot silently mislabel the output. Map the model's
    # short codes to friendlier display names where known.
    display = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}
    id2label = model.config.id2label
    return {
        display.get(id2label[i], id2label[i]): float(probs[i])
        for i in range(probs.shape[-1])
    }

# Define the Gradio Interface
# Wires predict_emotion to a simple web UI: one audio input (upload or
# microphone, passed to the handler as a temp-file path) and a label output
# that renders the returned {emotion: probability} dict as ranked bars.
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Audio or Record"),
    outputs=gr.Label(label="Detected Emotion"),
    title="HuBERT Emotion Recognition",
    description="Upload an audio clip to detect the primary emotion. This model (hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, and Sad classifications.",
    examples=[], # You can add paths to example .wav files here
    theme="soft"
)

# Launch the local web server only when run as a script (not on import,
# e.g. when hosted by Hugging Face Spaces which imports the module).
if __name__ == "__main__":
    demo.launch()