"""Gradio app for speech emotion recognition using a fine-tuned HuBERT model.

Loads superb/hubert-base-superb-er once at import time and exposes a simple
upload/record UI that classifies audio into Neutral / Happy / Angry / Sad.
"""

import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification

# Load model and processor once at startup (downloaded/cached by transformers).
MODEL_ID = "superb/hubert-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
model = HubertForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()  # inference only; make the mode explicit

# Map the checkpoint's short label codes to friendly display names.
# Derive ordering from model.config.id2label instead of hard-coding indices,
# so the app stays correct even if the checkpoint's label order changes.
_DISPLAY_NAMES = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}


def predict_emotion(audio):
    """Classify the emotion of an uploaded/recorded audio clip.

    Args:
        audio: Filesystem path to the audio file (Gradio passes a temp-file
            path because the input component uses type="filepath"), or None
            if nothing was provided.

    Returns:
        A dict mapping display label -> probability (for gr.Label), or a
        string error message when no audio was supplied.
    """
    if audio is None:
        return "Please upload an audio file."

    # Load and resample audio to the 16 kHz rate the model was trained on.
    speech, sr = librosa.load(audio, sr=16000)

    # Preprocess into model-ready tensors.
    inputs = feature_extractor(
        speech, sampling_rate=16000, return_tensors="pt", padding=True
    )

    # Inference without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities.
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Build {display name: probability} using the model's own label mapping.
    return {
        _DISPLAY_NAMES.get(label, label): float(probs[0][idx])
        for idx, label in model.config.id2label.items()
    }


# Define the Gradio Interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Audio or Record"),
    outputs=gr.Label(label="Detected Emotion"),
    title="HuBERT Emotion Recognition",
    description="Upload an audio clip to detect the primary emotion. This model (hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, and Sad classifications.",
    examples=[],  # You can add paths to example .wav files here
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()