"""Gradio demo that classifies the emotion expressed in an audio clip."""

from typing import Optional, Tuple

import gradio as gr
import librosa
import torch
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

# Sampling rate used both when decoding the audio file and when building
# the model inputs; the two must agree.
# NOTE(review): presumably the rate the checkpoint was trained on - confirm.
SAMPLE_RATE = 16000

model_id = "abedir/emotion-detector"

# Loaded once at import time so every request reuses the same objects.
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)

# Maps the classifier's output index to a human-readable emotion label.
label_map = {
    0: "Angry/Fearful",
    1: "Happy/Laugh",
    2: "Neutral/Calm",
    3: "Sad/Cry",
    4: "Surprised/Amazed",
}


def predict(audio: Optional[str]) -> Tuple[str, float]:
    """Classify the emotion in an audio file.

    Args:
        audio: Path to the audio file handed over by the Gradio ``Audio``
            component (``type="filepath"``), or ``None`` when the user
            submitted without providing any audio.

    Returns:
        A ``(label, probability)`` pair: the entry of ``label_map`` for the
        highest-scoring class and the softmax probability of that class.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Gradio passes None for an empty Audio input; surface a readable
        # message in the UI instead of an opaque failure from librosa.
        raise gr.Error("Please record or upload an audio clip first.")

    # Decode the file at SAMPLE_RATE; the returned rate is the one we asked
    # for, so it is discarded.
    waveform, _ = librosa.load(audio, sr=SAMPLE_RATE)
    inputs = processor(waveform, sampling_rate=SAMPLE_RATE, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single clip is passed in, so take the first (only) row of the batch.
    probs = torch.softmax(logits, dim=1)[0]
    pred = int(torch.argmax(probs).item())
    return label_map[pred], float(probs[pred])


iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs=["text", "number"],
    title="Emotion Detector 🎤",
)

# Starts the web UI as soon as the script is executed.
iface.launch()