"""Gradio app for speech emotion recognition using a fine-tuned HuBERT model.

Loads superb/hubert-base-superb-er once at import time and exposes a simple
upload/record UI that classifies audio into Neutral / Happy / Angry / Sad.
"""

import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification

# Load model and processor once at startup (downloaded/cached by transformers).
MODEL_ID = "superb/hubert-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
model = HubertForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()  # inference only; make the mode explicit

# Map the checkpoint's short label codes to friendly display names.
# Derive ordering from model.config.id2label instead of hard-coding indices,
# so the app stays correct even if the checkpoint's label order changes.
_DISPLAY_NAMES = {"neu": "Neutral", "hap": "Happy", "ang": "Angry", "sad": "Sad"}


def predict_emotion(audio):
    """Classify the emotion of an uploaded/recorded audio clip.

    Args:
        audio: Filesystem path to the audio file (Gradio passes a temp-file
            path because the input component uses type="filepath"), or None
            if nothing was provided.

    Returns:
        A dict mapping display label -> probability (for gr.Label), or a
        string error message when no audio was supplied.
    """
    if audio is None:
        return "Please upload an audio file."

    # Load and resample audio to the 16 kHz rate the model was trained on.
    speech, sr = librosa.load(audio, sr=16000)

    # Preprocess into model-ready tensors.
    inputs = feature_extractor(
        speech, sampling_rate=16000, return_tensors="pt", padding=True
    )

    # Inference without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities.
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Build {display name: probability} using the model's own label mapping.
    return {
        _DISPLAY_NAMES.get(label, label): float(probs[0][idx])
        for idx, label in model.config.id2label.items()
    }


# Define the Gradio Interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Audio or Record"),
    outputs=gr.Label(label="Detected Emotion"),
    title="HuBERT Emotion Recognition",
    description="Upload an audio clip to detect the primary emotion. This model (hubert-base-superb-er) is fine-tuned for Neutral, Happy, Angry, and Sad classifications.",
    examples=[],  # You can add paths to example .wav files here
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()