File size: 1,417 Bytes
16dbaa3
 
 
 
 
 
0ae8940
16dbaa3
 
2051424
16dbaa3
0ae8940
2051424
16dbaa3
2051424
 
 
 
16dbaa3
2051424
16dbaa3
2051424
 
16dbaa3
2051424
 
 
 
16dbaa3
2051424
16dbaa3
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Hugging Face Hub checkpoint used for both the processor and the classifier.
model_name = "Dpngtm/wav2vec2-emotion-recognition"

# Class names, indexed by the id that the classifier's argmax produces
# (predict_emotion looks the winning id up in this list). The ordering was
# taken from the model card by the original author.
labels = [
    "angry",
    "calm",
    "disgust",
    "fearful",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# Fetch the pretrained processor and sequence-classification model once, at
# import time, so every prediction call reuses the same objects.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Emotion prediction function
def predict_emotion(audio):
    """Classify the emotion of a Gradio audio clip and return a Markdown string.

    Args:
        audio: The value produced by ``gr.Audio(type="numpy")``: a
            ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded.

    Returns:
        ``"Predicted Emotion: **<label>**"`` for the highest-scoring class,
        or a short notice when no usable audio was supplied.
    """
    no_audio_message = "No audio received. Please record a clip and try again."
    if audio is None:
        return no_audio_message

    # Gradio's numpy payload is (sample_rate, data) -- rate first, samples second.
    sr, speech = audio
    waveform = torch.as_tensor(speech)
    if waveform.numel() == 0:
        return no_audio_message

    # Integer PCM (Gradio typically sends int16) must become float before
    # resampling and inference; scale it into roughly [-1, 1].
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale

    # Downmix multi-channel audio to mono.
    # NOTE(review): assumes 2-D input is laid out (samples, channels), as
    # Gradio documents for its numpy audio format.
    if waveform.ndim == 2:
        waveform = waveform.mean(dim=-1)

    # The model is fed 16 kHz audio; resample anything else.
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)

    # Hand the processor a numpy array, which is its documented input type.
    input_values = processor(
        waveform.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_id = torch.argmax(logits, dim=-1).item()
    emotion = labels[predicted_id]
    return f"Predicted Emotion: **{emotion}**"

# Gradio interface
# Input component: microphone source, numpy payload.
# NOTE(review): the label mentions uploading, but only the microphone source
# is enabled here. Also, `source=` looks like the Gradio 3.x keyword (newer
# releases appear to use `sources=[...]`) -- confirm against the pinned version.
audio_input = gr.Audio(source="microphone", type="numpy", label="Speak or Upload Audio")

# Output component: the prediction string is rendered as Markdown.
emotion_output = gr.Markdown(label="Detected Emotion")

interface = gr.Interface(
    fn=predict_emotion,
    inputs=audio_input,
    outputs=emotion_output,
    title="Voice Emotion Recognition",
    description="This app detects the emotional tone of your speech using a fine-tuned Wav2Vec2 model.",
)

# Start the web app as soon as the script runs.
interface.launch()