File size: 1,583 Bytes
a834bb6
2de24ed
 
 
 
a834bb6
2de24ed
 
 
50e2b80
 
 
291ce94
 
2de24ed
291ce94
 
 
 
2dad960
291ce94
 
2de24ed
 
291ce94
 
2de24ed
291ce94
 
2dad960
 
 
 
 
 
 
2de24ed
2dad960
 
2de24ed
 
2dad960
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torchaudio
import torch

# Hugging Face Hub checkpoints used for the two input modalities.
_SPEECH_MODEL_ID = "superb/wav2vec2-base-superb-er"
_TEXT_MODEL_ID = "tae898/emoberta-base"

# Speech branch: a ready-made audio-classification pipeline.
speech_classifier = pipeline(task="audio-classification", model=_SPEECH_MODEL_ID)

# Text branch: tokenizer and sequence-classification model loaded separately
# so the raw logits can be inspected by the caller.
text_tokenizer = AutoTokenizer.from_pretrained(_TEXT_MODEL_ID)
text_model = AutoModelForSequenceClassification.from_pretrained(_TEXT_MODEL_ID)

# NOTE(review): gradient checkpointing is a training-time memory optimisation;
# this script only runs inference, so this call is presumably unnecessary —
# confirm before removing.
text_model.gradient_checkpointing_enable()


def predict_emotion(audio, text):
    """Detect the emotion expressed in a speech clip and/or a piece of text.

    Either input may be omitted; only the modalities that are supplied are
    analysed.

    Args:
        audio: Path to an audio file readable by ``torchaudio.load``, or
            ``None`` to skip speech analysis.
        text: Text to classify, or ``None`` / blank to skip text analysis.

    Returns:
        A dict that contains ``"audio_emotion"`` when audio was given and
        ``"text_emotion"`` when non-blank text was given. Empty when neither
        input was provided.
    """
    results = {}

    if audio is not None:
        waveform, sr = torchaudio.load(audio)
        # torchaudio yields (channels, frames). Average the channels so that
        # stereo recordings become the single-channel array the pipeline
        # requires (a plain squeeze() leaves stereo input two-dimensional).
        mono = waveform.mean(dim=0)
        # Hand the sampling rate over together with the samples: the pipeline
        # reads it from this dict and resamples to the model's rate if needed.
        preds = speech_classifier(
            {"raw": mono.numpy(), "sampling_rate": sr},
            top_k=3,
        )
        # Predictions are sorted by score, so the first entry is the best one.
        results["audio_emotion"] = preds[0]["label"]

    if text is not None and text.strip():
        # Truncate so long inputs cannot exceed the tokenizer's maximum length.
        inputs = text_tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = text_model(**inputs).logits
        # id2label is keyed by plain ints; a tensor key would never match
        # because tensors hash by identity.
        label_id = int(logits.argmax(dim=-1).item())
        results["text_emotion"] = text_model.config.id2label[label_id]

    return results
    
# Build the UI: one audio input and one text input, both optional, wired to
# predict_emotion; its result dict is rendered as JSON.
gradio_ui = gr.Interface(
    fn=predict_emotion,
    inputs=[
        # type="filepath" hands the handler a path it can load with torchaudio.
        gr.Audio(label="🎤 Upload or Record Speech", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="💬 Enter Text Emotion", placeholder="Type something..."),
    ],
    outputs="json",
    title="🎭 Multimodal Emotion Recognizer",
    description="Use either speech or text — the model detects the emotion automatically!",
)

# Mounting under /gradio with gr.mount_gradio_app requires an existing FastAPI
# application, which this script does not create, so the interface is served
# directly by Gradio's own server instead.
gradio_ui.launch(share=True)