Spaces:

LingoJr
/

emotion-recognition

Runtime error

File size: 1,583 Bytes

a834bb6
2de24ed
 
 
 
a834bb6
2de24ed
 
 
50e2b80
 
 
291ce94
 
2de24ed
291ce94
 
 
 
2dad960
291ce94
 
2de24ed
 
291ce94
 
2de24ed
291ce94
 
2dad960
 
 
 
 
 
 
2de24ed
2dad960
 
2de24ed
 
2dad960

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torchaudio
import torch

# Both models are loaded once at import time so every request reuses them.

# Audio-classification pipeline used for the speech branch.
speech_classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")

# Tokenizer and sequence-classification model used for the text branch.
text_tokenizer = AutoTokenizer.from_pretrained("tae898/emoberta-base")
text_model = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-base")

# This app only runs inference. Gradient checkpointing is a training-time
# memory optimisation and has no effect here, so make the intended mode
# explicit instead: eval() disables dropout and other train-only behaviour.
text_model.eval()


def predict_emotion(audio, text):
    """Predict the emotion expressed in a speech clip and/or a piece of text.

    Either input may be omitted; only the branches with usable input run.

    Args:
        audio: Path to an audio file readable by ``torchaudio.load``, or
            ``None`` to skip the speech branch.
        text: Text to classify, or ``None`` / blank to skip the text branch.

    Returns:
        A dict that contains ``"audio_emotion"`` when audio was given and
        ``"text_emotion"`` when non-blank text was given, each mapped to the
        top predicted label. Empty when neither input was provided.
    """
    results = {}

    if audio is not None:
        waveform, sr = torchaudio.load(audio)
        # torchaudio returns (channels, frames). Average the channels so that
        # stereo recordings also become the 1-D signal the pipeline expects.
        mono = waveform.mean(dim=0)
        # A bare array handed to the pipeline is taken to be at the model's
        # own sampling rate, so resample explicitly when the file differs.
        target_sr = speech_classifier.feature_extractor.sampling_rate
        if sr != target_sr:
            mono = torchaudio.functional.resample(mono, sr, target_sr)
        preds = speech_classifier(mono.numpy(), top_k=3)
        results["audio_emotion"] = preds[0]["label"]

    if text is not None and text.strip():
        # Truncate so very long inputs cannot exceed the model's max length.
        inputs = text_tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = text_model(**inputs)
        # id2label is keyed by plain ints; a tensor key would never match.
        label_id = outputs.logits.argmax(dim=-1).item()
        results["text_emotion"] = text_model.config.id2label[label_id]

    return results
    
# Building the UI: one audio input and one text input, both optional, with the
# handler's result dict rendered as JSON.
gradio_ui = gr.Interface(
    # predict_emotion is the handler defined in this file; it accepts
    # (audio, text) in the same order as the inputs listed below.
    fn=predict_emotion,
    inputs=[
        # type="filepath" hands the handler a path on disk rather than raw samples.
        gr.Audio(label="🎤 Upload or Record Speech", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="💬 Enter Text Emotion", placeholder="Type something...")
    ],
    outputs="json",
    title="🎭 Multimodal Emotion Recognizer",
    description="Use either speech or text — the model detects the emotion automatically!"
)

# Serve the interface with Gradio's built-in server. Mounting it under
# /gradio via gr.mount_gradio_app needs an existing FastAPI application to
# mount onto, and this file does not create one, so the UI is launched directly.
gradio_ui.launch(share=True)