import gradio as gr
# from transformers import pipeline

# # 1. Whisper ASR
# asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# # 2. Emotion classifier (Vietnamese)
# emo_clf = pipeline("text-classification", model="bkai-foundation-models/vietnamese-emotion", top_k=None)
from transformers import pipeline

# Speech-to-text: Whisper (small checkpoint)
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# Emotion classification (Vietnamese text).
# top_k=None asks the pipeline for the score of every label, not just the best one.
emo_clf = pipeline(
    task="text-classification",
    model="visolex/bartpho-emotion",
    top_k=None,
)


# 3. Function to process audio
def predict_emotion(audio_file):
    """Transcribe an audio clip, classify its emotion and pick a chatbot reply.

    Args:
        audio_file: Input handed to the ``asr`` pipeline (the UI passes the
            path of the recorded or uploaded file). May be ``None`` or empty
            when no audio is available, e.g. after the recording is cleared.

    Returns:
        A ``(transcript, emotions_sorted, bot_reply)`` tuple: the recognised
        text, the classifier's ``{"label", "score"}`` entries ordered from
        highest to lowest score, and the reply chosen for the top label.
        When there is no audio, ``("", [], "")`` is returned so the three
        outputs are simply cleared.
    """
    # No audio to process: skip both models instead of letting the ASR
    # pipeline raise on a missing input.
    if not audio_file:
        return "", [], ""

    # Step 1: Speech to Text
    transcript = asr(audio_file)["text"]

    # Step 2: Emotion classification
    # The classifier result is wrapped in an outer list; [0] is the list of
    # per-label scores for this single transcript.
    emotions = emo_clf(transcript)[0]
    emotions_sorted = sorted(emotions, key=lambda x: x["score"], reverse=True)

    # Step 3: Bot reply based on top emotion
    top_emotion = emotions_sorted[0]["label"]
    bot_reply = {
        "vui": "Mình cảm nhận được sự vui vẻ của bạn! ❤️",
        "buồn": "Nghe như bạn đang buồn. Mình sẵn sàng lắng nghe bạn chia sẻ. 💙",
        "giận": "Có vẻ bạn đang tức giận. Hít thở sâu và chúng ta cùng nói chuyện nhé. 😌",
        "sợ": "Mình cảm nhận được sự lo lắng từ bạn. Mọi chuyện sẽ ổn thôi. 🤗",
        "trung tính": "Mình đang nghe bạn. Hãy nói thêm cho mình biết nhé. 🙂"
    }.get(top_emotion.lower(), "Mình đã nghe bạn rồi!")  # generic reply for unknown labels

    return transcript, emotions_sorted, bot_reply

# 4. Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎤 Chatbot dự đoán cảm xúc qua lời nói")

    # Input: record from the microphone or upload a file; the callback
    # receives the audio as a file path.
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Ghi âm hoặc tải file",
    )

    # Outputs, in the same order as the values returned by predict_emotion.
    transcript_out = gr.Textbox(label="Transcript")
    emotion_out = gr.JSON(label="Dự đoán cảm xúc")
    bot_reply_out = gr.Textbox(label="Phản hồi Chatbot")

    # Re-run the whole pipeline every time the audio value changes.
    audio_input.change(
        fn=predict_emotion,
        inputs=audio_input,
        outputs=[transcript_out, emotion_out, bot_reply_out],
    )

demo.launch()