"""Emotion Speech Classifier.

Transcribes audio with faster-whisper, embeds each transcribed segment
with a SentenceTransformer, and labels its emotion by cosine similarity
against precomputed per-emotion centroid embeddings. Served as a Gradio
web UI.
"""

import os
import pickle

import numpy as np
import pandas as pd
import gradio as gr
import soundfile as sf
from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CENTROIDS_PATH = "emotion_avg.pkl"
# Single source of truth for the per-sentence result schema, so the
# empty-result and populated-result DataFrames always agree on columns.
RESULT_COLUMNS = ["sentence", "emotion", "score", "margin"]

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the
# file — only ship/load centroid files from a trusted source.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)

for k in list(emotion_avg.keys()):
    emotion_avg[k] = np.array(emotion_avg[k])

EMOTIONS = list(emotion_avg.keys())
# Stack all centroids once so classifying a sentence takes a single
# vectorized cosine_similarity call instead of one call per emotion.
_CENTROID_MATRIX = np.stack([emotion_avg[e] for e in EMOTIONS])

embedder = SentenceTransformer(EMBED_MODEL_NAME)
whisper_model = WhisperModel("base", compute_type="int8")


def predict_emotion_sentence(sentence):
    """Classify one sentence against the emotion centroids.

    Args:
        sentence: Text of a single transcribed sentence/segment.

    Returns:
        Dict with keys:
            "emotion" -- best-matching emotion label,
            "score"   -- cosine similarity to that centroid,
            "margin"  -- lead over the runner-up emotion
                         (0.0 when only one emotion is defined).
    """
    emb = embedder.encode([sentence], convert_to_numpy=True)[0]
    # One (1, K) similarity row against all K centroids at once.
    sims = cosine_similarity(emb.reshape(1, -1), _CENTROID_MATRIX)[0]
    order = np.argsort(sims)[::-1]
    best_idx = order[0]
    second_idx = order[1] if len(order) > 1 else order[0]
    return {
        "emotion": EMOTIONS[best_idx],
        "score": float(sims[best_idx]),
        "margin": float(sims[best_idx] - sims[second_idx]),
    }


def analyze_audio(audio_path):
    """Transcribe an audio file and classify each segment's emotion.

    Args:
        audio_path: Filesystem path from the Gradio Audio component,
            or None when no audio was provided.

    Returns:
        Tuple of (transcript, latest_emotion, latest_margin, results_df).
        On missing audio or an empty transcription, placeholder values
        are returned and the DataFrame is empty but still carries the
        RESULT_COLUMNS headers.
    """
    if audio_path is None:
        return (
            "No transcript yet.",
            "None",
            0.0,
            pd.DataFrame(columns=RESULT_COLUMNS),
        )

    segments, _ = whisper_model.transcribe(audio_path)

    transcript_parts = []
    rows = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            continue
        transcript_parts.append(text)
        pred = predict_emotion_sentence(text)
        rows.append({
            "sentence": text,
            "emotion": pred["emotion"],
            "score": pred["score"],
            "margin": pred["margin"],
        })

    transcript = " ".join(transcript_parts).strip()

    if rows:
        latest_emotion = rows[-1]["emotion"]
        latest_margin = rows[-1]["margin"]
    else:
        latest_emotion = "None"
        latest_margin = 0.0

    # FIX: pd.DataFrame([]) has no columns, which made the zero-row case
    # inconsistent with the None-audio path and dropped the Gradio
    # Dataframe headers. Always declare the schema explicitly.
    df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    return transcript, latest_emotion, latest_margin, df


with gr.Blocks(title="Emotion Speech Classifier") as demo:
    gr.Markdown("# Emotion Speech Classifier")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input",
            )
            run_btn = gr.Button("Analyze Audio")
        with gr.Column(scale=2):
            transcript_box = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                latest_emotion_box = gr.Textbox(label="Latest Emotion")
                margin_box = gr.Number(label="Match Margin")
            results_df = gr.Dataframe(
                headers=RESULT_COLUMNS,
                label="Sentence Analysis",
            )

    run_btn.click(
        fn=analyze_audio,
        inputs=audio_input,
        outputs=[transcript_box, latest_emotion_box, margin_box, results_df],
    )

if __name__ == "__main__":
    demo.launch()