File size: 3,314 Bytes
91655dd
 
 
 
 
 
 
 
 
 
 
2f1e061
 
91655dd
 
 
 
 
 
 
 
 
 
2f1e061
91655dd
 
 
2f1e061
91655dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f1e061
91655dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f1e061
 
 
91655dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import pickle
import numpy as np
import pandas as pd
import gradio as gr
import soundfile as sf

from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Sentence-embedding model used to embed each transcript sentence.
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# Pickled dict mapping emotion label -> averaged (centroid) embedding vector.
CENTROIDS_PATH = "emotion_avg.pkl"

# NOTE(review): pickle.load executes arbitrary code from the file — only load
# centroid files you produced yourself; never load untrusted pickles.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)

# Coerce each centroid to a numpy array (they may have been pickled as lists).
for k in list(emotion_avg.keys()):
    emotion_avg[k] = np.array(emotion_avg[k])

# Fixed label set, in the dict's insertion order.
EMOTIONS = list(emotion_avg.keys())


# Both models are created once at import time; each may download weights on
# first run, so module import can be slow.
embedder = SentenceTransformer(EMBED_MODEL_NAME)
whisper_model = WhisperModel("base", compute_type="int8")


def predict_emotion_sentence(sentence):
    """Classify a sentence by cosine similarity to per-emotion centroid embeddings.

    Args:
        sentence: Text to classify.

    Returns:
        dict with keys:
            "emotion": best-matching label from EMOTIONS,
            "score":   cosine similarity to that centroid (float),
            "margin":  gap between best and second-best similarity
                       (0.0 when only one emotion exists).
    """
    emb = embedder.encode([sentence], convert_to_numpy=True)[0]

    # One vectorized sklearn call against all stacked centroids instead of
    # one cosine_similarity call per emotion inside a Python loop.
    centroids = np.stack([emotion_avg[e] for e in EMOTIONS])
    sims = cosine_similarity(emb.reshape(1, -1), centroids)[0]

    # Rank emotions by similarity, best first.
    order = np.argsort(sims)[::-1]
    best_idx = order[0]
    second_idx = order[1] if len(order) > 1 else order[0]

    return {
        "emotion": EMOTIONS[best_idx],
        "score": float(sims[best_idx]),
        "margin": float(sims[best_idx] - sims[second_idx]),
    }


def analyze_audio(audio_path):
    """Transcribe an audio file and classify the emotion of each sentence.

    Args:
        audio_path: Path to an audio file, or None when nothing was recorded.

    Returns:
        Tuple of (full transcript text, emotion of the last sentence,
        similarity margin of the last sentence, DataFrame with one row per
        sentence: sentence / emotion / score / margin).
    """
    # Nothing to analyze yet — return placeholder outputs for the UI.
    if audio_path is None:
        empty = pd.DataFrame(columns=["sentence", "emotion", "score", "margin"])
        return "No transcript yet.", "None", 0.0, empty

    segments, _ = whisper_model.transcribe(audio_path)

    rows = []
    pieces = []
    for segment in segments:
        sentence = segment.text.strip()
        if not sentence:
            continue

        pieces.append(sentence)
        prediction = predict_emotion_sentence(sentence)
        rows.append({
            "sentence": sentence,
            "emotion": prediction["emotion"],
            "score": prediction["score"],
            "margin": prediction["margin"],
        })

    transcript = " ".join(pieces).strip()

    # Headline values come from the final classified sentence, if any.
    latest_emotion, latest_margin = "None", 0.0
    if rows:
        latest_emotion = rows[-1]["emotion"]
        latest_margin = rows[-1]["margin"]

    return transcript, latest_emotion, latest_margin, pd.DataFrame(rows)


with gr.Blocks(title="Emotion Speech Classifier") as demo:
    # Page header.
    gr.Markdown("# Emotion Speech Classifier")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")

    with gr.Row():
        # Left column: audio source plus the trigger button.
        with gr.Column(scale=1):
            speech_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input",
            )
            analyze_button = gr.Button("Analyze Audio")

        # Right column: transcript and headline result for the last sentence.
        with gr.Column(scale=2):
            transcript_output = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                emotion_output = gr.Textbox(label="Latest Emotion")
                margin_output = gr.Number(label="Match Margin")

    # Per-sentence breakdown table below the two columns.
    sentence_table = gr.Dataframe(
        headers=["sentence", "emotion", "score", "margin"],
        label="Sentence Analysis",
    )

    analyze_button.click(
        fn=analyze_audio,
        inputs=speech_input,
        outputs=[transcript_output, emotion_output, margin_output, sentence_table],
    )

if __name__ == "__main__":
    demo.launch()