"""Emotion Speech Classifier.

Transcribes audio with faster-whisper, embeds each transcribed segment
with a SentenceTransformer, and labels its emotion by cosine similarity
against precomputed per-emotion centroid embeddings. Served as a Gradio
web UI.
"""

import os
import pickle

import numpy as np
import pandas as pd
import gradio as gr
import soundfile as sf
from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CENTROIDS_PATH = "emotion_avg.pkl"
# Single source of truth for the per-sentence result schema, so the
# empty-result and populated-result DataFrames always agree on columns.
RESULT_COLUMNS = ["sentence", "emotion", "score", "margin"]

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the
# file — only ship/load centroid files from a trusted source.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)

for k in list(emotion_avg.keys()):
    emotion_avg[k] = np.array(emotion_avg[k])

EMOTIONS = list(emotion_avg.keys())
# Stack all centroids once so classifying a sentence takes a single
# vectorized cosine_similarity call instead of one call per emotion.
_CENTROID_MATRIX = np.stack([emotion_avg[e] for e in EMOTIONS])

embedder = SentenceTransformer(EMBED_MODEL_NAME)
whisper_model = WhisperModel("base", compute_type="int8")


def predict_emotion_sentence(sentence):
    """Classify one sentence against the emotion centroids.

    Args:
        sentence: Text of a single transcribed sentence/segment.

    Returns:
        Dict with keys:
            "emotion" -- best-matching emotion label,
            "score"   -- cosine similarity to that centroid,
            "margin"  -- lead over the runner-up emotion
                         (0.0 when only one emotion is defined).
    """
    emb = embedder.encode([sentence], convert_to_numpy=True)[0]
    # One (1, K) similarity row against all K centroids at once.
    sims = cosine_similarity(emb.reshape(1, -1), _CENTROID_MATRIX)[0]
    order = np.argsort(sims)[::-1]
    best_idx = order[0]
    second_idx = order[1] if len(order) > 1 else order[0]
    return {
        "emotion": EMOTIONS[best_idx],
        "score": float(sims[best_idx]),
        "margin": float(sims[best_idx] - sims[second_idx]),
    }


def analyze_audio(audio_path):
    """Transcribe an audio file and classify each segment's emotion.

    Args:
        audio_path: Filesystem path from the Gradio Audio component,
            or None when no audio was provided.

    Returns:
        Tuple of (transcript, latest_emotion, latest_margin, results_df).
        On missing audio or an empty transcription, placeholder values
        are returned and the DataFrame is empty but still carries the
        RESULT_COLUMNS headers.
    """
    if audio_path is None:
        return (
            "No transcript yet.",
            "None",
            0.0,
            pd.DataFrame(columns=RESULT_COLUMNS),
        )

    segments, _ = whisper_model.transcribe(audio_path)

    transcript_parts = []
    rows = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            continue
        transcript_parts.append(text)
        pred = predict_emotion_sentence(text)
        rows.append({
            "sentence": text,
            "emotion": pred["emotion"],
            "score": pred["score"],
            "margin": pred["margin"],
        })

    transcript = " ".join(transcript_parts).strip()

    if rows:
        latest_emotion = rows[-1]["emotion"]
        latest_margin = rows[-1]["margin"]
    else:
        latest_emotion = "None"
        latest_margin = 0.0

    # FIX: pd.DataFrame([]) has no columns, which made the zero-row case
    # inconsistent with the None-audio path and dropped the Gradio
    # Dataframe headers. Always declare the schema explicitly.
    df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    return transcript, latest_emotion, latest_margin, df


with gr.Blocks(title="Emotion Speech Classifier") as demo:
    gr.Markdown("# Emotion Speech Classifier")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input",
            )
            run_btn = gr.Button("Analyze Audio")
        with gr.Column(scale=2):
            transcript_box = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                latest_emotion_box = gr.Textbox(label="Latest Emotion")
                margin_box = gr.Number(label="Match Margin")
            results_df = gr.Dataframe(
                headers=RESULT_COLUMNS,
                label="Sentence Analysis",
            )

    run_btn.click(
        fn=analyze_audio,
        inputs=audio_input,
        outputs=[transcript_box, latest_emotion_box, margin_box, results_df],
    )

if __name__ == "__main__":
    demo.launch()