# Emotion_Project / app.py
# Author: pmikh26 — commit 2f1e061 ("Update app.py", verified)
import os
import pickle
import numpy as np
import pandas as pd
import gradio as gr
import soundfile as sf
from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model; must match the model used to build the centroids.
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CENTROIDS_PATH = "emotion_avg.pkl"

# Load the precomputed per-emotion average embeddings (label -> vector) and
# normalize every value to an ndarray up front.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)
emotion_avg = {label: np.array(vec) for label, vec in emotion_avg.items()}
EMOTIONS = list(emotion_avg.keys())

# Models are instantiated once at import time so each request reuses them.
embedder = SentenceTransformer(EMBED_MODEL_NAME)
whisper_model = WhisperModel("base", compute_type="int8")
def predict_emotion_sentence(sentence):
    """Classify one sentence by cosine similarity to the emotion centroids.

    Returns a dict with the best-matching emotion label, its similarity
    score, and the margin over the runner-up emotion.
    """
    vec = embedder.encode([sentence], convert_to_numpy=True)[0].reshape(1, -1)
    labels = list(EMOTIONS)
    sims = [
        cosine_similarity(vec, emotion_avg[label].reshape(1, -1))[0][0]
        for label in labels
    ]
    # Rank emotions by similarity, highest first.
    ranked = np.argsort(sims)[::-1]
    top = ranked[0]
    runner_up = ranked[1] if len(ranked) > 1 else ranked[0]
    return {
        "emotion": labels[top],
        "score": float(sims[top]),
        "margin": float(sims[top] - sims[runner_up]),
    }
def analyze_audio(audio_path):
    """Transcribe an audio file and classify each segment's emotion.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path supplied by the Gradio ``Audio`` component
        (``type="filepath"``), or ``None`` when nothing was provided.

    Returns
    -------
    tuple
        ``(transcript, latest_emotion, latest_margin, df)`` where ``df``
        always carries the four analysis columns, even when empty.
    """
    columns = ["sentence", "emotion", "score", "margin"]
    if audio_path is None:
        return "No transcript yet.", "None", 0.0, pd.DataFrame(columns=columns)

    segments, _ = whisper_model.transcribe(audio_path)
    transcript_parts = []
    rows = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            # Skip whitespace-only segments produced by the ASR model.
            continue
        transcript_parts.append(text)
        pred = predict_emotion_sentence(text)
        rows.append({
            "sentence": text,
            "emotion": pred["emotion"],
            "score": pred["score"],
            "margin": pred["margin"],
        })

    transcript = " ".join(transcript_parts).strip()
    if rows:
        latest_emotion = rows[-1]["emotion"]
        latest_margin = rows[-1]["margin"]
    else:
        latest_emotion = "None"
        latest_margin = 0.0

    # Bug fix: pass columns explicitly so an empty transcription still yields
    # a DataFrame with the expected headers (pd.DataFrame([]) has none,
    # which was inconsistent with the None branch above).
    df = pd.DataFrame(rows, columns=columns)
    return transcript, latest_emotion, latest_margin, df
# Gradio UI: audio in on the left; transcript, latest emotion, and the
# per-sentence analysis table on the right. The top-level Blocks object is
# named `demo` so Hugging Face Spaces can auto-discover it.
with gr.Blocks(title="Emotion Speech Classifier") as demo:
    gr.Markdown("# Emotion Speech Classifier")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")

    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands analyze_audio a path string, not raw samples.
            audio_in = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input",
            )
            analyze_button = gr.Button("Analyze Audio")
        with gr.Column(scale=2):
            transcript_out = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                emotion_out = gr.Textbox(label="Latest Emotion")
                margin_out = gr.Number(label="Match Margin")
            table_out = gr.Dataframe(
                headers=["sentence", "emotion", "score", "margin"],
                label="Sentence Analysis",
            )

    # Output order must match analyze_audio's 4-tuple return.
    analyze_button.click(
        fn=analyze_audio,
        inputs=audio_in,
        outputs=[transcript_out, emotion_out, margin_out, table_out],
    )

if __name__ == "__main__":
    demo.launch()