pmikh26 committed on
Commit
91655dd
·
verified ·
1 Parent(s): e90de09

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +140 -0
  2. emotion_avg.pkl.py +10 -0
  3. packages.txt +10 -0
  4. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1GiJsUjgSfSzhuo0YkKYDvzQk5Cg2Qiao
8
+ """
9
+
10
+ import os
11
+ import pickle
12
+ import numpy as np
13
+ import pandas as pd
14
+ import gradio as gr
15
+ import soundfile as sf
16
+
17
+ from faster_whisper import WhisperModel
18
+ from sentence_transformers import SentenceTransformer
19
+ from sklearn.metrics.pairwise import cosine_similarity
20
+
21
# -----------------------------
# Load emotion vectors
# -----------------------------
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CENTROIDS_PATH = "emotion_avg.pkl"

# The unpickled object is used as a mapping (iterated by key and indexed by
# key below); each value is converted to a NumPy array in place.
with open(CENTROIDS_PATH, "rb") as f:
    emotion_avg = pickle.load(f)

for k, vec in list(emotion_avg.items()):
    emotion_avg[k] = np.array(vec)

# Label order follows the mapping's own key order.
EMOTIONS = list(emotion_avg)

# -----------------------------
# Load models
# -----------------------------
# Both models are built once at import time and shared by every request.
whisper_model = WhisperModel("base", compute_type="int8")
embedder = SentenceTransformer(EMBED_MODEL_NAME)
40
+
41
+ # -----------------------------
42
+ # Prediction helper
43
+ # -----------------------------
44
def predict_emotion_sentence(sentence):
    """Pick the emotion whose stored vector is most similar to a sentence.

    The sentence is encoded with the module-level ``embedder`` and compared,
    by cosine similarity, against every vector in ``emotion_avg``.

    Args:
        sentence: Text to classify.

    Returns:
        A dict with three keys:
        ``"emotion"`` - label with the highest similarity,
        ``"score"`` - that similarity as a Python float,
        ``"margin"`` - gap between the best and second-best similarity
        (0.0 when only one emotion is available).
    """
    # cosine_similarity expects 2-D inputs, so the embedding becomes one row.
    query = embedder.encode([sentence], convert_to_numpy=True)[0].reshape(1, -1)

    scores = [
        cosine_similarity(query, emotion_avg[name].reshape(1, -1))[0][0]
        for name in EMOTIONS
    ]

    # Indices sorted from highest to lowest similarity.
    ranking = np.argsort(scores)[::-1]
    top = ranking[0]
    runner_up = ranking[1] if len(ranking) > 1 else top

    top_score = scores[top]
    return {
        "emotion": EMOTIONS[top],
        "score": float(top_score),
        "margin": float(top_score - scores[runner_up]),
    }
66
+
67
+ # -----------------------------
68
+ # Main app function
69
+ # -----------------------------
70
# Column layout shared by every table this module returns, so an empty result
# has the same shape as a populated one.
_RESULT_COLUMNS = ("sentence", "emotion", "score", "margin")


def analyze_audio(audio_path):
    """Transcribe an audio file and tag every transcribed segment with an emotion.

    Each non-empty segment produced by ``whisper_model.transcribe`` is treated
    as one sentence and classified with ``predict_emotion_sentence``.

    Args:
        audio_path: Path of the audio file to analyse, or ``None`` when no
            audio has been provided.

    Returns:
        A 4-tuple ``(transcript, latest_emotion, latest_margin, table)``:
        ``transcript`` is the segment texts joined by single spaces
        (``"No transcript yet."`` when ``audio_path`` is ``None``);
        ``latest_emotion`` / ``latest_margin`` describe the last non-empty
        segment, or ``"None"`` / ``0.0`` when there is none;
        ``table`` is a DataFrame with the columns ``sentence``, ``emotion``,
        ``score`` and ``margin`` - always present, even when it has no rows.
    """
    columns = list(_RESULT_COLUMNS)

    if audio_path is None:
        return "No transcript yet.", "None", 0.0, pd.DataFrame(columns=columns)

    # The second return value of transcribe() is not needed here.
    segments, _ = whisper_model.transcribe(audio_path)

    rows = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            # Skip segments that contain only whitespace.
            continue

        pred = predict_emotion_sentence(text)
        rows.append({
            "sentence": text,
            "emotion": pred["emotion"],
            "score": pred["score"],
            "margin": pred["margin"],
        })

    transcript = " ".join(row["sentence"] for row in rows).strip()

    if rows:
        latest = rows[-1]
        latest_emotion = latest["emotion"]
        latest_margin = latest["margin"]
    else:
        latest_emotion = "None"
        latest_margin = 0.0

    # Passing the columns explicitly keeps the headers when `rows` is empty
    # (e.g. silent audio); a bare DataFrame([]) would have no columns at all.
    df = pd.DataFrame(rows, columns=columns)
    return transcript, latest_emotion, latest_margin, df
105
+
106
+ # -----------------------------
107
+ # UI
108
+ # -----------------------------
109
# NOTE(review): the indentation of this section was lost in the diff
# rendering; the nesting below (results table and click handler directly
# under the Blocks context, after the two-column row) is reconstructed -
# confirm against the real app.py.
with gr.Blocks(title="Emotion Speech Analyzer") as demo:
    gr.Markdown("# Emotion Speech Analyzer")
    gr.Markdown("Upload or record audio, transcribe it, and detect sentence-level emotion.")

    with gr.Row():
        # Narrow column: audio source and the button that starts the analysis.
        with gr.Column(scale=1):
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
            run_btn = gr.Button("Analyze Audio")

        # Wide column: transcript plus the summary of the last sentence.
        with gr.Column(scale=2):
            transcript_box = gr.Textbox(label="Transcript", lines=8)
            with gr.Row():
                latest_emotion_box = gr.Textbox(label="Latest Emotion")
                margin_box = gr.Number(label="Match Margin")

    table_headers = ["sentence", "emotion", "score", "margin"]
    results_df = gr.Dataframe(headers=table_headers, label="Sentence Analysis")

    # Output order matches the tuple returned by analyze_audio.
    analysis_outputs = [transcript_box, latest_emotion_box, margin_box, results_df]
    run_btn.click(fn=analyze_audio, inputs=audio_input, outputs=analysis_outputs)

if __name__ == "__main__":
    demo.launch()
emotion_avg.pkl.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Defines the file name of the pickled emotion vectors.

Automatically generated by Colab; the original notebook is located at
https://colab.research.google.com/drive/16UTkPmy595caC3JG_2im6CrqRRsNXgha

NOTE(review): this is a Python source file, not a pickle. Its name suggests
the binary ``emotion_avg.pkl`` was meant to be uploaded instead - confirm.
"""

CENTROIDS_PATH: str = "emotion_avg.pkl"
packages.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
ffmpeg
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# requirements.txt
#
# Automatically generated by Colab.
# Original file is located at
#     https://colab.research.google.com/drive/1KqS7WoYBbtlNDCjJBCtbUSqgCPc5784t

gradio
faster-whisper
soundfile
sentence-transformers
scikit-learn
numpy
pandas
torch