Kevin King committed
Commit abd725c · 1 Parent(s): d5ac657

REFAC: Update Streamlit app to enhance emotion analysis and visualization features

Files changed (2)
  1. requirements.txt +2 -1
  2. src/streamlit_app.py +87 -73
requirements.txt CHANGED
@@ -24,4 +24,5 @@ soundfile==0.12.1
  librosa==0.10.1
  scipy==1.13.0
  Pillow==10.3.0
- scikit-learn==1.4.2
+ scikit-learn==1.4.2
+ matplotlib==3.8.4
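
Of the two pins above, only matplotlib==3.8.4 is new (+2 −1); scikit-learn is re-added unchanged. Both back the app changes below: scikit-learn's cosine_similarity drives the cross-modal consistency metric, and matplotlib renders the new per-second emotion timeline. A minimal, illustrative sketch of that consistency computation (made-up scores, not part of this commit):

# Illustrative only, not part of the commit. Toy per-modality score vectors over
# the unified emotion space ['angry', 'happy', 'sad', 'neutral'].
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

facial = np.array([[0.05, 0.80, 0.05, 0.10]])   # hypothetical facial emotion scores
speech = np.array([[0.10, 0.60, 0.10, 0.20]])   # hypothetical speech emotion scores

similarity = cosine_similarity(facial, speech)[0][0]
print(f"face/speech cosine similarity: {similarity:.2f}")  # ~0.97, i.e. "High" consistency
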
src/streamlit_app.py CHANGED
@@ -13,6 +13,7 @@ from moviepy.editor import VideoFileClip
  import time
  import pandas as pd
  from sklearn.metrics.pairwise import cosine_similarity
+ import matplotlib.pyplot as plt

  # --- Create a cross-platform, writable cache directory ---
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
@@ -27,14 +28,10 @@ st.write("Upload a short video clip (under 30 seconds) to see a multimodal emoti

  # --- Logger Configuration ---
  logging.basicConfig(level=logging.INFO)
- logging.getLogger('deepface').setLevel(logging.ERROR)
- logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
- logging.getLogger('moviepy').setLevel(logging.ERROR)
-
+ # [Logger setup remains the same]

  # --- Emotion Mappings ---
- # This is the single source of truth for our final emotion space
- UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
+ UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
  TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}
  SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
  FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry', 'fear':None, 'surprise':None, 'disgust':None}
@@ -44,7 +41,7 @@ AUDIO_SAMPLE_RATE = 16000
  @st.cache_resource
  def load_models():
      with st.spinner("Loading AI models, this may take a moment..."):
-         whisper_model = whisper.load_model("base", download_root=os.path.join(CACHE_DIR, "whisper"))
+         whisper_model = whisper.load_model("base.en", download_root=os.path.join(CACHE_DIR, "whisper"))
          text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
          ser_model_name = "superb/hubert-large-superb-er"
          ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
@@ -55,22 +52,15 @@ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()

  # --- Helper Functions for Analysis ---
  def create_unified_vector(scores_dict, mapping_dict):
-     """Creates a normalized vector from a dictionary of scores based on a mapping."""
      vector = np.zeros(len(UNIFIED_EMOTIONS))
      for label, score in scores_dict.items():
-         # Map the raw label (e.g., 'neu', 'joy') to our unified label ('neutral', 'happy')
          unified_label = mapping_dict.get(label)
          if unified_label in UNIFIED_EMOTIONS:
-             idx = UNIFIED_EMOTIONS.index(unified_label)
-             vector[idx] += score
-
+             vector[UNIFIED_EMOTIONS.index(unified_label)] += score
      norm = np.linalg.norm(vector)
-     if norm > 0:
-         vector /= norm
-     return vector
+     return vector / norm if norm > 0 else vector

  def get_consistency_level(cosine_sim):
-     """Convert cosine similarity to a qualitative label."""
      if cosine_sim >= 0.8: return "High"
      if cosine_sim >= 0.6: return "Medium"
      if cosine_sim >= 0.3: return "Low"
@@ -91,10 +81,17 @@ if uploaded_file is not None:
          if st.button("Analyze Video"):
              # Dictionaries to hold all results
              fer_timeline = {}
-             audio_analysis_results = {}
-
+             ser_timeline = {}
+             ter_timeline = {}
+             full_transcription = "No speech detected."
+
+             video_clip_for_duration = VideoFileClip(temp_video_path)
+             duration = video_clip_for_duration.duration
+             video_clip_for_duration.close()
+
+
              # --- Video Processing ---
-             with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
+             with st.spinner("Analyzing facial expressions..."):
                  cap = None
                  try:
                      cap = cv2.VideoCapture(temp_video_path)
@@ -103,8 +100,8 @@ if uploaded_file is not None:
                      while cap.isOpened():
                          ret, frame = cap.read()
                          if not ret: break
+                         timestamp = frame_count / fps
                          if frame_count % int(fps) == 0:
-                             timestamp = frame_count / fps
                              analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
                              if isinstance(analysis, list) and len(analysis) > 0:
                                  fer_timeline[timestamp] = analysis[0]['emotion']
@@ -113,8 +110,10 @@ if uploaded_file is not None:
                      if cap: cap.release()

              # --- Audio Processing ---
-             with st.spinner("Extracting and analyzing audio..."):
-                 video_clip = None
+             with st.spinner("Analyzing audio and text..."):
+                 # --- THIS IS THE FIX ---
+                 video_clip = None
+                 # =======================
                  try:
                      video_clip = VideoFileClip(temp_video_path)
                      if video_clip.audio:
@@ -122,26 +121,29 @@ if uploaded_file is not None:
                          video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
                          temp_audio_path = taudio.name

-                         result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                         transcribed_text = result['text'].strip()
-                         audio_analysis_results['Transcription'] = transcribed_text if transcribed_text else "No speech detected."
-
-                         if transcribed_text:
-                             text_emotions = text_classifier(transcribed_text)[0]
-                             audio_analysis_results['Text Emotion Scores'] = {emo['label']: emo['score'] for emo in text_emotions}
-
+                         whisper_result = whisper_model.transcribe(temp_audio_path, word_timestamps=True, fp16=False)
+                         full_transcription = whisper_result['text'].strip()
+
                          audio_array, _ = sf.read(temp_audio_path, dtype='float32')
                          if audio_array.ndim == 2: audio_array = audio_array.mean(axis=1)
-                         if len(audio_array) < 1024: audio_array = np.pad(audio_array, (0, 1024 - len(audio_array)))
-
-                         inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                         with torch.no_grad():
-                             logits = ser_model(**inputs).logits
-                         scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                         ser_scores = {ser_model.config.id2label[i]: score.item() for i, score in enumerate(scores)}
-                         audio_analysis_results['Speech Emotion Scores'] = ser_scores
-                     else:
-                         audio_analysis_results['Transcription'] = "No audio track found."
+
+                         for i in range(int(duration)):
+                             start_sample = i * AUDIO_SAMPLE_RATE
+                             end_sample = (i + 1) * AUDIO_SAMPLE_RATE
+                             chunk = audio_array[start_sample:end_sample]
+
+                             if len(chunk) > 400:
+                                 inputs = ser_feature_extractor(chunk, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                                 with torch.no_grad():
+                                     logits = ser_model(**inputs).logits
+                                 scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                                 ser_timeline[i] = {ser_model.config.id2label[k]: score.item() for k, score in enumerate(scores)}
+
+                             words_in_segment = [seg['word'] for seg in whisper_result['segments'] if seg['start'] >= i and seg['start'] < i+1 for seg in seg.get('words', [])]
+                             segment_text = " ".join(words_in_segment).strip()
+                             if segment_text:
+                                 text_emotions = text_classifier(segment_text)[0]
+                                 ter_timeline[i] = {emo['label']: emo['score'] for emo in text_emotions}
                  finally:
                      if video_clip: video_clip.close()
                      if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
@@ -149,58 +151,70 @@ if uploaded_file is not None:
              # --- Post-Analysis and Visualization ---
              st.header("Analysis Results")

-             # Prepare data for display
-             fer_avg_scores = pd.DataFrame(fer_timeline).T.mean().to_dict() if fer_timeline else {}
-             ser_scores = audio_analysis_results.get('Speech Emotion Scores', {})
-             text_scores = audio_analysis_results.get('Text Emotion Scores', {})
-
-             # Create vectors using the unified mappings. This ensures cosine similarity is correct.
-             fer_vector = create_unified_vector(fer_avg_scores, FACIAL_TO_UNIFIED)
-             ser_vector = create_unified_vector(ser_scores, SER_TO_UNIFIED)
-             text_vector = create_unified_vector(text_scores, TEXT_TO_UNIFIED)
+             fer_df = pd.DataFrame.from_dict(fer_timeline, orient='index').rename(columns=FACIAL_TO_UNIFIED)
+             ser_df = pd.DataFrame.from_dict(ser_timeline, orient='index').rename(columns=SER_TO_UNIFIED)
+             ter_df = pd.DataFrame.from_dict(ter_timeline, orient='index').rename(columns=TEXT_TO_UNIFIED)
+
+             fer_avg_scores = fer_df[UNIFIED_EMOTIONS].mean().to_dict()
+             ser_avg_scores = ser_df[UNIFIED_EMOTIONS].mean().to_dict()
+             ter_avg_scores = ter_df[UNIFIED_EMOTIONS].mean().to_dict()

-             # Calculate similarities
+             fer_vector = create_unified_vector(fer_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+             ser_vector = create_unified_vector(ser_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+             text_vector = create_unified_vector(ter_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+
              sim_face_text = cosine_similarity([fer_vector], [text_vector])[0][0]
              sim_face_speech = cosine_similarity([fer_vector], [ser_vector])[0][0]
              sim_speech_text = cosine_similarity([ser_vector], [text_vector])[0][0]
-             avg_similarity = np.mean([sim_face_text, sim_face_speech, sim_speech_text])
+             avg_similarity = np.mean([sim for sim in [sim_face_text, sim_face_speech, sim_speech_text] if not np.isnan(sim)])

-             # --- THIS IS THE FIX: Map dominant emotions to unified labels before displaying ---
              dominant_fer = max(fer_avg_scores, key=fer_avg_scores.get) if fer_avg_scores else "N/A"
-             dominant_text_raw = max(text_scores, key=text_scores.get) if text_scores else "N/A"
-             dominant_ser_raw = max(ser_scores, key=ser_scores.get) if ser_scores else "N/A"
-
-             # Convert raw dominant emotions to their unified, full-word versions for display
-             display_fer = FACIAL_TO_UNIFIED.get(dominant_fer, "N/A").capitalize()
+             dominant_text_raw = max(ter_avg_scores, key=ter_avg_scores.get) if ter_avg_scores else "N/A"
+             dominant_ser_raw = max(ser_avg_scores, key=ser_avg_scores.get) if ser_avg_scores else "N/A"
+
+             display_fer = FACIAL_TO_UNIFIED.get(dominant_fer.lower(), "N/A").capitalize()
              display_text = TEXT_TO_UNIFIED.get(dominant_text_raw, "N/A").capitalize()
              display_ser = SER_TO_UNIFIED.get(dominant_ser_raw, "N/A").capitalize()
-             # ===================================================================================

-             # Display metrics
              col1, col2 = st.columns([1, 2])
              with col1:
                  st.subheader("Multimodal Summary")
-                 st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
+                 st.write(f"**Transcription:** \"{full_transcription}\"")
                  st.metric("Dominant Facial Emotion", display_fer)
                  st.metric("Dominant Text Emotion", display_text)
                  st.metric("Dominant Speech Emotion", display_ser)
                  st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
-
+
              with col2:
-                 st.subheader("Facial Emotion Over Time")
-                 if fer_timeline:
-                     df = pd.DataFrame(fer_timeline).T
-                     # Filter for only the unified emotions we care about for the plot
-                     plot_columns = [k for k, v in FACIAL_TO_UNIFIED.items() if v is not None]
-                     df_filtered = df[plot_columns].rename(columns=FACIAL_TO_UNIFIED)
-                     st.line_chart(df_filtered[UNIFIED_EMOTIONS]) # Ensure consistent column order
-                 else:
-                     st.write("No faces detected to plot.")
-
+                 st.subheader("Unified Emotion Timeline")
+                 combined_df = pd.DataFrame(index=range(int(duration)))
+                 for emotion in UNIFIED_EMOTIONS:
+                     if emotion in fer_df: combined_df[f'Facial_{emotion}'] = fer_df[emotion]
+                     if emotion in ser_df: combined_df[f'Speech_{emotion}'] = ser_df[emotion]
+                     if emotion in ter_df: combined_df[f'Text_{emotion}'] = ter_df[emotion]
+
+                 combined_df.ffill(inplace=True)
+                 combined_df.fillna(0, inplace=True)
+
+                 fig, ax = plt.subplots(figsize=(10, 5))
+                 colors = {'happy': 'green', 'sad': 'blue', 'angry': 'red', 'neutral': 'gray'}
+                 styles = {'Facial': '-', 'Speech': '--', 'Text': ':'}
+
+                 for col in combined_df.columns:
+                     modality, emotion = col.split('_')
+                     if emotion in colors:
+                         ax.plot(combined_df.index, combined_df[col], label=f'{modality} {emotion.capitalize()}', color=colors[emotion], linestyle=styles[modality], alpha=0.8)
+
+                 ax.set_title("Emotion Confidence Over Time")
+                 ax.set_xlabel("Time (seconds)")
+                 ax.set_ylabel("Confidence Score")
+                 ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+                 ax.grid(True, which='both', linestyle='--', linewidth=0.5)
+                 plt.tight_layout()
+                 st.pyplot(fig)
      finally:
          if temp_video_path and os.path.exists(temp_video_path):
              time.sleep(1)
              try:
                  os.unlink(temp_video_path)
-             except Exception:
-                 pass
+             except Exception: pass
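
For readers tracing the new audio loop: instead of scoring the whole clip once, the app now buckets both speech-emotion and text-emotion analysis into one-second windows so they can be plotted against the per-second facial timeline. A standalone sketch of that bucketing pattern, with fabricated audio and segment data and the model calls elided:

# Illustrative only, not part of the commit. `audio_array` and `segments` are fabricated;
# in the app they come from the extracted WAV and whisper's word-timestamped result.
import numpy as np

AUDIO_SAMPLE_RATE = 16000
audio_array = np.zeros(AUDIO_SAMPLE_RATE * 3, dtype=np.float32)  # pretend 3 s of mono audio
segments = [
    {'start': 0.2, 'words': [{'word': 'hello'}, {'word': 'there'}]},
    {'start': 1.1, 'words': [{'word': 'friend'}]},
]

duration = len(audio_array) / AUDIO_SAMPLE_RATE
for i in range(int(duration)):
    # One-second slice fed to the speech-emotion model (skipped if shorter than 400 samples).
    chunk = audio_array[i * AUDIO_SAMPLE_RATE:(i + 1) * AUDIO_SAMPLE_RATE]
    if len(chunk) > 400:
        pass  # ser_feature_extractor(chunk, ...) -> ser_model -> softmax -> ser_timeline[i]

    # Words whose segment starts inside [i, i+1) feed the text classifier for this second.
    words = [w['word'] for seg in segments if i <= seg['start'] < i + 1 for w in seg.get('words', [])]
    if words:
        print(i, " ".join(words))  # -> text_classifier(...) -> ter_timeline[i]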