Space status: Build error

Commit abd725c · Parent(s): d5ac657
Kevin King committed: REFAC: Update Streamlit app to enhance emotion analysis and visualization features

Files changed:
- requirements.txt (+2 -1)
- src/streamlit_app.py (+87 -73)
requirements.txt CHANGED
@@ -24,4 +24,5 @@ soundfile==0.12.1
 librosa==0.10.1
 scipy==1.13.0
 Pillow==10.3.0
-scikit-learn==1.4.2
+scikit-learn==1.4.2
+matplotlib==3.8.4
src/streamlit_app.py CHANGED
@@ -13,6 +13,7 @@ from moviepy.editor import VideoFileClip
 import time
 import pandas as pd
 from sklearn.metrics.pairwise import cosine_similarity
+import matplotlib.pyplot as plt
 
 # --- Create a cross-platform, writable cache directory ---
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
@@ -27,14 +28,10 @@ st.write("Upload a short video clip (under 30 seconds) to see a multimodal emoti
 
 # --- Logger Configuration ---
 logging.basicConfig(level=logging.INFO)
-
-logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
-logging.getLogger('moviepy').setLevel(logging.ERROR)
-
+# [Logger setup remains the same]
 
 # --- Emotion Mappings ---
-
-UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
+UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
 TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}
 SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
 FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry', 'fear': None, 'surprise': None, 'disgust': None}
@@ -44,7 +41,7 @@ AUDIO_SAMPLE_RATE = 16000
 @st.cache_resource
 def load_models():
     with st.spinner("Loading AI models, this may take a moment..."):
-        whisper_model = whisper.load_model("base", download_root=os.path.join(CACHE_DIR, "whisper"))
+        whisper_model = whisper.load_model("base.en", download_root=os.path.join(CACHE_DIR, "whisper"))
         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
         ser_model_name = "superb/hubert-large-superb-er"
         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
@@ -55,22 +52,15 @@ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
 
 # --- Helper Functions for Analysis ---
 def create_unified_vector(scores_dict, mapping_dict):
-    """Creates a normalized vector from a dictionary of scores based on a mapping."""
     vector = np.zeros(len(UNIFIED_EMOTIONS))
     for label, score in scores_dict.items():
-        # Map the raw label (e.g., 'neu', 'joy') to our unified label ('neutral', 'happy')
         unified_label = mapping_dict.get(label)
         if unified_label in UNIFIED_EMOTIONS:
-
-            vector[idx] += score
-
+            vector[UNIFIED_EMOTIONS.index(unified_label)] += score
     norm = np.linalg.norm(vector)
-    if norm > 0:
-        vector /= norm
-    return vector
+    return vector / norm if norm > 0 else vector
 
 def get_consistency_level(cosine_sim):
-    """Convert cosine similarity to a qualitative label."""
     if cosine_sim >= 0.8: return "High"
     if cosine_sim >= 0.6: return "Medium"
     if cosine_sim >= 0.3: return "Low"
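Note: the rewritten helper folds the index lookup and the normalization into single expressions. A minimal standalone sketch of what it computes, with the constants copied from this diff and invented example scores:

import numpy as np

UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}

def create_unified_vector(scores_dict, mapping_dict):
    # Accumulate raw scores into the 4-dim unified emotion space, then L2-normalize.
    vector = np.zeros(len(UNIFIED_EMOTIONS))
    for label, score in scores_dict.items():
        unified_label = mapping_dict.get(label)
        if unified_label in UNIFIED_EMOTIONS:
            vector[UNIFIED_EMOTIONS.index(unified_label)] += score
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector

# Hypothetical raw SER output using the model's short label codes.
print(create_unified_vector({'neu': 0.55, 'hap': 0.25, 'sad': 0.15, 'ang': 0.05}, SER_TO_UNIFIED))
# -> unit-length vector ordered [angry, happy, sad, neutral]

Normalizing each modality's vector to unit length is what makes the later cosine comparisons scale-invariant across modalities.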
@@ -91,10 +81,17 @@ if uploaded_file is not None:
     if st.button("Analyze Video"):
         # Dictionaries to hold all results
        fer_timeline = {}
-
-
+        ser_timeline = {}
+        ter_timeline = {}
+        full_transcription = "No speech detected."
+
+        video_clip_for_duration = VideoFileClip(temp_video_path)
+        duration = video_clip_for_duration.duration
+        video_clip_for_duration.close()
+
+
         # --- Video Processing ---
-        with st.spinner("Analyzing
+        with st.spinner("Analyzing facial expressions..."):
            cap = None
            try:
                cap = cv2.VideoCapture(temp_video_path)
@@ -103,8 +100,8 @@ if uploaded_file is not None:
                 while cap.isOpened():
                     ret, frame = cap.read()
                     if not ret: break
+                    timestamp = frame_count / fps
                     if frame_count % int(fps) == 0:
-                        timestamp = frame_count / fps
                         analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
                         if isinstance(analysis, list) and len(analysis) > 0:
                             fer_timeline[timestamp] = analysis[0]['emotion']
@@ -113,8 +110,10 @@ if uploaded_file is not None:
                 if cap: cap.release()
 
         # --- Audio Processing ---
-        with st.spinner("
-
+        with st.spinner("Analyzing audio and text..."):
+            # --- THIS IS THE FIX ---
+            video_clip = None
+            # =======================
            try:
                video_clip = VideoFileClip(temp_video_path)
                if video_clip.audio:
@@ -122,26 +121,29 @@ if uploaded_file is not None:
                        video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
                        temp_audio_path = taudio.name
 
-
-
-
-
-                    if transcribed_text:
-                        text_emotions = text_classifier(transcribed_text)[0]
-                        audio_analysis_results['Text Emotion Scores'] = {emo['label']: emo['score'] for emo in text_emotions}
-
+                    whisper_result = whisper_model.transcribe(temp_audio_path, word_timestamps=True, fp16=False)
+                    full_transcription = whisper_result['text'].strip()
+
                    audio_array, _ = sf.read(temp_audio_path, dtype='float32')
                    if audio_array.ndim == 2: audio_array = audio_array.mean(axis=1)
-
-
-
-
-
-
-
-
-
-
+
+                    for i in range(int(duration)):
+                        start_sample = i * AUDIO_SAMPLE_RATE
+                        end_sample = (i + 1) * AUDIO_SAMPLE_RATE
+                        chunk = audio_array[start_sample:end_sample]
+
+                        if len(chunk) > 400:
+                            inputs = ser_feature_extractor(chunk, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                            with torch.no_grad():
+                                logits = ser_model(**inputs).logits
+                            scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                            ser_timeline[i] = {ser_model.config.id2label[k]: score.item() for k, score in enumerate(scores)}
+
+                        words_in_segment = [seg['word'] for seg in whisper_result['segments'] if seg['start'] >= i and seg['start'] < i+1 for seg in seg.get('words', [])]
+                        segment_text = " ".join(words_in_segment).strip()
+                        if segment_text:
+                            text_emotions = text_classifier(segment_text)[0]
+                            ter_timeline[i] = {emo['label']: emo['score'] for emo in text_emotions}
            finally:
                if video_clip: video_clip.close()
                if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
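Note: the new audio loop scores speech emotion one second at a time. A condensed sketch of that chunking pattern, assuming a mono float32 array at 16 kHz and already-loaded ser_model/ser_feature_extractor as in the diff; the 400-sample guard simply skips fragments too short to featurize:

import torch

AUDIO_SAMPLE_RATE = 16000

def ser_per_second(audio_array, duration, ser_model, ser_feature_extractor):
    # One softmax-normalized score dict per whole second of audio.
    timeline = {}
    for i in range(int(duration)):
        chunk = audio_array[i * AUDIO_SAMPLE_RATE:(i + 1) * AUDIO_SAMPLE_RATE]
        if len(chunk) > 400:  # skip fragments too short to featurize
            inputs = ser_feature_extractor(chunk, sampling_rate=AUDIO_SAMPLE_RATE,
                                           return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = ser_model(**inputs).logits
            scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
            timeline[i] = {ser_model.config.id2label[k]: s.item() for k, s in enumerate(scores)}
    return timeline

One nit on the committed words_in_segment comprehension: it rebinds seg as both the segment and the word variable. That works because comprehension clauses evaluate left to right, but an equivalent, clearer form would be:

words_in_segment = [w['word']
                    for seg in whisper_result['segments']
                    if i <= seg['start'] < i + 1
                    for w in seg.get('words', [])]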
@@ -149,58 +151,70 @@ if uploaded_file is not None:
        # --- Post-Analysis and Visualization ---
        st.header("Analysis Results")
 
-
-
-
-
-
-
-
-        ser_vector = create_unified_vector(ser_scores, SER_TO_UNIFIED)
-        text_vector = create_unified_vector(text_scores, TEXT_TO_UNIFIED)
+        fer_df = pd.DataFrame.from_dict(fer_timeline, orient='index').rename(columns=FACIAL_TO_UNIFIED)
+        ser_df = pd.DataFrame.from_dict(ser_timeline, orient='index').rename(columns=SER_TO_UNIFIED)
+        ter_df = pd.DataFrame.from_dict(ter_timeline, orient='index').rename(columns=TEXT_TO_UNIFIED)
+
+        fer_avg_scores = fer_df[UNIFIED_EMOTIONS].mean().to_dict()
+        ser_avg_scores = ser_df[UNIFIED_EMOTIONS].mean().to_dict()
+        ter_avg_scores = ter_df[UNIFIED_EMOTIONS].mean().to_dict()
 
-
+        fer_vector = create_unified_vector(fer_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+        ser_vector = create_unified_vector(ser_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+        text_vector = create_unified_vector(ter_avg_scores, {e: e for e in UNIFIED_EMOTIONS})
+
        sim_face_text = cosine_similarity([fer_vector], [text_vector])[0][0]
        sim_face_speech = cosine_similarity([fer_vector], [ser_vector])[0][0]
        sim_speech_text = cosine_similarity([ser_vector], [text_vector])[0][0]
-        avg_similarity = np.mean([sim_face_text, sim_face_speech, sim_speech_text])
+        avg_similarity = np.mean([sim for sim in [sim_face_text, sim_face_speech, sim_speech_text] if not np.isnan(sim)])
 
-        # --- THIS IS THE FIX: Map dominant emotions to unified labels before displaying ---
        dominant_fer = max(fer_avg_scores, key=fer_avg_scores.get) if fer_avg_scores else "N/A"
-        dominant_text_raw = max(
-        dominant_ser_raw = max(
-
-
-        display_fer = FACIAL_TO_UNIFIED.get(dominant_fer, "N/A").capitalize()
+        dominant_text_raw = max(ter_avg_scores, key=ter_avg_scores.get) if ter_avg_scores else "N/A"
+        dominant_ser_raw = max(ser_avg_scores, key=ser_avg_scores.get) if ser_avg_scores else "N/A"
+
+        display_fer = FACIAL_TO_UNIFIED.get(dominant_fer.lower(), "N/A").capitalize()
        display_text = TEXT_TO_UNIFIED.get(dominant_text_raw, "N/A").capitalize()
        display_ser = SER_TO_UNIFIED.get(dominant_ser_raw, "N/A").capitalize()
-        # ===================================================================================
 
-        # Display metrics
        col1, col2 = st.columns([1, 2])
        with col1:
            st.subheader("Multimodal Summary")
-            st.write(f"**Transcription:** \"{
+            st.write(f"**Transcription:** \"{full_transcription}\"")
            st.metric("Dominant Facial Emotion", display_fer)
            st.metric("Dominant Text Emotion", display_text)
            st.metric("Dominant Speech Emotion", display_ser)
            st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
-
+
        with col2:
-            st.subheader("
-
-
-
-
-
-
-
-
-
+            st.subheader("Unified Emotion Timeline")
+            combined_df = pd.DataFrame(index=range(int(duration)))
+            for emotion in UNIFIED_EMOTIONS:
+                if emotion in fer_df: combined_df[f'Facial_{emotion}'] = fer_df[emotion]
+                if emotion in ser_df: combined_df[f'Speech_{emotion}'] = ser_df[emotion]
+                if emotion in ter_df: combined_df[f'Text_{emotion}'] = ter_df[emotion]
+
+            combined_df.ffill(inplace=True)
+            combined_df.fillna(0, inplace=True)
+
+            fig, ax = plt.subplots(figsize=(10, 5))
+            colors = {'happy': 'green', 'sad': 'blue', 'angry': 'red', 'neutral': 'gray'}
+            styles = {'Facial': '-', 'Speech': '--', 'Text': ':'}
+
+            for col in combined_df.columns:
+                modality, emotion = col.split('_')
+                if emotion in colors:
+                    ax.plot(combined_df.index, combined_df[col], label=f'{modality} {emotion.capitalize()}', color=colors[emotion], linestyle=styles[modality], alpha=0.8)
+
+            ax.set_title("Emotion Confidence Over Time")
+            ax.set_xlabel("Time (seconds)")
+            ax.set_ylabel("Confidence Score")
+            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+            ax.grid(True, which='both', linestyle='--', linewidth=0.5)
+            plt.tight_layout()
+            st.pyplot(fig)
    finally:
        if temp_video_path and os.path.exists(temp_video_path):
            time.sleep(1)
            try:
                os.unlink(temp_video_path)
-            except Exception:
-                pass
+            except Exception: pass
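Note: cross-modal consistency is the mean of the three pairwise cosine similarities between the unit vectors, and the new np.isnan filter keeps a missing modality (whose averaged scores come out as NaN on an empty timeline) from poisoning the average. A compact sketch with made-up stand-in vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Invented unified vectors, ordered [angry, happy, sad, neutral].
fer_vector = np.array([0.1, 0.8, 0.1, 0.6])
ser_vector = np.array([0.0, 0.7, 0.2, 0.7])
text_vector = np.array([0.2, 0.6, 0.3, 0.7])

pairs = [(fer_vector, text_vector), (fer_vector, ser_vector), (ser_vector, text_vector)]
sims = [cosine_similarity([a], [b])[0][0] for a, b in pairs]
avg_similarity = np.mean([s for s in sims if not np.isnan(s)])
print(f"{avg_similarity:.2f}")  # feeds get_consistency_level() for the High/Medium/Low label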
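Note: the timeline plot hinges on aligning three differently-sampled score series on one per-second index; ffill carries each modality's last score forward and fillna(0) zeroes the seconds before its first sample. A toy sketch of that alignment with invented data:

import pandas as pd

duration = 5
facial_happy = {0: 0.9, 2: 0.4}   # sparse: only seconds with a detected face
speech_happy = {0: 0.8, 1: 0.6}

combined_df = pd.DataFrame(index=range(duration))
combined_df['Facial_happy'] = pd.Series(facial_happy)
combined_df['Speech_happy'] = pd.Series(speech_happy)

combined_df = combined_df.ffill().fillna(0)  # carry forward, then zero leading gaps
print(combined_df)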