Kevin King committed on
Commit
a667ff5
·
1 Parent(s): 75c12ca

refactor: simplify Streamlit app for webcam testing and remove unused components

Files changed (2)
  1. src/streamlit_app.py +15 -173
  2. src/streamlit_app.py.old +199 -0
src/streamlit_app.py CHANGED
@@ -1,68 +1,19 @@
- import os
  import streamlit as st
-
- # Set home directories for model caching inside the app's writable directory
- os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
- os.environ['HF_HOME'] = '/tmp/huggingface'
-
  from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
  import av
- import numpy as np
- import torch
- import whisper
- from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
- from deepface import DeepFace
- import logging
- import queue
- import soundfile as sf
- from scipy.io.wavfile import write as write_wav
- import tempfile

  # --- Page Configuration ---
  st.set_page_config(
-     page_title="AffectLink Online Demo",
-     page_icon="😊",
+     page_title="AffectLink Webcam Test",
+     page_icon="📸",
      layout="wide"
  )

- st.title("AffectLink: Real-time Emotion Recognition")
- st.write("This demo analyzes your facial expressions in real-time and processes short audio clips for speech and text-based emotion.")
-
- # --- Logger Configuration ---
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('deepface').setLevel(logging.ERROR)
- logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
-
- # --- Emotion Mappings ---
- UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
- FACIAL_TO_UNIFIED = {
-     'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry',
-     'fear': None, 'surprise': None, 'disgust': None
- }
- TEXT_TO_UNIFIED = {
-     'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
-     'fear': None, 'surprise': None, 'disgust': None
- }
- SER_TO_UNIFIED = {
-     'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
- }
- AUDIO_SAMPLE_RATE = 16000
-
- # --- Model Loading ---
- @st.cache_resource
- def load_models():
-     with st.spinner("Loading AI models... This may take a moment on first run."):
-         whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
-         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
-         ser_model_name = "superb/hubert-large-superb-er"
-         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
-         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
-     return whisper_model, text_classifier, ser_model, ser_feature_extractor
-
- whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
-
- # --- WebRTC and Video Processing ---
- # === THIS IS THE UPDATED CONFIGURATION ===
+ st.title("AffectLink: Webcam Feed Test")
+ st.write("This is a minimal test to check if the webcam video stream can be established.")
+
+ # --- WebRTC Configuration ---
+ # Using a robust list of public STUN servers to help establish a connection
  RTC_CONFIGURATION = RTCConfiguration(
      {"iceServers": [{"urls": [
          "stun:stun.l.google.com:19302",
@@ -72,128 +23,19 @@ RTC_CONFIGURATION = RTCConfiguration(
          "stun:stun4.l.google.com:19302",
      ]}]}
  )
- # ==========================================
-
- webrtc_ctx = webrtc_streamer(
-     key="affectlink-video",
-     mode=WebRtcMode.SENDRECV,
-     rtc_configuration=RTC_CONFIGURATION,  # Use the new config
-     media_stream_constraints={"video": True, "audio": False},
-     async_processing=True,
- )
-
- if 'facial_emotion' not in st.session_state:
-     st.session_state.facial_emotion = "Neutral"
- if 'last_emotion_time' not in st.session_state:
-     st.session_state.last_emotion_time = 0

+ # A simple "pass-through" function that returns the frame without modification
  def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
-     img = frame.to_ndarray(format="bgr24")
-     current_time = st.session_state.get('last_emotion_time', 0)
-     if torch.cuda.is_available() or (hasattr(st.session_state, 'last_emotion_time') and (torch.tensor(current_time).item() + 5 < torch.tensor(frame.time).item())):
-         try:
-             analysis = DeepFace.analyze(img, actions=['emotion'], enforce_detection=False, silent=True)
-             if isinstance(analysis, list) and len(analysis) > 0:
-                 dominant_emotion = analysis[0]['dominant_emotion']
-                 st.session_state.facial_emotion = dominant_emotion.capitalize()
-             else:
-                 st.session_state.facial_emotion = "Unknown"
-         except Exception as e:
-             logging.error(f"DeepFace analysis failed: {e}")
-             st.session_state.facial_emotion = "Error"
-         st.session_state.last_emotion_time = frame.time
-     return av.VideoFrame.from_ndarray(img, format="bgr24")
+     return frame
-
- if webrtc_ctx.video_processor:
-     webrtc_ctx.video_processor.video_frame_callback = video_frame_callback
-
- # --- Audio Processing ---
- if "audio_buffer" not in st.session_state:
-     st.session_state.audio_buffer = []
-
- def audio_frame_callback(frame: av.AudioFrame):
-     sound = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
-     st.session_state.audio_buffer.append(sound)

+ # --- WebRTC Streamer ---
  webrtc_streamer(
-     key="affectlink-audio",
-     mode=WebRtcMode.RECVONLY,
-     rtc_configuration=RTC_CONFIGURATION,  # Use the new config
-     media_stream_constraints={"video": False, "audio": True},
-     audio_frame_callback=audio_frame_callback,
+     key="webcam-test",
+     mode=WebRtcMode.SENDRECV,
+     rtc_configuration=RTC_CONFIGURATION,
+     media_stream_constraints={"video": True, "audio": False},
+     video_frame_callback=video_frame_callback,
      async_processing=True,
  )

- # --- UI Layout ---
- st.sidebar.header("Facial Emotion")
- st.sidebar.metric("Current Expression", st.session_state.get('facial_emotion', 'N/A'))
- st.sidebar.info("Facial emotion is updated every 5 seconds.")
- st.sidebar.divider()
-
- st.sidebar.header("Audio Analysis")
- is_recording = st.sidebar.checkbox("Start Recording Audio")
-
- st.sidebar.subheader("Transcription:")
- transcription_placeholder = st.sidebar.empty()
- transcription_placeholder.write("_Waiting for audio..._")
-
- st.sidebar.subheader("Text Emotion:")
- text_emotion_placeholder = st.sidebar.empty()
- text_emotion_placeholder.write("_Waiting for audio..._")
-
- st.sidebar.subheader("Speech Emotion:")
- ser_placeholder = st.sidebar.empty()
- ser_placeholder.write("_Waiting for audio..._")
-
- if not is_recording and st.session_state.audio_buffer:
-     audio_data = np.concatenate(st.session_state.audio_buffer)
-     st.session_state.audio_buffer = []
-
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
-         write_wav(tmp_audio_file.name, AUDIO_SAMPLE_RATE, audio_data)
-         with st.spinner("Transcribing audio..."):
-             try:
-                 result = whisper_model.transcribe(tmp_audio_file.name, fp16=False)
-                 transcribed_text = result['text']
-             except Exception as e:
-                 transcribed_text = f"Transcription failed: {e}"
-             transcription_placeholder.write(f'"{transcribed_text}"')
-
-         with st.spinner("Analyzing text emotion..."):
-             if transcribed_text:
-                 try:
-                     text_emotions = text_classifier(transcribed_text)[0]
-                     unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                     for EMO in text_emotions:
-                         unified_emo = TEXT_TO_UNIFIED.get(EMO['label'])
-                         if unified_emo:
-                             unified_text_scores[unified_emo] += EMO['score']
-                     dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
-                 except Exception as e:
-                     dominant_text_emotion = f"Text analysis failed: {e}"
-             else:
-                 dominant_text_emotion = "No text to analyze."
-             text_emotion_placeholder.write(dominant_text_emotion.capitalize())
-
-         with st.spinner("Analyzing speech emotion..."):
-             try:
-                 audio_array, _ = sf.read(tmp_audio_file.name)
-                 inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                 with torch.no_grad():
-                     logits = ser_model(**inputs).logits
-                 scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                 unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                 for i, score in enumerate(scores):
-                     raw_emo = ser_model.config.id2label[i]
-                     unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                     if unified_emo:
-                         unified_ser_scores[unified_emo] += score.item()
-                 dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
-             except Exception as e:
-                 dominant_ser_emotion = f"Speech analysis failed: {e}"
-             ser_placeholder.write(dominant_ser_emotion.capitalize())
-
-     os.unlink(tmp_audio_file.name)
-
- elif is_recording:
-     st.sidebar.warning("Recording audio... Uncheck to stop and process.")
+ st.sidebar.info("This is a test page. All AI analysis and audio processing has been disabled.")
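The new app's pass-through callback returns each frame untouched, which makes a healthy stream hard to distinguish at a glance from a stalled one. A slightly stronger check is to stamp a running counter on each frame before returning it, so server-side processing is visible in the video itself. A minimal sketch, assuming OpenCV (`cv2`) is available in the environment; it is not imported by this commit:

import cv2  # assumption: OpenCV is installed; this commit does not import it

frame_count = 0  # updated only by the webrtc worker thread

def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
    global frame_count
    frame_count += 1
    img = frame.to_ndarray(format="bgr24")
    # A visibly increasing counter proves frames are flowing through the
    # Python callback rather than just being echoed by the browser.
    cv2.putText(img, f"frame {frame_count}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)
    return av.VideoFrame.from_ndarray(img, format="bgr24")

If the counter advances in the returned stream, the SENDRECV path works end to end and the DeepFace analysis can be layered back on top.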
src/streamlit_app.py.old ADDED
@@ -0,0 +1,199 @@
+ import os
+ import streamlit as st
+
+ # Set home directories for model caching inside the app's writable directory
+ os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
+ os.environ['HF_HOME'] = '/tmp/huggingface'
+
+ from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
+ import av
+ import numpy as np
+ import torch
+ import whisper
+ from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
+ from deepface import DeepFace
+ import logging
+ import queue
+ import soundfile as sf
+ from scipy.io.wavfile import write as write_wav
+ import tempfile
+
+ # --- Page Configuration ---
+ st.set_page_config(
+     page_title="AffectLink Online Demo",
+     page_icon="😊",
+     layout="wide"
+ )
+
+ st.title("AffectLink: Real-time Emotion Recognition")
+ st.write("This demo analyzes your facial expressions in real-time and processes short audio clips for speech and text-based emotion.")
+
+ # --- Logger Configuration ---
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('deepface').setLevel(logging.ERROR)
+ logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
+
+ # --- Emotion Mappings ---
+ UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
+ FACIAL_TO_UNIFIED = {
+     'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry',
+     'fear': None, 'surprise': None, 'disgust': None
+ }
+ TEXT_TO_UNIFIED = {
+     'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
+     'fear': None, 'surprise': None, 'disgust': None
+ }
+ SER_TO_UNIFIED = {
+     'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
+ }
+ AUDIO_SAMPLE_RATE = 16000
+
+ # --- Model Loading ---
+ @st.cache_resource
+ def load_models():
+     with st.spinner("Loading AI models... This may take a moment on first run."):
+         whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
+         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
+         ser_model_name = "superb/hubert-large-superb-er"
+         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
+         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
+     return whisper_model, text_classifier, ser_model, ser_feature_extractor
+
+ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
+
+ # --- WebRTC and Video Processing ---
+ # === THIS IS THE UPDATED CONFIGURATION ===
+ RTC_CONFIGURATION = RTCConfiguration(
+     {"iceServers": [{"urls": [
+         "stun:stun.l.google.com:19302",
+         "stun:stun1.l.google.com:19302",
+         "stun:stun2.l.google.com:19302",
+         "stun:stun3.l.google.com:19302",
+         "stun:stun4.l.google.com:19302",
+     ]}]}
+ )
+ # ==========================================
+
+ webrtc_ctx = webrtc_streamer(
+     key="affectlink-video",
+     mode=WebRtcMode.SENDRECV,
+     rtc_configuration=RTC_CONFIGURATION,  # Use the new config
+     media_stream_constraints={"video": True, "audio": False},
+     async_processing=True,
+ )
+
+ if 'facial_emotion' not in st.session_state:
+     st.session_state.facial_emotion = "Neutral"
+ if 'last_emotion_time' not in st.session_state:
+     st.session_state.last_emotion_time = 0
+
+ def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
+     img = frame.to_ndarray(format="bgr24")
+     current_time = st.session_state.get('last_emotion_time', 0)
+     if torch.cuda.is_available() or (hasattr(st.session_state, 'last_emotion_time') and (torch.tensor(current_time).item() + 5 < torch.tensor(frame.time).item())):
+         try:
+             analysis = DeepFace.analyze(img, actions=['emotion'], enforce_detection=False, silent=True)
+             if isinstance(analysis, list) and len(analysis) > 0:
+                 dominant_emotion = analysis[0]['dominant_emotion']
+                 st.session_state.facial_emotion = dominant_emotion.capitalize()
+             else:
+                 st.session_state.facial_emotion = "Unknown"
+         except Exception as e:
+             logging.error(f"DeepFace analysis failed: {e}")
+             st.session_state.facial_emotion = "Error"
+         st.session_state.last_emotion_time = frame.time
+     return av.VideoFrame.from_ndarray(img, format="bgr24")
+
+ if webrtc_ctx.video_processor:
+     webrtc_ctx.video_processor.video_frame_callback = video_frame_callback
+
+ # --- Audio Processing ---
+ if "audio_buffer" not in st.session_state:
+     st.session_state.audio_buffer = []
+
+ def audio_frame_callback(frame: av.AudioFrame):
+     sound = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
+     st.session_state.audio_buffer.append(sound)
+
+ webrtc_streamer(
+     key="affectlink-audio",
+     mode=WebRtcMode.RECVONLY,
+     rtc_configuration=RTC_CONFIGURATION,  # Use the new config
+     media_stream_constraints={"video": False, "audio": True},
+     audio_frame_callback=audio_frame_callback,
+     async_processing=True,
+ )
+
+ # --- UI Layout ---
+ st.sidebar.header("Facial Emotion")
+ st.sidebar.metric("Current Expression", st.session_state.get('facial_emotion', 'N/A'))
+ st.sidebar.info("Facial emotion is updated every 5 seconds.")
+ st.sidebar.divider()
+
+ st.sidebar.header("Audio Analysis")
+ is_recording = st.sidebar.checkbox("Start Recording Audio")
+
+ st.sidebar.subheader("Transcription:")
+ transcription_placeholder = st.sidebar.empty()
+ transcription_placeholder.write("_Waiting for audio..._")
+
+ st.sidebar.subheader("Text Emotion:")
+ text_emotion_placeholder = st.sidebar.empty()
+ text_emotion_placeholder.write("_Waiting for audio..._")
+
+ st.sidebar.subheader("Speech Emotion:")
+ ser_placeholder = st.sidebar.empty()
+ ser_placeholder.write("_Waiting for audio..._")
+
+ if not is_recording and st.session_state.audio_buffer:
+     audio_data = np.concatenate(st.session_state.audio_buffer)
+     st.session_state.audio_buffer = []
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
+         write_wav(tmp_audio_file.name, AUDIO_SAMPLE_RATE, audio_data)
+         with st.spinner("Transcribing audio..."):
+             try:
+                 result = whisper_model.transcribe(tmp_audio_file.name, fp16=False)
+                 transcribed_text = result['text']
+             except Exception as e:
+                 transcribed_text = f"Transcription failed: {e}"
+             transcription_placeholder.write(f'"{transcribed_text}"')
+
+         with st.spinner("Analyzing text emotion..."):
+             if transcribed_text:
+                 try:
+                     text_emotions = text_classifier(transcribed_text)[0]
+                     unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                     for EMO in text_emotions:
+                         unified_emo = TEXT_TO_UNIFIED.get(EMO['label'])
+                         if unified_emo:
+                             unified_text_scores[unified_emo] += EMO['score']
+                     dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
+                 except Exception as e:
+                     dominant_text_emotion = f"Text analysis failed: {e}"
+             else:
+                 dominant_text_emotion = "No text to analyze."
+             text_emotion_placeholder.write(dominant_text_emotion.capitalize())
+
+         with st.spinner("Analyzing speech emotion..."):
+             try:
+                 audio_array, _ = sf.read(tmp_audio_file.name)
+                 inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                 with torch.no_grad():
+                     logits = ser_model(**inputs).logits
+                 scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                 unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                 for i, score in enumerate(scores):
+                     raw_emo = ser_model.config.id2label[i]
+                     unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                     if unified_emo:
+                         unified_ser_scores[unified_emo] += score.item()
+                 dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
+             except Exception as e:
+                 dominant_ser_emotion = f"Speech analysis failed: {e}"
+             ser_placeholder.write(dominant_ser_emotion.capitalize())
+
+     os.unlink(tmp_audio_file.name)
+
+ elif is_recording:
+     st.sidebar.warning("Recording audio... Uncheck to stop and process.")
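A caveat worth recording next to the archived version: streamlit_webrtc invokes video_frame_callback and audio_frame_callback on a worker thread, so the st.session_state reads and writes inside them are not reliable, and the throttling test routes plain floats through torch.tensor(...).item() behind a torch.cuda.is_available() short-circuit, which makes GPU hosts run DeepFace on every frame. If the analysis is reintroduced later, a lock-guarded holder shared between threads is a safer pattern. A minimal sketch; the EmotionState class and its methods are hypothetical, not part of this commit:

import threading
import time

class EmotionState:
    """Hypothetical lock-guarded holder shared between the webrtc
    worker thread and the Streamlit script thread."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._emotion = "Neutral"
        self._last_run = 0.0

    def set_emotion(self, value: str) -> None:
        with self._lock:
            self._emotion = value

    def get_emotion(self) -> str:
        with self._lock:
            return self._emotion

    def should_analyze(self, interval: float = 5.0) -> bool:
        # Plain wall-clock throttle; no tensor round-trips, no CUDA check.
        with self._lock:
            now = time.monotonic()
            if now - self._last_run >= interval:
                self._last_run = now
                return True
            return False

state = EmotionState()
# worker thread:  if state.should_analyze(): state.set_emotion(dominant_emotion)
# script thread:  st.sidebar.metric("Current Expression", state.get_emotion())

The holder must be created once and shared with the callback (for example via a closure or st.cache_resource) so both threads see the same instance.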