Kevin King committed
Upload 3 files
reworked app.py file as a skeleton
- app.py +23 -11
- requirements.txt +2 -1
app.py CHANGED
```diff
@@ -60,6 +60,9 @@ def load_models():
     ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
     ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)

+    # NEW: Pre-load the DeepFace model to prevent lag on first use
+    DeepFace.build_model("Emotion")
+
     return whisper_model, text_classifier, ser_model, ser_feature_extractor

 whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
```
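Pre-building the DeepFace graph inside the loader only pays off if the loader runs once per process. A minimal sketch of that pattern, assuming (the hunk doesn't show it) that `load_models()` is wrapped in Streamlit's `st.cache_resource`:

```python
import streamlit as st
from deepface import DeepFace

@st.cache_resource  # assumed decorator: runs once per process, not on every rerun
def load_emotion_model():
    # Building the model here moves the weight download and graph construction
    # out of the first webcam frame's code path, which is what caused the lag.
    return DeepFace.build_model("Emotion")

emotion_model = load_emotion_model()
```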
```diff
@@ -123,16 +126,30 @@ webrtc_streamer(
     async_processing=True,
 )

+# --- UI Layout ---
 st.sidebar.header("Facial Emotion")
 st.sidebar.metric("Current Expression", st.session_state.get('facial_emotion', 'N/A'))
 st.sidebar.info("Facial emotion is updated every 5 seconds to optimize performance.")
+st.sidebar.divider()

 st.sidebar.header("Audio Analysis")
-is_recording = st.sidebar.checkbox("Start Recording")
+is_recording = st.sidebar.checkbox("Start Recording Audio")
+
+# NEW: Set up placeholders for audio results
+st.sidebar.subheader("Transcription:")
+transcription_placeholder = st.sidebar.empty()
+transcription_placeholder.write("_Waiting for audio..._")
+
+st.sidebar.subheader("Text Emotion:")
+text_emotion_placeholder = st.sidebar.empty()
+text_emotion_placeholder.write("_Waiting for audio..._")
+
+st.sidebar.subheader("Speech Emotion:")
+ser_placeholder = st.sidebar.empty()
+ser_placeholder.write("_Waiting for audio..._")
+

 if not is_recording and st.session_state.audio_buffer:
-    st.sidebar.info("Processing audio... please wait.")
-
     # Combine audio chunks
     audio_data = np.concatenate(st.session_state.audio_buffer)
     st.session_state.audio_buffer = []  # Clear buffer
```
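The `st.sidebar.empty()` calls reserve fixed slots in the sidebar; later writes replace a slot's content in place instead of appending below it, which is what keeps each result aligned under its subheader across reruns. A stripped-down illustration:

```python
import streamlit as st

st.sidebar.subheader("Transcription:")
slot = st.sidebar.empty()             # reserve a single updatable slot
slot.write("_Waiting for audio..._")  # initial placeholder text

# ...later in the same script run, once results exist:
slot.write('"hello world"')           # overwrites the placeholder in place
```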
```diff
@@ -148,15 +165,13 @@ if not is_recording and st.session_state.audio_buffer:
         transcribed_text = result['text']
     except Exception as e:
         transcribed_text = f"Transcription failed: {e}"
-
-    st.sidebar.write(transcribed_text)
+    transcription_placeholder.write(f'"{transcribed_text}"')

     # 2. Text-based Emotion
     with st.spinner("Analyzing text emotion..."):
         if transcribed_text:
             try:
                 text_emotions = text_classifier(transcribed_text)[0]
-                # Map to unified emotions
                 unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
                 for EMO in text_emotions:
                     unified_emo = TEXT_TO_UNIFIED.get(EMO['label'])
```
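The mapping loop is cut off by the hunk boundary; the idea is to pool the classifier's raw labels into the app's unified set. A sketch with hypothetical values for `UNIFIED_EMOTIONS` and `TEXT_TO_UNIFIED` (both are defined elsewhere in app.py, so the real contents may differ):

```python
# Hypothetical stand-ins for the constants defined elsewhere in app.py
UNIFIED_EMOTIONS = ["angry", "happy", "sad", "neutral"]
TEXT_TO_UNIFIED = {"anger": "angry", "joy": "happy", "sadness": "sad"}

def unify_text_emotions(text_emotions):
    """text_emotions: classifier output, [{'label': ..., 'score': ...}, ...]"""
    scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
    for emo in text_emotions:
        unified = TEXT_TO_UNIFIED.get(emo["label"])
        if unified is not None:
            scores[unified] += emo["score"]  # pool raw labels into unified buckets
    return max(scores, key=scores.get)       # dominant unified emotion
```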
```diff
@@ -168,8 +183,7 @@ if not is_recording and st.session_state.audio_buffer:
                 dominant_text_emotion = f"Text analysis failed: {e}"
         else:
             dominant_text_emotion = "No text to analyze."
-
-    st.sidebar.write(dominant_text_emotion.capitalize())
+        text_emotion_placeholder.write(dominant_text_emotion.capitalize())

     # 3. Speech Emotion Recognition (SER)
     with st.spinner("Analyzing speech emotion..."):
```
```diff
@@ -180,7 +194,6 @@ if not is_recording and st.session_state.audio_buffer:
             logits = ser_model(**inputs).logits
             scores = torch.nn.functional.softmax(logits, dim=1).squeeze()

-            # Map to unified emotions
             unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
             for i, score in enumerate(scores):
                 raw_emo = ser_model.config.id2label[i]
```
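The SER path mirrors the text path: softmax the logits, then fold each model-specific label into the unified set via `id2label`. A self-contained sketch (the mapping dict is passed in as a parameter here because the hunk doesn't show the actual name used in app.py):

```python
import torch

def unify_ser_scores(logits, id2label, unified_emotions, ser_to_unified):
    # Softmax over the class dimension turns logits into per-label probabilities
    probs = torch.nn.functional.softmax(logits, dim=1).squeeze()
    unified = {e: 0.0 for e in unified_emotions}
    for i, p in enumerate(probs):
        raw = id2label[i]                 # model-specific label, e.g. "ang"
        target = ser_to_unified.get(raw)  # assumed mapping, by analogy with TEXT_TO_UNIFIED
        if target is not None:
            unified[target] += float(p)
    return unified
```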
```diff
@@ -191,8 +204,7 @@ if not is_recording and st.session_state.audio_buffer:
             dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
         except Exception as e:
             dominant_ser_emotion = f"Speech analysis failed: {e}"
-
-    st.sidebar.write(dominant_ser_emotion.capitalize())
+        ser_placeholder.write(dominant_ser_emotion.capitalize())

     # Clean up the temporary file
     os.unlink(tmp_audio_file.name)
```
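For context on `tmp_audio_file`: the usual pattern (not fully visible in this diff) is to write the buffered audio to a named temp file so Whisper can read it from a path, then delete it once all three analyses are done. A sketch under that assumption:

```python
import os
import tempfile

import numpy as np
import soundfile as sf

audio_data = np.zeros(16000, dtype=np.float32)  # stand-in for the webrtc buffer
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio_file:
    sf.write(tmp_audio_file.name, audio_data, 16000)  # 16 kHz mono WAV

# ...whisper_model.transcribe(tmp_audio_file.name), then text + SER analysis...

os.unlink(tmp_audio_file.name)  # clean up the temporary file
```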
requirements.txt CHANGED

```diff
@@ -11,4 +11,5 @@ torchaudio
 openai-whisper
 soundfile
 librosa
-scipy
+scipy
+streamlit-autorefresh
```
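`streamlit-autorefresh` is presumably what drives the "updated every 5 seconds" behavior mentioned in the sidebar: it re-runs the script on a timer so the facial-emotion metric refreshes without user interaction. Typical usage:

```python
from streamlit_autorefresh import st_autorefresh

# Re-run the script every 5000 ms; the key keeps the widget stable across reruns.
st_autorefresh(interval=5000, key="emotion_refresh")
```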