import os
import librosa
import numpy as np
import speech_recognition as sr
from groq import Groq
from inference_sdk import InferenceHTTPClient
from transformers import pipeline
# Initialize the voice emotion pipeline once (global)
# This prevents reloading the model on every function call
# Build the speech-emotion classifier exactly once at import time so that
# get_voice_emotion() never pays the model-initialisation cost per call.
# On any failure the sentinel stays None and callers degrade gracefully.
voice_pipe = None
try:
    voice_pipe = pipeline(
        "audio-classification",
        model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    )
except Exception as load_err:
    print(f"Warning: Could not load voice emotion model: {load_err}")
def get_facial_emotion(image_path):
    """
    Detect the dominant facial emotion in an image via the Roboflow API.

    Args:
        image_path: Filesystem path of the image to analyze.

    Returns:
        str: Emotion label such as "happy" or "sad"; falls back to
        "neutral" when the API key is missing, no face is detected,
        or any error occurs.
    """
    try:
        # Credentials come from the environment; bail out early if absent.
        roboflow_key = os.getenv("ROBOFLOW_API_KEY")
        if not roboflow_key:
            print("Error: ROBOFLOW_API_KEY not found in environment variables")
            return "neutral"

        api = InferenceHTTPClient(
            api_url="https://detect.roboflow.com",
            api_key=roboflow_key,
        )

        # Query the hosted face-emotion model.
        outcome = api.infer(image_path, model_id="human-face-emotions/28")

        # Use the first prediction when the response carries any.
        if outcome and "predictions" in outcome and outcome["predictions"]:
            best = outcome["predictions"][0]
            emotion = best.get("class", "neutral")
            confidence = best.get("confidence", 0)
            print(f"Facial emotion detected: {emotion} (confidence: {confidence:.2f})")
            return emotion

        print("No face detected in image")
        return "neutral"
    except Exception as e:
        # Best-effort: never propagate, always hand back a usable label.
        print(f"Error in facial emotion detection: {e}")
        return "neutral"
def get_voice_emotion(audio_path):
    """
    Classify the vocal emotion in an audio file with the preloaded
    Hugging Face audio-classification pipeline.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        str: Emotion label such as "calm" or "angry"; falls back to
        "neutral" when the model is unavailable, produces no result,
        or any error occurs.
    """
    try:
        # The pipeline may have failed to load at import time.
        if voice_pipe is None:
            print("Voice emotion model not loaded")
            return "neutral"

        # The wav2vec2 model expects 16 kHz input, so resample on load.
        samples, _rate = librosa.load(audio_path, sr=16000)
        predictions = voice_pipe(samples)

        # Pipeline output is sorted; an empty result means no label.
        if not predictions:
            return "neutral"

        best = predictions[0]
        label = best.get("label", "neutral")
        confidence = best.get("score", 0)
        print(f"Voice emotion detected: {label} (score: {confidence:.2f})")
        return label
    except Exception as e:
        # Best-effort: never propagate, always hand back a usable label.
        print(f"Error in voice emotion detection: {e}")
        return "neutral"
def get_transcript(audio_path):
    """
    Convert speech in an audio file to text using Google Speech Recognition.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        str: The transcribed text, or "" when the audio is unintelligible,
        the recognition service cannot be reached, or any other error occurs.
    """
    try:
        recognizer = sr.Recognizer()

        # Read the whole file into an AudioData buffer, then transcribe.
        with sr.AudioFile(audio_path) as wav_source:
            captured = recognizer.record(wav_source)
        transcript = recognizer.recognize_google(captured)

        print(f"Transcription: {transcript}")
        return transcript
    except sr.UnknownValueError:
        # Speech was present but could not be decoded.
        print("Could not understand audio")
        return ""
    except sr.RequestError as e:
        # Network / API failure talking to Google's service.
        print(f"Could not request results from Google Speech Recognition service: {e}")
        return ""
    except Exception as e:
        print(f"Error in transcription: {e}")
        return ""
def get_llm_response(user_query, face, voice, text):
    """
    Ask the Groq LLM for an empathetic reply informed by multimodal
    emotion signals.

    Args:
        user_query: The user's typed query.
        face: Emotion label detected from the face image.
        voice: Emotion label detected from the vocal tone.
        text: Transcript of the user's speech (may be empty).

    Returns:
        str: The model-generated empathetic response, or an error-message
        string when the API key is missing or the request fails.
    """
    try:
        # Credentials come from the environment; bail out early if absent.
        groq_key = os.getenv("GROQ_API_KEY")
        if not groq_key:
            return "Error: GROQ_API_KEY not found in environment variables"

        llm = Groq(api_key=groq_key)

        # Fold the emotional context into a single system prompt.
        spoken = text if text else "No speech detected"
        system_prompt = f"""You are an empathetic AI assistant that provides thoughtful, caring responses based on the user's emotional state.
**Emotional Context Analysis:**
- Facial Expression: {face}
- Vocal Tone: {voice}
- Spoken Words: {spoken}
**Instructions:**
1. First, acknowledge and validate the user's emotional state based on the above indicators
2. Show empathy and understanding
3. Provide a helpful, supportive answer to their query
4. Keep your response warm, genuine, and human-like
5. If there are discrepancies between emotional signals, address them sensitively
**User's Query:** {user_query}
Respond in a natural, conversational manner that demonstrates emotional intelligence."""

        completion = llm.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.1-8b-instant",
            temperature=0.7,
            max_tokens=1024,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating response: {e}"
# The record_audio function has been removed as it is no longer needed.
# st.audio_recorder in app.py handles audio capture in the browser.
|