Spaces:
Sleeping
Sleeping
| import os | |
| import librosa | |
| import numpy as np | |
| import speech_recognition as sr | |
| from groq import Groq | |
| from inference_sdk import InferenceHTTPClient | |
| from transformers import pipeline | |
# Load the speech-emotion classifier exactly once at import time so that
# every call to get_voice_emotion() reuses the same model instead of
# paying the (large) model-load cost per request.
voice_pipe = None
try:
    voice_pipe = pipeline(
        "audio-classification",
        model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    )
except Exception as e:
    # Keep the app importable even when the model cannot be fetched;
    # get_voice_emotion() degrades to returning "neutral" in that case.
    print(f"Warning: Could not load voice emotion model: {e}")
def get_facial_emotion(image_path):
    """
    Detect the dominant facial emotion in an image via the Roboflow API.

    Args:
        image_path: Path to the image file to analyze.

    Returns:
        str: The top prediction's emotion label (e.g. "happy", "sad"),
        or "neutral" when the API key is missing, no face is detected,
        or any error occurs.
    """
    try:
        # The hosted-inference key must come from the environment;
        # without it we degrade gracefully instead of raising.
        roboflow_key = os.getenv("ROBOFLOW_API_KEY")
        if not roboflow_key:
            print("Error: ROBOFLOW_API_KEY not found in environment variables")
            return "neutral"

        detector = InferenceHTTPClient(
            api_url="https://detect.roboflow.com",
            api_key=roboflow_key,
        )
        response = detector.infer(image_path, model_id="human-face-emotions/28")

        # An empty/missing predictions list means no face was found.
        if not (response and "predictions" in response and len(response["predictions"]) > 0):
            print("No face detected in image")
            return "neutral"

        best = response["predictions"][0]
        emotion = best.get("class", "neutral")
        confidence = best.get("confidence", 0)
        print(f"Facial emotion detected: {emotion} (confidence: {confidence:.2f})")
        return emotion
    except Exception as e:
        print(f"Error in facial emotion detection: {e}")
        return "neutral"
def get_voice_emotion(audio_path):
    """
    Classify the emotion carried by the speaker's voice in an audio file.

    Uses the module-level `voice_pipe` Hugging Face pipeline loaded at
    import time.

    Args:
        audio_path: Path to the audio file to analyze.

    Returns:
        str: The highest-scoring emotion label (e.g. "calm", "angry",
        "happy"), or "neutral" when the model is unavailable or
        analysis fails.
    """
    try:
        # The pipeline may have failed to load at import time.
        if voice_pipe is None:
            print("Voice emotion model not loaded")
            return "neutral"

        # The wav2vec2 model expects 16 kHz input, so resample on load.
        samples, _ = librosa.load(audio_path, sr=16000)
        predictions = voice_pipe(samples)

        if not predictions:
            return "neutral"

        best = predictions[0]
        label = best.get("label", "neutral")
        score = best.get("score", 0)
        print(f"Voice emotion detected: {label} (score: {score:.2f})")
        return label
    except Exception as e:
        print(f"Error in voice emotion detection: {e}")
        return "neutral"
def get_transcript(audio_path):
    """
    Transcribe speech from an audio file using Google Speech Recognition.

    Args:
        audio_path: Path to the audio file.

    Returns:
        str: The transcribed text, or "" when the audio is
        unintelligible, the service is unreachable, or any other
        error occurs.
    """
    try:
        recognizer = sr.Recognizer()
        # Read the whole file into an AudioData buffer for recognition.
        with sr.AudioFile(audio_path) as source:
            captured = recognizer.record(source)
        # Online recognition via Google's free web API.
        transcript = recognizer.recognize_google(captured)
        print(f"Transcription: {transcript}")
        return transcript
    except sr.UnknownValueError:
        # Speech was present but could not be decoded into words.
        print("Could not understand audio")
        return ""
    except sr.RequestError as e:
        # Network / service failure while contacting Google.
        print(f"Could not request results from Google Speech Recognition service: {e}")
        return ""
    except Exception as e:
        print(f"Error in transcription: {e}")
        return ""
def get_llm_response(user_query, face, voice, text):
    """
    Generates an empathetic response using the Groq LLM based on emotional context.

    Args:
        user_query: The user's typed query.
        face: Detected facial emotion label (e.g. "happy").
        voice: Detected vocal emotion label (e.g. "calm").
        text: Transcribed speech text ("" when nothing was recognized).

    Returns:
        str: AI-generated empathetic response, or an error-message string
        when the API key is missing or the request fails.
    """
    try:
        # Get API key from environment variable
        api_key = os.getenv("GROQ_API_KEY")
        if not api_key:
            return "Error: GROQ_API_KEY not found in environment variables"

        # Initialize Groq client
        client = Groq(api_key=api_key)

        # The system prompt carries only the emotional context and the
        # behavioral instructions. The user's actual question is sent as
        # a separate "user" message below — the original code embedded
        # the query inside the system prompt and sent no user turn at
        # all, which chat-completion models handle poorly.
        system_prompt = f"""You are an empathetic AI assistant that provides thoughtful, caring responses based on the user's emotional state.
**Emotional Context Analysis:**
- Facial Expression: {face}
- Vocal Tone: {voice}
- Spoken Words: {text if text else "No speech detected"}
**Instructions:**
1. First, acknowledge and validate the user's emotional state based on the above indicators
2. Show empathy and understanding
3. Provide a helpful, supportive answer to their query
4. Keep your response warm, genuine, and human-like
5. If there are discrepancies between emotional signals, address them sensitively
Respond in a natural, conversational manner that demonstrates emotional intelligence."""

        # Call Groq API with a proper system/user message pair.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_query
                }
            ],
            model="llama-3.1-8b-instant",
            temperature=0.7,  # moderately creative but still focused
            max_tokens=1024
        )

        # Extract and return response
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating response: {e}"
| # The record_audio function has been removed as it is no longer needed. | |
| # st.audio_recorder in app.py handles audio capture in the browser. | |