Spaces:
Paused
Paused
| """ | |
| emotion_engine.py — Dual-Layer Emotional Intelligence for Maya | |
| Architecture: | |
| Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (Acoustic) | |
| Layer 2: Text emotion via keyword matching (Semantic) | |
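| Fusion: Text keywords take priority; the audio layer is consulted only when no keyword matches. The resulting reading is meant to adapt Maya's LLM prompt and TTS parameters. | |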
| """ | |
| import asyncio | |
| import numpy as np | |
| import torch | |
| import io | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| from enum import Enum | |
| # ── EMOTION TYPES ──────────────────────────────────────────────────────────── | |
class Emotion(str, Enum):
    """Closed set of caller emotions the engine can report.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``Emotion.CALM == "calm"``).
    """

    CALM = "calm"
    HAPPY = "happy"
    CONFUSED = "confused"
    URGENT = "urgent"  # Pain / Emergency
    FRUSTRATED = "frustrated"
    ANGRY = "angry"
@dataclass
class EmotionResult:
    """Outcome of analysing one caller turn.

    The ``@dataclass`` decorator is required: it turns the annotated names
    below into per-instance fields and makes ``field(default_factory=list)``
    produce a fresh list for every result instead of leaving a shared
    ``Field`` object on the class.
    """

    # Final emotion chosen for the turn.
    emotion: Emotion = Emotion.CALM
    # Confidence attached to ``emotion``; 0.0 when nothing was detected.
    confidence: float = 0.0
    # ``Emotion.value`` reported by the audio layer, or None if it did not run.
    audio_emotion: Optional[str] = None
    # ``Emotion.value`` reported by the text layer, or None if nothing matched.
    text_emotion: Optional[str] = None
    # Keywords from the transcript that triggered ``text_emotion``.
    triggered_words: list[str] = field(default_factory=list)
    # True when the turn should be escalated by the caller of the engine.
    should_escalate: bool = False
@dataclass
class ResponseProfile:
    """Response settings associated with one detected emotion.

    The ``@dataclass`` decorator generates the ``__init__`` that lets a
    profile be built from its six values in declaration order; without it
    the class accepts no constructor arguments at all.
    """

    # Emotion this profile belongs to.
    emotion: Emotion
    # Pace value for speech synthesis.
    # NOTE(review): units/range are not visible here — confirm with the TTS caller.
    tts_pace: float
    # Extra instruction text for the LLM prompt (empty string for none).
    # NOTE(review): presumably appended to the system prompt — confirm with the caller.
    prompt_suffix: str
    # Opening phrase in Gujarati (empty string for none).
    gujarati_opener: str
    # Opening phrase in Hindi (empty string for none).
    hindi_opener: str
    # Opening phrase in English (empty string for none).
    english_opener: str
| # ── RESPONSE PROFILES ──────────────────────────────────────────────────────── | |
# One profile per emotion, keyed by the emotion it describes.
RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
    # Neutral baseline: no extra prompt text and no opener.
    Emotion.CALM: ResponseProfile(
        emotion=Emotion.CALM,
        tts_pace=0.92,
        prompt_suffix="",
        gujarati_opener="",
        hindi_opener="",
        english_opener="",
    ),
    Emotion.HAPPY: ResponseProfile(
        emotion=Emotion.HAPPY,
        tts_pace=0.95,
        prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
        gujarati_opener="ખૂબ સારું! ",
        hindi_opener="बहुत अच्छा! ",
        english_opener="Wonderful! ",
    ),
    Emotion.CONFUSED: ResponseProfile(
        emotion=Emotion.CONFUSED,
        tts_pace=0.80,
        prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
        gujarati_opener="ભલે, ચિંતા ન કરો. ",
        hindi_opener="ठीक है, चिंता मत करो। ",
        english_opener="No problem, let me help. ",
    ),
    Emotion.URGENT: ResponseProfile(
        emotion=Emotion.URGENT,
        tts_pace=1.00,
        prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
        gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
        hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
        english_opener="I understand, this sounds urgent. ",
    ),
    Emotion.FRUSTRATED: ResponseProfile(
        emotion=Emotion.FRUSTRATED,
        tts_pace=0.88,
        prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
        gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
        hindi_opener="माफ़ करें, मैं समझ गई। ",
        english_opener="I apologize for the inconvenience. ",
    ),
    Emotion.ANGRY: ResponseProfile(
        emotion=Emotion.ANGRY,
        tts_pace=0.82,
        prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
        gujarati_opener="મને ખૂબ જ ખેદ છે. ",
        hindi_opener="मुझे बहुत खेद है। ",
        english_opener="I sincerely apologize. ",
    ),
}
| # ── TEXT KEYWORDS ──────────────────────────────────────────────────────────── | |
# Trigger phrases per emotion and language. Insertion order is significant:
# the text layer walks this mapping in order and stops at the first emotion
# with a hit, so URGENT outranks FRUSTRATED, which outranks ANGRY, then HAPPY.
TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
    Emotion.URGENT: {
        "gujarati": [
            "દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક",
            "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ",
        ],
        "hindi": [
            "दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी",
            "तुरंत", "अभी", "जल्दी", "अर्जेंट",
        ],
        "english": [
            "pain", "emergency", "urgent", "immediately",
            "hurts", "bleeding", "critical",
        ],
    },
    Emotion.FRUSTRATED: {
        "gujarati": [
            "કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી",
        ],
        "hindi": [
            "क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं",
        ],
        "english": [
            "why not", "how long", "again", "already told", "frustrated",
        ],
    },
    Emotion.ANGRY: {
        "gujarati": [
            "ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું",
        ],
        "hindi": [
            "गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास",
        ],
        "english": [
            "angry", "complain", "useless", "terrible", "worst",
        ],
    },
    Emotion.HAPPY: {
        "gujarati": [
            "આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું",
        ],
        "hindi": [
            "धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है",
        ],
        "english": [
            "thanks", "thank you", "great", "perfect", "good",
        ],
    },
}
| # ── AUDIO EMOTION DETECTOR ─────────────────────────────────────────────────── | |
class AudioEmotionDetector:
    """Acoustic emotion classifier backed by a lazily loaded SpeechBrain model.

    The model is loaded on the first ``classify`` call. If loading fails for
    any reason, the detector permanently degrades to returning
    ``(Emotion.CALM, 0.0)`` so callers can continue with text-only analysis.
    """

    # Raw classifier labels -> engine emotions. Labels missing from this
    # table are reported as CALM.
    AUDIO_EMOTION_MAP = {
        "hap": Emotion.HAPPY,
        "neu": Emotion.CALM,
        "ang": Emotion.ANGRY,
        "sad": Emotion.CONFUSED,
        "fru": Emotion.FRUSTRATED,
        "fea": Emotion.URGENT,
    }

    # Clips with fewer float32 samples than this are not classified.
    # NOTE(review): presumably 0.5 s at 16 kHz — confirm the caller's sample rate.
    MIN_SAMPLES = 8000

    # Size in bytes of one float32 sample.
    _SAMPLE_WIDTH = 4

    def __init__(self):
        self._classifier = None
        self._loaded = False
        self._failed = False  # Permanent failure flag — avoids retry-on-import-error

    def _ensure_loaded(self):
        """Load the SpeechBrain classifier once; remember a failure permanently."""
        if self._loaded or self._failed:
            return
        try:
            # Imported lazily so the module still imports when SpeechBrain is
            # not installed.
            from speechbrain.inference.interfaces import foreign_class

            self._classifier = foreign_class(
                source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                pymodule_file="custom_interface.py",
                classname="CustomEncoderWav2vec2Classifier",
                run_opts={"device": "cpu"},
            )
            self._loaded = True
            print("[Emotion-Audio] SpeechBrain model loaded.")
        except Exception as e:
            # Deliberate best-effort: any load problem switches the detector
            # to the CALM fallback instead of breaking the call flow.
            self._failed = True
            print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")

    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
        """Classify a raw audio buffer.

        Args:
            audio_bytes: Audio samples, interpreted as native-endian float32.
                NOTE(review): the sample rate expected by the model is not
                checked here — confirm against the caller.

        Returns:
            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned when
            the model is unavailable, the buffer is empty or shorter than
            ``MIN_SAMPLES`` samples, or classification raises.
        """
        self._ensure_loaded()
        # Gracefully return CALM if model is unavailable
        if self._failed or not self._loaded or not audio_bytes:
            return Emotion.CALM, 0.0

        # Count whole samples only. np.frombuffer raises ValueError when the
        # buffer length is not a multiple of the item size, which previously
        # turned any misaligned chunk into a swallowed error.
        n_samples = len(audio_bytes) // self._SAMPLE_WIDTH
        if n_samples < self.MIN_SAMPLES:
            return Emotion.CALM, 0.0

        try:
            samples = np.frombuffer(audio_bytes, dtype=np.float32, count=n_samples)
            # frombuffer over an immutable bytes object yields a read-only
            # view; copy it so the tensor owns writable memory.
            audio_tensor = torch.from_numpy(samples.copy()).unsqueeze(0)
            _, score, _, text_lab = self._classifier.classify_batch(audio_tensor)
            raw_label = text_lab[0].lower().strip()
            confidence = float(score[0].item())
            return self.AUDIO_EMOTION_MAP.get(raw_label, Emotion.CALM), confidence
        except Exception as e:
            # Best-effort layer: a classifier failure must not break the turn.
            print(f"[Emotion-Audio] classify error: {e}")
            return Emotion.CALM, 0.0
| # ── MAIN ENGINE ─────────────────────────────────────────────────────────────── | |
class EmotionEngine:
    """Per-turn emotion analysis combining keyword matching and audio.

    Text keywords are checked first. The audio classifier is consulted only
    when no keyword matches.
    """

    # Confidence assigned to any keyword-based detection.
    TEXT_CONFIDENCE = 0.8

    # Emotions that set ``should_escalate`` on the result.
    _ESCALATION_EMOTIONS = (Emotion.ANGRY, Emotion.URGENT)

    def __init__(self):
        self._audio = AudioEmotionDetector()

    @staticmethod
    def _match_keywords(
        transcript: Optional[str], lang: str
    ) -> tuple[Optional[Emotion], list[str]]:
        """Return the first emotion whose keywords occur in the transcript.

        Emotions are tried in ``TEXT_KEYWORDS`` order. For each one, the
        keywords for ``lang`` and the English keywords are checked.

        Returns:
            ``(emotion, matched_keywords)``, or ``(None, [])`` when nothing
            matches or the transcript is empty/None.
        """
        text = (transcript or "").lower()
        if not text:
            return None, []
        for emotion, lang_map in TEXT_KEYWORDS.items():
            # dict.fromkeys de-duplicates while keeping order: when lang is
            # "english" both lookups return the same list, which used to
            # report every matched keyword twice.
            candidates = dict.fromkeys(
                lang_map.get(lang, []) + lang_map.get("english", [])
            )
            # NOTE(review): plain substring test, so "good" also matches
            # "goodbye" and "pain" matches "painting". Left unchanged because
            # tightening it would alter which turns are detected.
            hits = [kw for kw in candidates if kw in text]
            if hits:
                return emotion, hits
        return None, []

    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
        """Analyse one caller turn and return the detected emotion.

        Args:
            audio: Raw audio for the turn, passed unchanged to the audio
                classifier when no keyword matches.
            transcript: Recognised text for the turn; None or empty is
                treated as "no text".
            lang: Key into each ``TEXT_KEYWORDS`` entry (for example
                ``"gujarati"``, ``"hindi"`` or ``"english"``).

        Returns:
            An ``EmotionResult`` with ``should_escalate`` set when the final
            emotion is ANGRY or URGENT.
        """
        res = EmotionResult()

        # Text Layer
        text_emotion, hits = self._match_keywords(transcript, lang)
        if text_emotion is not None:
            res.text_emotion = text_emotion.value
            res.triggered_words = hits
            res.emotion = text_emotion
            res.confidence = self.TEXT_CONFIDENCE
        else:
            # Audio Layer: run the blocking classifier in the default executor
            # so the event loop is not blocked. get_running_loop() is the
            # supported way to obtain the loop from inside a coroutine.
            loop = asyncio.get_running_loop()
            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
            res.audio_emotion = a_emo.value
            res.emotion = a_emo
            res.confidence = a_conf

        res.should_escalate = res.emotion in self._ESCALATION_EMOTIONS
        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
        return res