"""
emotion_engine.py — Dual-Layer Emotional Intelligence for Maya
Architecture:
Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (Acoustic)
Layer 2: Text emotion via keyword matching (Semantic)
Fusion: Text keywords take precedence; the audio layer is consulted only when no keyword matches. The resulting reading adapts Maya's LLM prompt and TTS parameters.
"""
import asyncio
import io
import threading
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional

import numpy as np
import torch
# ── EMOTION TYPES ────────────────────────────────────────────────────────────
class Emotion(str, Enum):
    """Closed set of caller emotional states the engine can report.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (for example ``Emotion.CALM == "calm"``).
    """

    CALM = "calm"
    HAPPY = "happy"
    CONFUSED = "confused"
    URGENT = "urgent"  # Pain / Emergency
    FRUSTRATED = "frustrated"
    ANGRY = "angry"
@dataclass
class EmotionResult:
    """Outcome of analysing a single conversational turn.

    Every field has a default, so ``EmotionResult()`` is a calm,
    zero-confidence reading with no per-layer details and no escalation.
    """

    # Final emotion selected for the turn.
    emotion: Emotion = Emotion.CALM
    # Confidence attached to ``emotion``.
    confidence: float = 0.0
    # String value of the emotion reported by the audio layer, when it ran.
    audio_emotion: Optional[str] = None
    # String value of the emotion reported by the keyword layer, when it matched.
    text_emotion: Optional[str] = None
    # Keywords found in the transcript that produced ``text_emotion``.
    triggered_words: list = field(default_factory=list)
    # True when the turn should be escalated.
    should_escalate: bool = False
@dataclass
class ResponseProfile:
    """Response settings associated with one detected emotion.

    All fields are required; instances are built positionally or by keyword.
    """

    # Emotion this profile belongs to.
    emotion: Emotion
    # TTS pace value — presumably a speaking-rate multiplier; confirm with the TTS caller.
    tts_pace: float
    # Extra instruction text for the LLM prompt (empty string when none is needed).
    prompt_suffix: str
    # Opening phrase for each supported language (empty string when none is needed).
    gujarati_opener: str
    hindi_opener: str
    english_opener: str
# ── RESPONSE PROFILES ────────────────────────────────────────────────────────
# One ResponseProfile per Emotion member, keyed by that member.
RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
    # Calm callers need no prompt adjustment and no opener.
    Emotion.CALM: ResponseProfile(
        emotion=Emotion.CALM,
        tts_pace=0.92,
        prompt_suffix="",
        gujarati_opener="",
        hindi_opener="",
        english_opener="",
    ),
    Emotion.HAPPY: ResponseProfile(
        emotion=Emotion.HAPPY,
        tts_pace=0.95,
        prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
        gujarati_opener="ખૂબ સારું! ",
        hindi_opener="बहुत अच्छा! ",
        english_opener="Wonderful! ",
    ),
    Emotion.CONFUSED: ResponseProfile(
        emotion=Emotion.CONFUSED,
        tts_pace=0.80,
        prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
        gujarati_opener="ભલે, ચિંતા ન કરો. ",
        hindi_opener="ठीक है, चिंता मत करो। ",
        english_opener="No problem, let me help. ",
    ),
    Emotion.URGENT: ResponseProfile(
        emotion=Emotion.URGENT,
        tts_pace=1.00,
        prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
        gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
        hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
        english_opener="I understand, this sounds urgent. ",
    ),
    Emotion.FRUSTRATED: ResponseProfile(
        emotion=Emotion.FRUSTRATED,
        tts_pace=0.88,
        prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
        gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
        hindi_opener="माफ़ करें, मैं समझ गई। ",
        english_opener="I apologize for the inconvenience. ",
    ),
    Emotion.ANGRY: ResponseProfile(
        emotion=Emotion.ANGRY,
        tts_pace=0.82,
        prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
        gujarati_opener="મને ખૂબ જ ખેદ છે. ",
        hindi_opener="मुझे बहुत खेद है। ",
        english_opener="I sincerely apologize. ",
    ),
}
# ── TEXT KEYWORDS ────────────────────────────────────────────────────────────
# Keyword triggers per emotion, grouped by language name.
# Insertion order matters: EmotionEngine.analyze_turn walks this dict in order
# and stops at the first emotion with a match, so earlier entries win.
TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
    Emotion.URGENT: {
        "gujarati": [
            "દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક",
            "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ",
        ],
        "hindi": [
            "दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी",
            "तुरंत", "अभी", "जल्दी", "अर्जेंट",
        ],
        "english": [
            "pain", "emergency", "urgent", "immediately",
            "hurts", "bleeding", "critical",
        ],
    },
    Emotion.FRUSTRATED: {
        "gujarati": ["કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી"],
        "hindi": ["क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं"],
        "english": ["why not", "how long", "again", "already told", "frustrated"],
    },
    Emotion.ANGRY: {
        "gujarati": ["ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું"],
        "hindi": ["गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास"],
        "english": ["angry", "complain", "useless", "terrible", "worst"],
    },
    Emotion.HAPPY: {
        "gujarati": ["આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું"],
        "hindi": ["धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है"],
        "english": ["thanks", "thank you", "great", "perfect", "good"],
    },
}
# ── AUDIO EMOTION DETECTOR ───────────────────────────────────────────────────
class AudioEmotionDetector:
    """Lazy wrapper around the SpeechBrain wav2vec2-IEMOCAP emotion classifier.

    The model is loaded on the first ``classify`` call. If loading fails, the
    failure is remembered and every later call returns ``(Emotion.CALM, 0.0)``
    so callers can fall back to text-only analysis.
    """

    # Classifier label -> engine emotion. Labels not listed here map to CALM.
    AUDIO_EMOTION_MAP = {
        "hap": Emotion.HAPPY, "neu": Emotion.CALM, "ang": Emotion.ANGRY,
        "sad": Emotion.CONFUSED, "fru": Emotion.FRUSTRATED, "fea": Emotion.URGENT
    }

    # Buffers with fewer samples than this are not classified.
    # NOTE(review): presumably ~0.5 s at 16 kHz — confirm the capture sample rate.
    _MIN_SAMPLES = 8000

    def __init__(self):
        self._classifier = None
        self._loaded = False
        self._failed = False  # Permanent failure flag — avoids retry-on-import-error
        # classify() is dispatched to executor threads by EmotionEngine, so the
        # one-time model load must be serialized to avoid loading it twice.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self):
        """Load the SpeechBrain classifier once; record permanent failure on error."""
        # Fast path: no locking once the outcome of the load is known.
        if self._loaded or self._failed:
            return
        with self._load_lock:
            # Re-check under the lock: another thread may have finished the
            # load while this one was waiting.
            if self._loaded or self._failed:
                return
            try:
                # Imported lazily so the module works without SpeechBrain installed.
                from speechbrain.inference.interfaces import foreign_class
                self._classifier = foreign_class(
                    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                    pymodule_file="custom_interface.py",
                    classname="CustomEncoderWav2vec2Classifier",
                    run_opts={"device": "cpu"}
                )
                self._loaded = True
                print("[Emotion-Audio] SpeechBrain model loaded.")
            except Exception as e:
                # Deliberate best-effort: any load problem degrades to text-only.
                self._failed = True
                print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")

    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
        """Classify the emotion carried by a raw audio buffer.

        Args:
            audio_bytes: Raw buffer that is reinterpreted as float32 samples.
                NOTE(review): confirm callers send float32 PCM (not int16).

        Returns:
            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned when
            the model is unavailable, the buffer is empty or shorter than
            ``_MIN_SAMPLES`` samples, or classification raises.
        """
        self._ensure_loaded()
        # Gracefully return CALM if model is unavailable
        if self._failed or not self._loaded or not audio_bytes:
            return Emotion.CALM, 0.0
        try:
            audio_np = np.frombuffer(audio_bytes, dtype=np.float32)
            if len(audio_np) < self._MIN_SAMPLES:
                return Emotion.CALM, 0.0
            # Add a leading batch dimension of size 1 for classify_batch.
            audio_tensor = torch.FloatTensor(audio_np).unsqueeze(0)
            out_prob, score, _, text_lab = self._classifier.classify_batch(audio_tensor)
            raw_label = text_lab[0].lower().strip()
            confidence = float(score[0].item())
            return self.AUDIO_EMOTION_MAP.get(raw_label, Emotion.CALM), confidence
        except Exception as e:
            # Best-effort: a bad buffer or model error must not break the turn.
            print(f"[Emotion-Audio] classify error: {e}")
            return Emotion.CALM, 0.0
# ── MAIN ENGINE ───────────────────────────────────────────────────────────────
class EmotionEngine:
    """Per-turn emotion analysis: keyword matching first, audio as fallback.

    The audio classifier is consulted only when no keyword matches the
    transcript.
    """

    # Confidence assigned to any keyword-based detection.
    _TEXT_CONFIDENCE = 0.8

    def __init__(self):
        self._audio = AudioEmotionDetector()

    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
        """Detect the caller's emotion for one turn.

        Args:
            audio: Raw audio buffer passed to the audio classifier when the
                text layer finds nothing.
            transcript: Text of the turn; ``None`` or empty is treated as
                "no keywords".
            lang: Key into the per-emotion keyword map (e.g. ``"hindi"``).
                English keywords are always checked in addition.

        Returns:
            An ``EmotionResult`` with the chosen emotion, its confidence, the
            layer-specific reading that produced it, and ``should_escalate``
            set for ANGRY or URGENT.
        """
        res = EmotionResult()

        # Text Layer
        # Guard against a missing transcript instead of raising AttributeError.
        t_lower = (transcript or "").lower()
        for emo, lang_map in TEXT_KEYWORDS.items():
            # dict.fromkeys de-duplicates while keeping order: when lang is
            # "english" the two lists are identical and would otherwise put
            # every hit into triggered_words twice.
            words = dict.fromkeys(lang_map.get(lang, []) + lang_map.get("english", []))
            matches = [w for w in words if w in t_lower]
            if matches:
                res.text_emotion = emo.value
                res.triggered_words = matches
                res.emotion = emo
                res.confidence = self._TEXT_CONFIDENCE
                # First emotion with a hit wins; TEXT_KEYWORDS order is the priority.
                break

        # Audio Layer (Async)
        if not res.text_emotion:
            # Inside a coroutine a loop is always running; get_running_loop()
            # is the documented call here rather than get_event_loop().
            loop = asyncio.get_running_loop()
            # Run the blocking classifier in the loop's default executor.
            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
            res.audio_emotion = a_emo.value
            res.emotion = a_emo
            res.confidence = a_conf

        res.should_escalate = res.emotion in (Emotion.ANGRY, Emotion.URGENT)
        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
        return res
|