# maya-voice-agent / src / emotion_engine.py
# rudyByte
# feat(E5): complete Gujarati perfection — Sarvam STT, fixed TTS, code-mix prompts
# ebf9701
"""
emotion_engine.py — Dual-Layer Emotional Intelligence for Maya

Architecture:
    Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (acoustic).
    Layer 2: Text emotion via keyword matching (semantic).
    Fusion:  A text keyword match takes precedence; the audio layer is
             consulted only when no keyword matches. The resulting reading
             adapts Maya's LLM prompt and TTS parameters.
"""
import asyncio
import io
import threading
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional

import numpy as np
import torch
# ── EMOTION TYPES ────────────────────────────────────────────────────────────
class Emotion(str, Enum):
    """Closed set of caller emotional states produced by the engine.

    The ``str`` mix-in makes each member compare equal to its lowercase
    value, so a member can be used wherever a plain string is expected.
    """

    CALM = "calm"
    HAPPY = "happy"
    CONFUSED = "confused"
    URGENT = "urgent"  # Pain / Emergency
    FRUSTRATED = "frustrated"
    ANGRY = "angry"
@dataclass
class EmotionResult:
    """Reading produced for a single conversational turn.

    Every field has a default, so ``EmotionResult()`` is a calm,
    zero-confidence reading with no per-layer output and no escalation.
    """

    # Final emotion chosen for the turn.
    emotion: Emotion = Emotion.CALM
    # Confidence attached to ``emotion``.
    confidence: float = 0.0
    # Emotion value reported by the audio layer; None when that layer did not run.
    audio_emotion: Optional[str] = None
    # Emotion value reported by the keyword layer; None when nothing matched.
    text_emotion: Optional[str] = None
    # Keywords found in the transcript that produced ``text_emotion``.
    triggered_words: list = field(default_factory=list)
    # True when the turn should be escalated.
    should_escalate: bool = False
@dataclass
class ResponseProfile:
    """How Maya's reply should be shaped for one emotion.

    All fields are required; instances are built once in
    ``RESPONSE_PROFILES``.
    """

    # Emotion this profile applies to.
    emotion: Emotion
    # TTS pace setting for the reply.
    # NOTE(review): unit/scale is defined by the TTS caller — confirm.
    tts_pace: float
    # Extra instruction text for the LLM prompt (empty string for none).
    # NOTE(review): presumably appended to the prompt by the caller — confirm.
    prompt_suffix: str
    # Short opening phrase per language (empty string for none).
    gujarati_opener: str
    hindi_opener: str
    english_opener: str
# ── RESPONSE PROFILES ────────────────────────────────────────────────────────
# One response profile per emotion. CALM is the neutral baseline: no prompt
# suffix and no opener in any language.
RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
    Emotion.CALM: ResponseProfile(
        emotion=Emotion.CALM,
        tts_pace=0.92,
        prompt_suffix="",
        gujarati_opener="",
        hindi_opener="",
        english_opener="",
    ),
    Emotion.HAPPY: ResponseProfile(
        emotion=Emotion.HAPPY,
        tts_pace=0.95,
        prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
        gujarati_opener="ખૂબ સારું! ",
        hindi_opener="बहुत अच्छा! ",
        english_opener="Wonderful! ",
    ),
    Emotion.CONFUSED: ResponseProfile(
        emotion=Emotion.CONFUSED,
        tts_pace=0.80,
        prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
        gujarati_opener="ભલે, ચિંતા ન કરો. ",
        hindi_opener="ठीक है, चिंता मत करो। ",
        english_opener="No problem, let me help. ",
    ),
    Emotion.URGENT: ResponseProfile(
        emotion=Emotion.URGENT,
        tts_pace=1.00,
        prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
        gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
        hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
        english_opener="I understand, this sounds urgent. ",
    ),
    Emotion.FRUSTRATED: ResponseProfile(
        emotion=Emotion.FRUSTRATED,
        tts_pace=0.88,
        prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
        gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
        hindi_opener="माफ़ करें, मैं समझ गई। ",
        english_opener="I apologize for the inconvenience. ",
    ),
    Emotion.ANGRY: ResponseProfile(
        emotion=Emotion.ANGRY,
        tts_pace=0.82,
        prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
        gujarati_opener="મને ખૂબ જ ખેદ છે. ",
        hindi_opener="मुझे बहुत खेद है। ",
        english_opener="I sincerely apologize. ",
    ),
}
# ── TEXT KEYWORDS ────────────────────────────────────────────────────────────
# Keyword lists per emotion and language, matched as substrings of the
# lowercased transcript. Insertion order is significant: the engine stops at
# the first emotion with a hit, so URGENT outranks FRUSTRATED, then ANGRY,
# then HAPPY. CALM and CONFUSED have no keywords.
TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
    Emotion.URGENT: {
        "gujarati": [
            "દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક",
            "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ",
        ],
        "hindi": [
            "दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी",
            "तुरंत", "अभी", "जल्दी", "अर्जेंट",
        ],
        "english": [
            "pain", "emergency", "urgent", "immediately",
            "hurts", "bleeding", "critical",
        ],
    },
    Emotion.FRUSTRATED: {
        "gujarati": [
            "કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી",
        ],
        "hindi": [
            "क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं",
        ],
        "english": [
            "why not", "how long", "again", "already told", "frustrated",
        ],
    },
    Emotion.ANGRY: {
        "gujarati": [
            "ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું",
        ],
        "hindi": [
            "गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास",
        ],
        "english": [
            "angry", "complain", "useless", "terrible", "worst",
        ],
    },
    Emotion.HAPPY: {
        "gujarati": [
            "આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું",
        ],
        "hindi": [
            "धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है",
        ],
        "english": [
            "thanks", "thank you", "great", "perfect", "good",
        ],
    },
}
# ── AUDIO EMOTION DETECTOR ───────────────────────────────────────────────────
class AudioEmotionDetector:
    """Acoustic emotion layer backed by a lazily loaded SpeechBrain classifier.

    The model is loaded on the first ``classify`` call. If loading fails for
    any reason the detector marks itself as permanently failed and every
    later call returns ``(Emotion.CALM, 0.0)`` without retrying.
    """

    # Maps the classifier's raw text labels onto Maya's emotion set; labels
    # not listed here fall back to CALM in ``classify``.
    # NOTE(review): the public IEMOCAP checkpoint appears to emit only
    # neu/ang/hap/sad, so "fru"/"fea" may never be hit — confirm.
    AUDIO_EMOTION_MAP = {
        "hap": Emotion.HAPPY, "neu": Emotion.CALM, "ang": Emotion.ANGRY,
        "sad": Emotion.CONFUSED, "fru": Emotion.FRUSTRATED, "fea": Emotion.URGENT
    }

    # Buffers with fewer float32 samples than this are not classified.
    # NOTE(review): presumably ~0.5 s at 16 kHz — confirm the capture rate.
    MIN_SAMPLES = 8000

    def __init__(self):
        self._classifier = None
        self._loaded = False
        self._failed = False  # Permanent failure flag — avoids retry-on-import-error
        # ``classify`` is dispatched to executor threads, so the one-time
        # model load must be serialised to avoid loading the model twice.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self):
        """Load the SpeechBrain model once; record a permanent failure on error."""
        if self._loaded or self._failed:
            return
        with self._load_lock:
            # Re-check: another thread may have finished while we waited.
            if self._loaded or self._failed:
                return
            try:
                # Imported lazily so the module works without speechbrain.
                from speechbrain.inference.interfaces import foreign_class
                self._classifier = foreign_class(
                    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                    pymodule_file="custom_interface.py",
                    classname="CustomEncoderWav2vec2Classifier",
                    run_opts={"device": "cpu"}
                )
                self._loaded = True
                print("[Emotion-Audio] SpeechBrain model loaded.")
            except Exception as e:
                # Best-effort boundary: any load error degrades to text-only.
                self._failed = True
                print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")

    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
        """Classify a raw audio buffer into an emotion.

        Args:
            audio_bytes: Raw samples, interpreted as native-endian float32.
                NOTE(review): if the capture pipeline delivers int16 PCM this
                interpretation yields garbage — confirm the upstream format.

        Returns:
            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned when
            the model is unavailable, the buffer is empty or shorter than
            ``MIN_SAMPLES``, or classification raises.
        """
        self._ensure_loaded()
        # Gracefully return CALM if model is unavailable
        if self._failed or not self._loaded or not audio_bytes:
            return Emotion.CALM, 0.0
        try:
            # Ignore a trailing partial sample instead of letting frombuffer
            # raise and discard an otherwise usable buffer.
            n_samples = len(audio_bytes) // np.dtype(np.float32).itemsize
            audio_np = np.frombuffer(audio_bytes, dtype=np.float32, count=n_samples)
            if len(audio_np) < self.MIN_SAMPLES:
                return Emotion.CALM, 0.0
            # frombuffer returns a read-only view; copy so torch gets
            # writable memory. unsqueeze adds the batch dimension.
            audio_tensor = torch.from_numpy(audio_np.copy()).unsqueeze(0)
            _, score, _, text_lab = self._classifier.classify_batch(audio_tensor)
            raw_label = text_lab[0].lower().strip()
            confidence = float(score[0].item())
            return self.AUDIO_EMOTION_MAP.get(raw_label, Emotion.CALM), confidence
        except Exception as e:
            # Best-effort: a classifier error must never break the call flow.
            print(f"[Emotion-Audio] classify error: {e}")
            return Emotion.CALM, 0.0
# ── MAIN ENGINE ───────────────────────────────────────────────────────────────
class EmotionEngine:
    """Produce one emotion reading per turn from transcript and audio.

    Transcript keywords are checked first; the audio classifier is consulted
    only when no keyword matches.
    """

    # Emotions that flag the turn for escalation.
    _ESCALATION_EMOTIONS = (Emotion.ANGRY, Emotion.URGENT)

    def __init__(self):
        self._audio = AudioEmotionDetector()

    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
        """Analyse a single caller turn.

        Args:
            audio: Raw audio for the turn, passed to the audio detector
                unchanged when the text layer finds nothing.
            transcript: Transcribed text; ``None`` or empty is treated as no text.
            lang: Key into ``TEXT_KEYWORDS`` (e.g. ``"gujarati"``); matched
                case-insensitively. English keywords are always checked too.

        Returns:
            An ``EmotionResult``. On a keyword hit the confidence is fixed at
            0.8; otherwise emotion and confidence come from the audio layer.
        """
        res = EmotionResult()

        # Text Layer
        t_lower = (transcript or "").lower()
        lang_key = (lang or "").strip().lower()
        # NOTE(review): matching is by substring, so e.g. "good" also hits
        # "goodbye" — revisit if false positives show up in practice.
        for emo, lang_map in TEXT_KEYWORDS.items():
            # dict.fromkeys de-duplicates while keeping order; without it a
            # lang of "english" would list every English keyword twice.
            candidates = dict.fromkeys(
                lang_map.get(lang_key, []) + lang_map.get("english", [])
            )
            matches = [w for w in candidates if w in t_lower]
            if matches:
                res.text_emotion = emo.value
                res.triggered_words = matches
                res.emotion = emo
                res.confidence = 0.8
                break  # first emotion in TEXT_KEYWORDS order wins

        # Audio Layer (Async) — only when the text layer found nothing.
        if not res.text_emotion:
            # Run the blocking classifier off the event loop thread.
            loop = asyncio.get_running_loop()
            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
            res.audio_emotion = a_emo.value
            res.emotion = a_emo
            res.confidence = a_conf

        res.should_escalate = res.emotion in self._ESCALATION_EMOTIONS
        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
        return res