Spaces:

rudyByte
/

maya-voice-agent

Paused

maya-voice-agent / src /emotion_engine.py

rudyByte

feat(E5): complete Gujarati perfection — Sarvam STT, fixed TTS, code-mix prompts

ebf9701 19 days ago

8.94 kB

	"""
	emotion_engine.py — Dual-Layer Emotional Intelligence for Maya

	Architecture:
	Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (Acoustic)
	Layer 2: Text emotion via keyword matching (Semantic)
	Fusion: Text keywords take precedence; the audio reading is used only when no keyword matches. The resulting emotion adapts Maya's LLM prompt and TTS parameters.
	"""

	import asyncio
	import numpy as np
	import torch
	import io
	from dataclasses import dataclass, field
	from typing import Optional
	from enum import Enum


	# ── EMOTION TYPES ────────────────────────────────────────────────────────────

	class Emotion(str, Enum):
	    """Caller emotional states recognised by the engine.

	    Members also subclass ``str``, so each one compares equal to its
	    lowercase value (for example ``Emotion.CALM == "calm"``).
	    """

	    CALM = "calm"
	    HAPPY = "happy"
	    CONFUSED = "confused"
	    URGENT = "urgent"  # Pain / Emergency
	    FRUSTRATED = "frustrated"
	    ANGRY = "angry"


	@dataclass
	class EmotionResult:
	    """Outcome of analysing one conversational turn.

	    A freshly constructed result represents a calm caller with zero
	    confidence and no layer readings.
	    """

	    # Final emotion chosen for the turn.
	    emotion: Emotion = Emotion.CALM
	    # Confidence attached to ``emotion``.
	    confidence: float = 0.0
	    # ``Emotion`` value string produced by the audio layer, if it ran.
	    audio_emotion: Optional[str] = None
	    # ``Emotion`` value string produced by the text layer, if a keyword hit.
	    text_emotion: Optional[str] = None
	    # Keywords from the transcript that triggered ``text_emotion``.
	    triggered_words: list = field(default_factory=list)
	    # Whether the turn should be escalated.
	    should_escalate: bool = False


	@dataclass
	class ResponseProfile:
	    """Response settings associated with a single emotion.

	    All fields are required and positional, in the order declared below.
	    """

	    # Emotion this profile belongs to.
	    emotion: Emotion
	    # Speech pace value for TTS (units defined by the TTS consumer).
	    tts_pace: float
	    # Extra instruction text; presumably appended to the LLM prompt — confirm
	    # against the caller.
	    prompt_suffix: str
	    # Short opening phrases, one per supported language.
	    gujarati_opener: str
	    hindi_opener: str
	    english_opener: str


	# ── RESPONSE PROFILES ────────────────────────────────────────────────────────

	# One ResponseProfile per Emotion member, keyed by that member.
	RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
	    # Neutral baseline: no extra prompt text and no opener.
	    Emotion.CALM: ResponseProfile(
	        emotion=Emotion.CALM,
	        tts_pace=0.92,
	        prompt_suffix="",
	        gujarati_opener="",
	        hindi_opener="",
	        english_opener="",
	    ),
	    Emotion.HAPPY: ResponseProfile(
	        emotion=Emotion.HAPPY,
	        tts_pace=0.95,
	        prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
	        gujarati_opener="ખૂબ સારું! ",
	        hindi_opener="बहुत अच्छा! ",
	        english_opener="Wonderful! ",
	    ),
	    Emotion.CONFUSED: ResponseProfile(
	        emotion=Emotion.CONFUSED,
	        tts_pace=0.80,
	        prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
	        gujarati_opener="ભલે, ચિંતા ન કરો. ",
	        hindi_opener="ठीक है, चिंता मत करो। ",
	        english_opener="No problem, let me help. ",
	    ),
	    Emotion.URGENT: ResponseProfile(
	        emotion=Emotion.URGENT,
	        tts_pace=1.00,
	        prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
	        gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
	        hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
	        english_opener="I understand, this sounds urgent. ",
	    ),
	    Emotion.FRUSTRATED: ResponseProfile(
	        emotion=Emotion.FRUSTRATED,
	        tts_pace=0.88,
	        prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
	        gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
	        hindi_opener="माफ़ करें, मैं समझ गई। ",
	        english_opener="I apologize for the inconvenience. ",
	    ),
	    Emotion.ANGRY: ResponseProfile(
	        emotion=Emotion.ANGRY,
	        tts_pace=0.82,
	        prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
	        gujarati_opener="મને ખૂબ જ ખેદ છે. ",
	        hindi_opener="मुझे बहुत खेद है। ",
	        english_opener="I sincerely apologize. ",
	    ),
	}


	# ── TEXT KEYWORDS ────────────────────────────────────────────────────────────

	# Trigger phrases per emotion and language, looked up in the lower-cased
	# transcript by EmotionEngine.analyze_turn.
	# Insertion order is the priority order: the first emotion with a hit wins.
	# The "english" list is consulted for every caller language as well.
	TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
	    Emotion.URGENT: {
	        "gujarati": ["દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક", "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ"],
	        "hindi": ["दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी", "तुरंत", "अभी", "जल्दी", "अर्जेंट"],
	        "english": ["pain", "emergency", "urgent", "immediately", "hurts", "bleeding", "critical"],
	    },
	    Emotion.FRUSTRATED: {
	        "gujarati": ["કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી"],
	        "hindi": ["क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं"],
	        "english": ["why not", "how long", "again", "already told", "frustrated"],
	    },
	    Emotion.ANGRY: {
	        "gujarati": ["ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું"],
	        "hindi": ["गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास"],
	        "english": ["angry", "complain", "useless", "terrible", "worst"],
	    },
	    Emotion.HAPPY: {
	        "gujarati": ["આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું"],
	        "hindi": ["धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है"],
	        "english": ["thanks", "thank you", "great", "perfect", "good"],
	    },
	}


	# ── AUDIO EMOTION DETECTOR ───────────────────────────────────────────────────

	class AudioEmotionDetector:
	    """Acoustic emotion classifier backed by a lazily loaded SpeechBrain model.

	    The model is loaded on the first ``classify`` call. If loading raises,
	    the failure is remembered and every later call returns
	    ``(Emotion.CALM, 0.0)`` without retrying.
	    """

	    # Raw classifier label -> engine emotion. Unknown labels map to CALM.
	    AUDIO_EMOTION_MAP = {
	        "hap": Emotion.HAPPY,
	        "neu": Emotion.CALM,
	        "ang": Emotion.ANGRY,
	        "sad": Emotion.CONFUSED,
	        "fru": Emotion.FRUSTRATED,
	        "fea": Emotion.URGENT,
	    }

	    def __init__(self):
	        """Start with no model; nothing is loaded until first use."""
	        self._classifier = None
	        self._loaded = False
	        # Permanent failure flag: once set, loading is never attempted again.
	        self._failed = False

	    def _ensure_loaded(self):
	        """Load the SpeechBrain classifier once, recording any failure."""
	        load_already_attempted = self._loaded or self._failed
	        if load_already_attempted:
	            return

	        try:
	            # Imported lazily so a missing SpeechBrain install only disables
	            # the audio layer instead of breaking module import.
	            from speechbrain.inference.interfaces import foreign_class

	            model_options = {
	                "source": "speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
	                "pymodule_file": "custom_interface.py",
	                "classname": "CustomEncoderWav2vec2Classifier",
	                "run_opts": {"device": "cpu"},
	            }
	            self._classifier = foreign_class(**model_options)
	            self._loaded = True
	            print("[Emotion-Audio] SpeechBrain model loaded.")
	        except Exception as e:
	            self._failed = True
	            print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")

	    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
	        """Classify the emotion carried by a raw audio buffer.

	        Args:
	            audio_bytes: Buffer interpreted as float32 samples.

	        Returns:
	            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned when
	            the model is unavailable, the buffer is empty or shorter than
	            8000 samples, or classification raises.
	        """
	        self._ensure_loaded()

	        model_ready = self._loaded and not self._failed
	        if not model_ready or not audio_bytes:
	            return Emotion.CALM, 0.0

	        try:
	            samples = np.frombuffer(audio_bytes, dtype=np.float32)
	            # Too little audio to classify; presumably ~0.5 s at 16 kHz —
	            # TODO confirm the sample rate used by the caller.
	            if samples.size < 8000:
	                return Emotion.CALM, 0.0

	            # Add a leading batch dimension of size one.
	            batch = torch.FloatTensor(samples).unsqueeze(0)
	            _probs, scores, _index, labels = self._classifier.classify_batch(batch)

	            label_key = labels[0].lower().strip()
	            label_confidence = float(scores[0].item())
	            mapped = self.AUDIO_EMOTION_MAP.get(label_key, Emotion.CALM)
	            return mapped, label_confidence
	        except Exception as e:
	            print(f"[Emotion-Audio] classify error: {e}")
	            return Emotion.CALM, 0.0


	# ── MAIN ENGINE ───────────────────────────────────────────────────────────────

	class EmotionEngine:
	    """Per-turn emotion analysis from transcript keywords and audio.

	    Transcript keywords are checked first. The audio classifier is consulted
	    only when no keyword matched.
	    """

	    def __init__(self):
	        self._audio = AudioEmotionDetector()

	    @staticmethod
	    def _contains_keyword(text: str, keyword: str) -> bool:
	        """Return True if *keyword* occurs in the lower-cased *text*.

	        ASCII keywords must start at a word start, so "pain" does not fire
	        inside "explain" while "painful" still matches. Non-ASCII keywords
	        use plain substring matching, because alphanumeric-based boundaries
	        are unreliable around combining vowel signs.
	        """
	        if not keyword.isascii():
	            return keyword in text

	        pos = text.find(keyword)
	        while pos != -1:
	            if pos == 0 or not text[pos - 1].isalnum():
	                return True
	            pos = text.find(keyword, pos + 1)
	        return False

	    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
	        """Detect the caller's emotion for one turn.

	        Args:
	            audio: Raw audio bytes, passed to the audio classifier only when
	                no keyword matched.
	            transcript: Recognised text for the turn; ``None`` or empty is
	                treated as no text.
	            lang: Language key into the ``TEXT_KEYWORDS`` maps (its keys are
	                "gujarati", "hindi" and "english"). English keywords are
	                always checked in addition.

	        Returns:
	            An ``EmotionResult``. Keyword hits get a fixed confidence of 0.8;
	            otherwise the audio classifier's emotion and score are used.
	            ``should_escalate`` is set for ANGRY and URGENT.
	        """
	        res = EmotionResult()

	        # Text layer: the first emotion (in TEXT_KEYWORDS order) with a hit wins.
	        t_lower = (transcript or "").lower()
	        for emo, lang_map in TEXT_KEYWORDS.items():
	            # dict.fromkeys de-duplicates while keeping order, so the English
	            # list is not counted twice when lang == "english".
	            candidates = dict.fromkeys(
	                lang_map.get(lang, []) + lang_map.get("english", [])
	            )
	            matches = [w for w in candidates if self._contains_keyword(t_lower, w)]
	            if matches:
	                res.text_emotion = emo.value
	                res.triggered_words = matches
	                res.emotion = emo
	                res.confidence = 0.8
	                break

	        # Audio layer: run the blocking classifier off the event loop.
	        if not res.text_emotion:
	            loop = asyncio.get_running_loop()
	            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
	            res.audio_emotion = a_emo.value
	            res.emotion = a_emo
	            res.confidence = a_conf

	        res.should_escalate = res.emotion in (Emotion.ANGRY, Emotion.URGENT)
	        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
	        return res