Spaces:

rudyByte
/

maya-voice-agent

Paused

File size: 8,935 Bytes

"""
emotion_engine.py — Dual-Layer Emotional Intelligence for Maya

Architecture:
  Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (Acoustic)
  Layer 2: Text emotion via keyword matching (Semantic)
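  Fusion:   Text keywords take priority; the audio layer is consulted only
            when no keyword matches. The resulting emotion is meant to adapt
            Maya's LLM prompt and TTS parameters.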
"""

import asyncio
import io
import threading
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional

import numpy as np
import torch


# ── EMOTION TYPES ────────────────────────────────────────────────────────────

class Emotion(str, Enum):
    """Closed set of caller emotions.

    Members mix in ``str``, so each one compares equal to its lowercase
    value (``Emotion.CALM == "calm"``).
    """

    CALM = "calm"
    HAPPY = "happy"
    CONFUSED = "confused"
    URGENT = "urgent"  # Pain / Emergency
    FRUSTRATED = "frustrated"
    ANGRY = "angry"


@dataclass
class EmotionResult:
    """Outcome of analysing a single caller turn.

    Every field has a default, so ``EmotionResult()`` is a calm,
    zero-confidence reading with no per-layer labels and no escalation.
    """

    # Final emotion chosen for the turn.
    emotion: Emotion = Emotion.CALM
    # Score attached to ``emotion``.
    confidence: float = 0.0
    # Emotion value reported by the audio layer; None if it did not set one.
    audio_emotion: Optional[str] = None
    # Emotion value reported by the text layer; None if no keyword matched.
    text_emotion: Optional[str] = None
    # Keywords found in the transcript (fresh list per instance).
    triggered_words: list = field(default_factory=list)
    # True when the turn should be escalated.
    should_escalate: bool = False


@dataclass
class ResponseProfile:
    """Response settings associated with one emotion.

    All fields are required; there are no defaults.
    """

    # Emotion this profile belongs to.
    emotion: Emotion
    # Pace value for speech synthesis.
    # NOTE(review): units/range are defined by the TTS consumer, which is not
    # in this file — confirm there.
    tts_pace: float
    # Extra instruction text for the LLM prompt (may be empty).
    prompt_suffix: str
    # Opening phrase per language (may be empty).
    gujarati_opener: str
    hindi_opener: str
    english_opener: str


# ── RESPONSE PROFILES ────────────────────────────────────────────────────────

# One profile per emotion. The mapping is keyed by each profile's own
# ``emotion`` field, so key and profile cannot drift apart.
RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
    profile.emotion: profile
    for profile in (
        ResponseProfile(
            emotion=Emotion.CALM,
            tts_pace=0.92,
            prompt_suffix="",
            gujarati_opener="",
            hindi_opener="",
            english_opener="",
        ),
        ResponseProfile(
            emotion=Emotion.HAPPY,
            tts_pace=0.95,
            prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
            gujarati_opener="ખૂબ સારું! ",
            hindi_opener="बहुत अच्छा! ",
            english_opener="Wonderful! ",
        ),
        ResponseProfile(
            emotion=Emotion.CONFUSED,
            tts_pace=0.80,
            prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
            gujarati_opener="ભલે, ચિંતા ન કરો. ",
            hindi_opener="ठीक है, चिंता मत करो। ",
            english_opener="No problem, let me help. ",
        ),
        ResponseProfile(
            emotion=Emotion.URGENT,
            tts_pace=1.00,
            prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
            gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
            hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
            english_opener="I understand, this sounds urgent. ",
        ),
        ResponseProfile(
            emotion=Emotion.FRUSTRATED,
            tts_pace=0.88,
            prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
            gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
            hindi_opener="माफ़ करें, मैं समझ गई। ",
            english_opener="I apologize for the inconvenience. ",
        ),
        ResponseProfile(
            emotion=Emotion.ANGRY,
            tts_pace=0.82,
            prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
            gujarati_opener="મને ખૂબ જ ખેદ છે. ",
            hindi_opener="मुझे बहुत खेद है। ",
            english_opener="I sincerely apologize. ",
        ),
    )
}


# ── TEXT KEYWORDS ────────────────────────────────────────────────────────────

# Keyword table: emotion -> language -> substrings to look for.
#
# EmotionEngine.analyze_turn walks this mapping in insertion order and stops
# at the first emotion with a hit, so the order below is the match priority
# (URGENT, then FRUSTRATED, then ANGRY, then HAPPY). Matching is plain
# substring containment against the lower-cased transcript, so entries must
# be lowercase, and short entries also match inside longer words.
# CALM and CONFUSED have no keywords here.
TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
    Emotion.URGENT: {
        "gujarati": [
            "દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક",
            "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ",
        ],
        "hindi": [
            "दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी",
            "तुरंत", "अभी", "जल्दी", "अर्जेंट",
        ],
        "english": [
            "pain", "emergency", "urgent", "immediately",
            "hurts", "bleeding", "critical",
        ],
    },
    Emotion.FRUSTRATED: {
        "gujarati": [
            "કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી",
        ],
        "hindi": [
            "क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं",
        ],
        "english": [
            "why not", "how long", "again", "already told", "frustrated",
        ],
    },
    Emotion.ANGRY: {
        "gujarati": [
            "ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું",
        ],
        "hindi": [
            "गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास",
        ],
        "english": [
            "angry", "complain", "useless", "terrible", "worst",
        ],
    },
    Emotion.HAPPY: {
        "gujarati": [
            "આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું",
        ],
        "hindi": [
            "धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है",
        ],
        "english": [
            "thanks", "thank you", "great", "perfect", "good",
        ],
    },
}


# ── AUDIO EMOTION DETECTOR ───────────────────────────────────────────────────

class AudioEmotionDetector:
    """Lazy wrapper around the SpeechBrain wav2vec2-IEMOCAP emotion classifier.

    The model is loaded on the first ``classify`` call. If loading fails, the
    failure is remembered and every later call returns
    ``(Emotion.CALM, 0.0)`` so callers can fall back to text-only detection.
    """

    # Classifier label -> Maya emotion. Labels not listed here map to CALM.
    AUDIO_EMOTION_MAP = {
        "hap": Emotion.HAPPY, "neu": Emotion.CALM, "ang": Emotion.ANGRY,
        "sad": Emotion.CONFUSED, "fru": Emotion.FRUSTRATED, "fea": Emotion.URGENT
    }

    # Buffers holding fewer samples than this are not classified.
    # NOTE(review): presumably half a second at 16 kHz — confirm capture rate.
    MIN_SAMPLES = 8000

    # Size in bytes of one sample; classify() reads the buffer as float32.
    _SAMPLE_WIDTH = 4

    def __init__(self):
        self._classifier = None
        self._loaded = False
        self._failed = False  # Permanent failure flag — avoids retry-on-import-error
        # classify() is dispatched to executor threads by EmotionEngine; the
        # lock keeps concurrent first calls from loading the model twice.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self):
        """Load the classifier once; record a load failure permanently."""
        if self._loaded or self._failed:
            return
        with self._load_lock:
            # Re-check: another thread may have finished (or failed) loading
            # while this one was waiting for the lock.
            if self._loaded or self._failed:
                return
            try:
                from speechbrain.inference.interfaces import foreign_class
                self._classifier = foreign_class(
                    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                    pymodule_file="custom_interface.py",
                    classname="CustomEncoderWav2vec2Classifier",
                    run_opts={"device": "cpu"}
                )
            except Exception as e:
                self._failed = True
                print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")
            else:
                self._loaded = True
                print("[Emotion-Audio] SpeechBrain model loaded.")

    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
        """Classify the emotion of one audio buffer.

        Args:
            audio_bytes: Raw samples, read as native-endian float32.
                NOTE(review): if the capture path delivers int16 PCM the
                samples would be misread — confirm against the caller.

        Returns:
            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned
            when the model is unavailable, the buffer is empty or shorter
            than ``MIN_SAMPLES``, or classification raises.
        """
        self._ensure_loaded()
        # Gracefully return CALM if model is unavailable
        if self._failed or not self._loaded or not audio_bytes:
            return Emotion.CALM, 0.0

        try:
            # Ignore a trailing partial sample: frombuffer raises when the
            # byte length is not a multiple of the item size, which would
            # otherwise discard the audio reading for the whole turn.
            n_samples = len(audio_bytes) // self._SAMPLE_WIDTH
            if n_samples < self.MIN_SAMPLES:
                return Emotion.CALM, 0.0
            audio_np = np.frombuffer(audio_bytes, dtype=np.float32, count=n_samples)

            # frombuffer over immutable bytes yields a read-only array; copy
            # so torch receives writable memory.
            audio_tensor = torch.from_numpy(audio_np.copy()).unsqueeze(0)
            # Inference only — no autograd graph needed.
            with torch.no_grad():
                _, score, _, text_lab = self._classifier.classify_batch(audio_tensor)

            raw_label = text_lab[0].lower().strip()
            confidence = float(score[0].item())
            return self.AUDIO_EMOTION_MAP.get(raw_label, Emotion.CALM), confidence
        except Exception as e:
            # Best-effort layer: never let an audio failure break the turn.
            print(f"[Emotion-Audio] classify error: {e}")
            return Emotion.CALM, 0.0


# ── MAIN ENGINE ───────────────────────────────────────────────────────────────

class EmotionEngine:
    """Per-turn emotion detection: keyword text layer first, audio as fallback."""

    def __init__(self):
        self._audio = AudioEmotionDetector()

    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
        """Detect the caller's emotion for one turn.

        The transcript is searched for keywords from ``TEXT_KEYWORDS``; the
        first emotion (in table order) with a hit wins and gets a fixed
        confidence of 0.8. Only when no keyword matches is the audio
        classifier run, in the default executor so the event loop is not
        blocked.

        Args:
            audio: Raw audio for the turn, passed to the audio classifier.
            transcript: Recognised text; ``None`` or empty is treated as "".
            lang: Key into the keyword table (e.g. "gujarati", "hindi",
                "english"); case and surrounding whitespace are ignored.
                English keywords are always searched as well.

        Returns:
            An ``EmotionResult``; ``should_escalate`` is True for ANGRY and
            URGENT.
        """
        res = EmotionResult()

        # Text Layer
        t_lower = (transcript or "").lower()
        # Table keys are lowercase; normalise so "Hindi" or " hindi " still hit.
        lang_key = lang.strip().lower() if isinstance(lang, str) else lang
        for emo, lang_map in TEXT_KEYWORDS.items():
            # dict.fromkeys de-duplicates while keeping order. Without it,
            # lang == "english" searches the English list twice and every
            # hit appears twice in triggered_words.
            words = dict.fromkeys(lang_map.get(lang_key, []) + lang_map.get("english", []))
            matches = [w for w in words if w in t_lower]
            if matches:
                res.text_emotion = emo.value
                res.triggered_words = matches
                res.emotion = emo
                res.confidence = 0.8
                break

        # Audio Layer (Async)
        if not res.text_emotion:
            # We are inside a coroutine, so a loop is guaranteed to be running.
            loop = asyncio.get_running_loop()
            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
            res.audio_emotion = a_emo.value
            res.emotion = a_emo
            res.confidence = a_conf

        res.should_escalate = res.emotion in (Emotion.ANGRY, Emotion.URGENT)
        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
        return res