File size: 8,935 Bytes
6d68d81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebf9701
6d68d81
 
ebf9701
6d68d81
 
 
 
ebf9701
6d68d81
 
 
 
ebf9701
6d68d81
 
 
 
ebf9701
6d68d81
 
 
 
ebf9701
6d68d81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b918ea
6d68d81
 
4b918ea
 
 
6d68d81
4b918ea
 
 
 
 
 
 
 
 
 
 
6d68d81
 
 
4b918ea
 
6d68d81
 
 
 
 
 
 
 
 
 
 
 
4b918ea
 
6d68d81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
emotion_engine.py — Dual-Layer Emotional Intelligence for Maya

Architecture:
  Layer 1: Audio emotion via SpeechBrain wav2vec2-IEMOCAP (Acoustic)
  Layer 2: Text emotion via keyword matching (Semantic)
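  Fusion:   A text keyword match takes priority; the audio layer is consulted
            only when no keyword matches. The resulting emotion is meant to
            adapt Maya's LLM prompt and TTS parameters.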
"""

import asyncio
import numpy as np
import torch
import io
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum


# ── EMOTION TYPES ────────────────────────────────────────────────────────────

class Emotion(str, Enum):
    """Closed set of caller emotions the engine can report.

    Members are also plain ``str`` instances, so each one compares equal to
    its lowercase value (``Emotion.CALM == "calm"``).
    """

    CALM = "calm"
    HAPPY = "happy"
    CONFUSED = "confused"
    # Pain / Emergency
    URGENT = "urgent"
    FRUSTRATED = "frustrated"
    ANGRY = "angry"


@dataclass
class EmotionResult:
    """Emotion reading for a single caller turn.

    A default-constructed result is a zero-confidence ``CALM`` reading with
    no layer attribution, no matched keywords and no escalation.
    """

    # Final emotion chosen for the turn.
    emotion: Emotion = Emotion.CALM
    # Score attached to ``emotion``.
    confidence: float = 0.0
    # ``.value`` of the emotion reported by the audio layer; None if it did not run.
    audio_emotion: Optional[str] = None
    # ``.value`` of the emotion reported by the text layer; None if no keyword matched.
    text_emotion: Optional[str] = None
    # Keywords found in the transcript that produced ``text_emotion``.
    triggered_words: list = field(default_factory=list)
    # True when the turn should be escalated (EmotionEngine sets it for ANGRY / URGENT).
    should_escalate: bool = False


@dataclass
class ResponseProfile:
    """Response-shaping settings associated with one emotion.

    Every field is required; the dataclass declares no defaults.
    """

    # Emotion this profile belongs to.
    emotion: Emotion
    # TTS pace setting.
    # NOTE(review): looks like a speed multiplier — confirm with the TTS caller.
    tts_pace: float
    # Extra instruction text for the LLM prompt (presumably appended; empty for no change).
    prompt_suffix: str
    # Opening phrases, one per supported language.
    gujarati_opener: str
    hindi_opener: str
    english_opener: str


# ── RESPONSE PROFILES ────────────────────────────────────────────────────────

# One profile per Emotion member. Arguments are spelled out by keyword so each
# opener is visibly tied to its language.
RESPONSE_PROFILES: dict[Emotion, ResponseProfile] = {
    # Neutral baseline: no prompt adaptation and no opener.
    Emotion.CALM: ResponseProfile(
        emotion=Emotion.CALM,
        tts_pace=0.92,
        prompt_suffix="",
        gujarati_opener="",
        hindi_opener="",
        english_opener="",
    ),
    Emotion.HAPPY: ResponseProfile(
        emotion=Emotion.HAPPY,
        tts_pace=0.95,
        prompt_suffix="\nThe caller sounds happy. Match their energy with warmth.",
        gujarati_opener="ખૂબ સારું! ",
        hindi_opener="बहुत अच्छा! ",
        english_opener="Wonderful! ",
    ),
    Emotion.CONFUSED: ResponseProfile(
        emotion=Emotion.CONFUSED,
        tts_pace=0.80,
        prompt_suffix="\nThe caller sounds confused. Use simple language and ask ONE question at a time.",
        gujarati_opener="ભલે, ચિંતા ન કરો. ",
        hindi_opener="ठीक है, चिंता मत करो। ",
        english_opener="No problem, let me help. ",
    ),
    Emotion.URGENT: ResponseProfile(
        emotion=Emotion.URGENT,
        tts_pace=1.00,
        prompt_suffix="\nCRITICAL: The caller is in pain or has an emergency. Acknowledge this, get their name/phone, and say the doctor will call back IMMEDIATELY. Skip standard booking.",
        gujarati_opener="હું સમજી ગઈ, આ ખૂબ અગત્યની વાત છે. ",
        hindi_opener="मैं समझ गई, यह बहुत ज़रूरी है। ",
        english_opener="I understand, this sounds urgent. ",
    ),
    Emotion.FRUSTRATED: ResponseProfile(
        emotion=Emotion.FRUSTRATED,
        tts_pace=0.88,
        prompt_suffix="\nThe caller sounds frustrated. Keep responses short (max 2 sentences) and offer to have the owner call them back.",
        gujarati_opener="માફ કરશો, હું સમજી ગઈ. ",
        hindi_opener="माफ़ करें, मैं समझ गई। ",
        english_opener="I apologize for the inconvenience. ",
    ),
    Emotion.ANGRY: ResponseProfile(
        emotion=Emotion.ANGRY,
        tts_pace=0.82,
        prompt_suffix="\nCRITICAL: The caller is ANGRY. Be extremely apologetic and brief. Say the owner will call them personally within minutes.",
        gujarati_opener="મને ખૂબ જ ખેદ છે. ",
        hindi_opener="मुझे बहुत खेद है। ",
        english_opener="I sincerely apologize. ",
    ),
}


# ── TEXT KEYWORDS ────────────────────────────────────────────────────────────

# Keyword table for the text layer: emotion -> language -> trigger phrases.
#
# * Insertion order is significant. EmotionEngine.analyze_turn walks this dict
#   and stops at the first emotion with a hit, so URGENT outranks FRUSTRATED,
#   which outranks ANGRY, which outranks HAPPY.
# * Phrases are tested as substrings of the lowercased transcript, so a short
#   entry also fires inside longer words.
# * The "english" list is checked for every caller language, in addition to
#   the list for the caller's own language.
# * CALM and CONFUSED have no entries, so the text layer never reports them.
TEXT_KEYWORDS: dict[Emotion, dict[str, list[str]]] = {
    # Pain / emergency vocabulary.
    Emotion.URGENT: {
        "gujarati": ["દુઃખાવો", "પીડા", "ઇમર્જન્સી", "તાત્કાલિક", "અત્યારે જ", "હમણાં જ", "દુઃખે છે", "તકલીફ"],
        "hindi": ["दर्द", "तकलीफ", "पीड़ा", "इमरजेंसी", "तुरंत", "अभी", "जल्दी", "अर्जेंट"],
        "english": ["pain", "emergency", "urgent", "immediately", "hurts", "bleeding", "critical"]
    },
    # Repetition / "why isn't this working" vocabulary.
    Emotion.FRUSTRATED: {
        "gujarati": ["કેમ નથી", "કેટલી વાર", "ફરીથી", "સમજાતું નથી", "બરાબર નથી"],
        "hindi": ["क्यों नहीं", "कितनी बार", "फिर से", "समझ नहीं", "ठीक नहीं"],
        "english": ["why not", "how long", "again", "already told", "frustrated"]
    },
    # Complaint / insult vocabulary.
    Emotion.ANGRY: {
        "gujarati": ["ગુસ્સો", "ફરિયાદ", "બેકાર", "ખરાબ", "નકામું"],
        "hindi": ["गुस्सा", "शिकायत", "बेकार", "घटिया", "बकवास"],
        "english": ["angry", "complain", "useless", "terrible", "worst"]
    },
    # Thanks / approval vocabulary.
    Emotion.HAPPY: {
        "gujarati": ["આભાર", "ધન્યવાદ", "ઉત્તમ", "ખૂબ સરસ", "સારું"],
        "hindi": ["धन्यवाद", "शुक्रिया", "बहुत अच्छा", "बढ़िया", "सही है"],
        "english": ["thanks", "thank you", "great", "perfect", "good"]
    }
}


# ── AUDIO EMOTION DETECTOR ───────────────────────────────────────────────────

class AudioEmotionDetector:
    """Lazy wrapper around the SpeechBrain wav2vec2-IEMOCAP emotion classifier.

    The model is loaded on the first :meth:`classify` call that carries audio.
    If loading fails for any reason the detector marks itself permanently
    failed and every later call returns ``(Emotion.CALM, 0.0)``, which leaves
    callers with text-only emotion detection.

    NOTE(review): the lazy load is not synchronized; if ``classify`` is invoked
    from several executor threads at once the model could be loaded more than
    once — confirm whether callers serialize the first call.
    """

    # Classifier label -> Emotion. Labels absent from this table map to CALM.
    AUDIO_EMOTION_MAP = {
        "hap": Emotion.HAPPY, "neu": Emotion.CALM, "ang": Emotion.ANGRY,
        "sad": Emotion.CONFUSED, "fru": Emotion.FRUSTRATED, "fea": Emotion.URGENT
    }

    # Clips with fewer float32 samples than this are not classified.
    # NOTE(review): 8000 samples is half a second only at 16 kHz — confirm the
    # sample rate callers actually deliver.
    MIN_SAMPLES = 8000

    def __init__(self):
        self._classifier = None
        self._loaded = False
        self._failed = False  # Permanent failure flag — avoids retry-on-import-error

    def _ensure_loaded(self):
        """Load the SpeechBrain classifier once; remember a failure permanently."""
        if self._loaded or self._failed:
            return
        try:
            # Imported here so the module still imports when speechbrain is missing.
            from speechbrain.inference.interfaces import foreign_class
            self._classifier = foreign_class(
                source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                pymodule_file="custom_interface.py",
                classname="CustomEncoderWav2vec2Classifier",
                run_opts={"device": "cpu"}
            )
            self._loaded = True
            print("[Emotion-Audio] SpeechBrain model loaded.")
        except Exception as e:
            self._failed = True
            print(f"[Emotion-Audio] Model unavailable, falling back to text-only: {e}")

    def classify(self, audio_bytes: bytes) -> tuple[Emotion, float]:
        """Classify the emotion carried by a raw audio buffer.

        Args:
            audio_bytes: Raw audio, interpreted as a flat sequence of float32
                samples. NOTE(review): sample rate and channel layout are not
                checked here — confirm they match what the model expects.

        Returns:
            ``(emotion, confidence)``. ``(Emotion.CALM, 0.0)`` is returned when
            the buffer is empty or shorter than ``MIN_SAMPLES``, when the model
            is unavailable, or when classification raises.
        """
        # Nothing to classify: answer before paying for the model load.
        if not audio_bytes:
            return Emotion.CALM, 0.0

        self._ensure_loaded()
        # After _ensure_loaded exactly one of _loaded / _failed is set, so this
        # single check also covers the permanent-failure case.
        if not self._loaded:
            return Emotion.CALM, 0.0

        try:
            samples = np.frombuffer(audio_bytes, dtype=np.float32)
            if len(samples) < self.MIN_SAMPLES:
                return Emotion.CALM, 0.0

            # np.frombuffer over ``bytes`` yields a read-only view. torch.tensor
            # copies it, so the tensor never aliases the immutable buffer.
            batch = torch.tensor(samples, dtype=torch.float32).unsqueeze(0)
            _, score, _, text_lab = self._classifier.classify_batch(batch)

            raw_label = text_lab[0].lower().strip()
            confidence = float(score[0].item())
            return self.AUDIO_EMOTION_MAP.get(raw_label, Emotion.CALM), confidence
        except Exception as e:
            # Best-effort layer: a classifier error must not break the call flow.
            print(f"[Emotion-Audio] classify error: {e}")
            return Emotion.CALM, 0.0


# ── MAIN ENGINE ───────────────────────────────────────────────────────────────

class EmotionEngine:
    """Per-turn emotion analysis: keyword text layer with an audio fallback."""

    # Confidence assigned to any keyword hit; the text layer has no score of its own.
    TEXT_MATCH_CONFIDENCE = 0.8

    def __init__(self):
        self._audio = AudioEmotionDetector()

    async def analyze_turn(self, audio: bytes, transcript: str, lang: str) -> EmotionResult:
        """Detect the caller's emotion for one turn.

        The transcript is scanned for keywords first. The audio classifier is
        run (in the default executor) only when no keyword matches.

        Args:
            audio: Raw audio for the turn, handed to the audio classifier as-is.
            transcript: Recognized text for the turn; ``None`` or ``""`` is
                treated as "no text" and falls through to the audio layer.
            lang: Key into the per-emotion keyword tables (e.g. ``"hindi"``).
                English keywords are always checked as well.

        Returns:
            An ``EmotionResult`` whose ``should_escalate`` is set for ANGRY and
            URGENT readings.
        """
        res = EmotionResult()

        # Text layer: the first emotion (in table order) with a hit wins.
        text = (transcript or "").lower()
        for emo, lang_map in TEXT_KEYWORDS.items():
            # dict.fromkeys de-duplicates while keeping order, so lang="english"
            # does not scan, and report, every English keyword twice.
            candidates = dict.fromkeys(
                lang_map.get(lang, []) + lang_map.get("english", [])
            )
            matches = [w for w in candidates if w in text]
            if matches:
                res.text_emotion = emo.value
                res.triggered_words = matches
                res.emotion = emo
                res.confidence = self.TEXT_MATCH_CONFIDENCE
                break

        # Audio layer: only consulted when the text layer found nothing.
        if not res.text_emotion:
            # We are inside a coroutine, so a running loop is guaranteed.
            loop = asyncio.get_running_loop()
            a_emo, a_conf = await loop.run_in_executor(None, self._audio.classify, audio)
            res.audio_emotion = a_emo.value
            res.emotion = a_emo
            res.confidence = a_conf

        res.should_escalate = res.emotion in (Emotion.ANGRY, Emotion.URGENT)
        print(f"[Emotion] Detected: {res.emotion.value} (conf={res.confidence:.2f})")
        return res