File size: 8,343 Bytes
5669b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import asyncio
import re
from typing import Optional, Union, Any, List, Dict
import numpy as np
import json
from loguru import logger

from ..message_handler import message_handler
from .types import WebSocketSend, BroadcastContext
from .tts_manager import TTSTaskManager
from ..agent.output_types import SentenceOutput, AudioOutput
from ..agent.input_types import BatchInput, TextData, ImageData, TextSource, ImageSource
from ..asr.asr_interface import ASRInterface
from ..live2d_model import Live2dModel
from ..tts.tts_interface import TTSInterface
from ..utils.stream_audio import prepare_audio_payload


# The helpers below are standalone functions (converted from class methods).
def create_batch_input(
    input_text: str,
    images: Optional[List[Dict[str, Any]]],
    from_name: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> BatchInput:
    """Wrap one piece of user text (and optional images) into a BatchInput.

    Args:
        input_text: Text placed in the single ``TextData`` entry, tagged with
            ``TextSource.INPUT``.
        images: Optional list of dicts; each must provide the keys
            ``"source"`` (passed to ``ImageSource(...)``), ``"data"`` and
            ``"mime_type"``. ``None`` or an empty list yields ``images=None``.
        from_name: Stored as ``from_name`` on the text entry.
        metadata: Passed through unchanged to ``BatchInput``.

    Returns:
        The assembled ``BatchInput``.
    """
    text_entry = TextData(
        source=TextSource.INPUT, content=input_text, from_name=from_name
    )

    # An absent or empty image list is represented as None, not [].
    image_entries = None
    if images:
        image_entries = [
            ImageData(
                source=ImageSource(entry["source"]),
                data=entry["data"],
                mime_type=entry["mime_type"],
            )
            for entry in images
        ]

    return BatchInput(texts=[text_entry], images=image_entries, metadata=metadata)


async def process_agent_output(
    output: Union[AudioOutput, SentenceOutput],
    character_config: Any,
    live2d_model: Live2dModel,
    tts_engine: TTSInterface,
    websocket_send: WebSocketSend,
    tts_manager: TTSTaskManager,
    translate_engine: Optional[Any] = None,
) -> str:
    """Stamp character info onto the output and route it to the right handler.

    ``output.display_text`` gets its ``name`` and ``avatar`` set from
    ``character_config`` before dispatch. A ``SentenceOutput`` goes to
    ``handle_sentence_output``; an ``AudioOutput`` goes to
    ``handle_audio_output``; any other type is only logged as a warning.

    Any exception raised while handling is logged and reported to the client
    as a JSON ``{"type": "error", ...}`` message instead of propagating.

    Returns:
        The text returned by the chosen handler, or ``""`` when the type is
        unknown or handling failed.
    """
    output.display_text.name = character_config.character_name
    output.display_text.avatar = character_config.avatar

    response_text = ""
    try:
        if isinstance(output, SentenceOutput):
            response_text = await handle_sentence_output(
                output=output,
                live2d_model=live2d_model,
                tts_engine=tts_engine,
                websocket_send=websocket_send,
                tts_manager=tts_manager,
                translate_engine=translate_engine,
            )
        elif isinstance(output, AudioOutput):
            response_text = await handle_audio_output(
                output=output, websocket_send=websocket_send
            )
        else:
            logger.warning(f"Unknown output type: {type(output)}")
    except Exception as e:
        logger.error(f"Error processing agent output: {e}")
        # Tell the client about the failure rather than leaving it waiting.
        error_payload = {
            "type": "error",
            "message": f"Error processing response: {str(e)}",
        }
        await websocket_send(json.dumps(error_payload))

    return response_text


async def handle_sentence_output(
    output: SentenceOutput,
    live2d_model: Live2dModel,
    tts_engine: TTSInterface,
    websocket_send: WebSocketSend,
    tts_manager: TTSTaskManager,
    translate_engine: Optional[Any] = None,
) -> str:
    """Speak each sentence of a SentenceOutput, translating the TTS text if possible.

    Iterates ``output`` asynchronously; every item is a
    ``(display_text, tts_text, actions)`` triple. When ``translate_engine`` is
    truthy, ``tts_text`` is replaced by ``translate_engine.translate(tts_text)``
    unless it consists solely of characters matched by the punctuation /
    whitespace pattern below. Each sentence is then passed to
    ``tts_manager.speak``.

    Returns:
        The concatenation of ``display_text.text`` over all sentences.
    """
    spoken_so_far = ""
    async for display_text, tts_text, actions in output:
        logger.debug(f"๐Ÿƒ Processing output: '''{tts_text}'''...")

        if not translate_engine:
            logger.debug("๐Ÿšซ No translation engine available. Skipping translation.")
        else:
            # Skip the translator when nothing is left after stripping the
            # whitespace and punctuation characters listed in the pattern.
            remainder = re.sub(r'[\s.,!?๏ผŒใ€‚๏ผ๏ผŸ\'"ใ€ใ€๏ผ‰ใ€‘\s]+', "", tts_text)
            if remainder:
                tts_text = translate_engine.translate(tts_text)
            logger.info(f"๐Ÿƒ Text after translation: '''{tts_text}'''...")

        spoken_so_far += display_text.text
        await tts_manager.speak(
            tts_text=tts_text,
            display_text=display_text,
            actions=actions,
            live2d_model=live2d_model,
            tts_engine=tts_engine,
            websocket_send=websocket_send,
        )
    return spoken_so_far


async def handle_audio_output(
    output: AudioOutput,
    websocket_send: WebSocketSend,
) -> str:
    """Forward every audio item of an AudioOutput to the client as JSON.

    Iterates ``output`` asynchronously; every item is an
    ``(audio_path, display_text, transcript, actions)`` tuple. Each item is
    turned into a payload with ``prepare_audio_payload`` and sent through
    ``websocket_send``.

    Returns:
        The concatenation of all ``transcript`` values.
    """
    transcript_so_far = ""
    async for audio_path, display_text, transcript, actions in output:
        transcript_so_far += transcript

        # Falsy actions are sent as None; otherwise as their dict form.
        if actions:
            actions_dict = actions.to_dict()
        else:
            actions_dict = None

        payload = prepare_audio_payload(
            audio_path=audio_path,
            display_text=display_text,
            actions=actions_dict,
        )
        await websocket_send(json.dumps(payload))
    return transcript_so_far


async def send_conversation_start_signals(websocket_send: WebSocketSend) -> None:
    """Send the two opening messages of a conversation chain.

    Sends, in order, a ``control`` message with text
    ``"conversation-chain-start"`` and a ``full-text`` message with text
    ``"Thinking..."``, each serialized with ``json.dumps``.
    """
    opening_messages = (
        {"type": "control", "text": "conversation-chain-start"},
        {"type": "full-text", "text": "Thinking..."},
    )
    for message in opening_messages:
        await websocket_send(json.dumps(message))


async def process_user_input(
    user_input: Union[str, np.ndarray],
    asr_engine: ASRInterface,
    websocket_send: WebSocketSend,
) -> str:
    """Return the user's input as text, transcribing it first if it is audio.

    Args:
        user_input: Text, or a NumPy array to be transcribed.
        asr_engine: Used only for arrays, via ``async_transcribe_np``.
        websocket_send: Used only for arrays, to send the transcription to
            the client as a ``"user-input-transcription"`` message.

    Returns:
        The transcription for array input; otherwise ``user_input`` unchanged.
    """
    # Anything that is not an ndarray is passed straight through.
    if not isinstance(user_input, np.ndarray):
        return user_input

    logger.info("Transcribing audio input...")
    transcription = await asr_engine.async_transcribe_np(user_input)
    notice = {"type": "user-input-transcription", "text": transcription}
    await websocket_send(json.dumps(notice))
    return transcription


async def finalize_conversation_turn(
    tts_manager: TTSTaskManager,
    websocket_send: WebSocketSend,
    client_uid: str,
    broadcast_ctx: Optional[BroadcastContext] = None,
) -> None:
    """Wrap up one conversation turn and signal the end of the chain.

    If ``tts_manager.task_list`` is non-empty, awaits all of its tasks, sends
    ``"backend-synth-complete"`` and waits for the client's
    ``"frontend-playback-complete"`` response. A falsy response is logged as
    a warning and the function returns without sending anything further.

    Otherwise a ``"force-new-message"`` message is sent to the client and,
    when ``broadcast_ctx`` carries a broadcast function, to the group as
    well, followed by ``send_conversation_end_signal``.
    """
    if tts_manager.task_list:
        await asyncio.gather(*tts_manager.task_list)
        await websocket_send(json.dumps({"type": "backend-synth-complete"}))

        playback_ack = await message_handler.wait_for_response(
            client_uid, "frontend-playback-complete"
        )
        if not playback_ack:
            logger.warning(f"No playback completion response from {client_uid}")
            return

    new_message_signal = {"type": "force-new-message"}
    await websocket_send(json.dumps(new_message_signal))

    # The same signal also goes through the broadcast function, if one is set.
    broadcast = broadcast_ctx.broadcast_func if broadcast_ctx else None
    if broadcast:
        await broadcast(
            broadcast_ctx.group_members,
            new_message_signal,
            broadcast_ctx.current_client_uid,
        )

    await send_conversation_end_signal(websocket_send, broadcast_ctx)


async def send_conversation_end_signal(
    websocket_send: WebSocketSend,
    broadcast_ctx: Optional[BroadcastContext],
    session_emoji: str = "๐Ÿ˜Š",
) -> None:
    """Announce the end of the conversation chain.

    Sends a ``control`` / ``"conversation-chain-end"`` message to the client.
    The same message is passed to ``broadcast_ctx.broadcast_func`` when the
    context, its broadcast function and its ``group_members`` are all truthy.
    Finally logs completion, including ``session_emoji``.
    """
    end_signal = dict(type="control", text="conversation-chain-end")
    await websocket_send(json.dumps(end_signal))

    can_broadcast = (
        broadcast_ctx
        and broadcast_ctx.broadcast_func
        and broadcast_ctx.group_members
    )
    if can_broadcast:
        await broadcast_ctx.broadcast_func(broadcast_ctx.group_members, end_signal)

    logger.info(f"๐Ÿ˜Ž๐Ÿ‘โœ… Conversation Chain {session_emoji} completed!")


def cleanup_conversation(tts_manager: TTSTaskManager, session_emoji: str) -> None:
    """Clear the TTS manager and log the cleanup at debug level.

    Args:
        tts_manager: Manager whose ``clear()`` method is called.
        session_emoji: Included in the debug log message.
    """
    tts_manager.clear()
    cleanup_note = f"๐Ÿงน Clearing up conversation {session_emoji}."
    logger.debug(cleanup_note)


# Pool of short marker strings.
# Presumably one is picked per conversation and passed around as
# `session_emoji` for log output; the selection code is not in this section,
# so confirm against the caller.
# NOTE(review): the literals below look mis-encoded (mojibake) in this copy of
# the file; confirm the file is stored and read as UTF-8.
# NOTE(review): at least one entry ("๐ŸŽ„") appears twice in the list.
EMOJI_LIST = [
    "๐Ÿถ",
    "๐Ÿฑ",
    "๐Ÿญ",
    "๐Ÿน",
    "๐Ÿฐ",
    "๐ŸฆŠ",
    "๐Ÿป",
    "๐Ÿผ",
    "๐Ÿจ",
    "๐Ÿฏ",
    "๐Ÿฆ",
    "๐Ÿฎ",
    "๐Ÿท",
    "๐Ÿธ",
    "๐Ÿต",
    "๐Ÿ”",
    "๐Ÿง",
    "๐Ÿฆ",
    "๐Ÿค",
    "๐Ÿฃ",
    "๐Ÿฅ",
    "๐Ÿฆ†",
    "๐Ÿฆ…",
    "๐Ÿฆ‰",
    "๐Ÿฆ‡",
    "๐Ÿบ",
    "๐Ÿ—",
    "๐Ÿด",
    "๐Ÿฆ„",
    "๐Ÿ",
    "๐ŸŒต",
    "๐ŸŽ„",
    "๐ŸŒฒ",
    "๐ŸŒณ",
    "๐ŸŒด",
    "๐ŸŒฑ",
    "๐ŸŒฟ",
    "โ˜˜๏ธ",
    "๐Ÿ€",
    "๐Ÿ‚",
    "๐Ÿ",
    "๐Ÿ„",
    "๐ŸŒพ",
    "๐Ÿ’",
    "๐ŸŒน",
    "๐ŸŒธ",
    "๐ŸŒ›",
    "๐ŸŒ",
    "โญ๏ธ",
    "๐Ÿ”ฅ",
    "๐ŸŒˆ",
    "๐ŸŒฉ",
    "โ›„๏ธ",
    "๐ŸŽƒ",
    "๐ŸŽ„",
    "๐ŸŽ‰",
    "๐ŸŽ",
    "๐ŸŽ—",
    "๐Ÿ€„๏ธ",
    "๐ŸŽญ",
    "๐ŸŽจ",
    "๐Ÿงต",
    "๐Ÿชก",
    "๐Ÿงถ",
    "๐Ÿฅฝ",
    "๐Ÿฅผ",
    "๐Ÿฆบ",
    "๐Ÿ‘”",
    "๐Ÿ‘•",
    "๐Ÿ‘œ",
    "๐Ÿ‘‘",
]