File size: 42,116 Bytes
f4baae1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
"""
OpenAI API endpoints
"""

import time
import json
import asyncio
from datetime import datetime
from typing import List, Dict, Any
from fastapi import APIRouter, Header, HTTPException
from fastapi.responses import StreamingResponse
import httpx

from app.core.config import settings
from app.models.schemas import OpenAIRequest, Message, ModelsResponse, Model
from app.utils.helpers import debug_log
from app.core.zai_transformer import ZAITransformer, generate_uuid
from app.utils.sse_tool_handler import SSEToolHandler

# Router on which this module's endpoint handlers are registered
router = APIRouter()

# Global transformer instance, created once at import time
transformer = ZAITransformer()


@router.get("/v1/models")
async def list_models():
    """Return the models this service advertises.

    Each configured model id is wrapped in a ``Model`` entry that shares one
    creation timestamp (taken when the handler runs) and is attributed to
    ``z.ai``. The entries are returned inside a ``ModelsResponse``.
    """
    now = int(time.time())

    # Order here is the order the models appear in the response.
    model_ids = (
        settings.PRIMARY_MODEL,
        settings.THINKING_MODEL,
        settings.SEARCH_MODEL,
        settings.AIR_MODEL,
        settings.GLM_46_MODEL,
        settings.GLM_46_THINKING_MODEL,
    )

    entries = [
        Model(id=model_id, created=now, owned_by="z.ai")
        for model_id in model_ids
    ]
    return ModelsResponse(data=entries)


@router.post("/v1/chat/completions")
async def chat_completions(request: OpenAIRequest, authorization: str = Header(...)):
    """Handle chat completion requests with ZAI transformer"""
    role = request.messages[0].role if request.messages else "unknown"
    debug_log(f"😶‍🌫️ 收到 客户端 请求 - 模型: {request.model}, 流式: {request.stream}, 消息数: {len(request.messages)}, 角色: {role}, 工具数: {len(request.tools) if request.tools else 0}")
    
    try:
        # Validate API key (skip if SKIP_AUTH_TOKEN is enabled)
        if not settings.SKIP_AUTH_TOKEN:
            if not authorization.startswith("Bearer "):
                raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
            
            api_key = authorization[7:]
            if api_key != settings.AUTH_TOKEN:
                raise HTTPException(status_code=401, detail="Invalid API key")
            
        # 使用新的转换器转换请求
        request_dict = request.model_dump()
        debug_log("🔄 开始转换请求格式: OpenAI -> Z.AI")
        
        transformed = await transformer.transform_request_in(request_dict)

        # 调用上游API
        async def stream_response():
            """流式响应生成器(包含重试机制)"""
            retry_count = 0
            last_error = None
            current_token = transformed.get("token", "")  # 获取当前使用的token

            while retry_count <= settings.MAX_RETRIES:
                try:
                    # 如果是重试,重新获取令牌并更新请求
                    if retry_count > 0:
                        delay = 2.0
                        debug_log(f"重试请求 ({retry_count}/{settings.MAX_RETRIES}) - 等待 {delay:.1f}s")
                        await asyncio.sleep(delay)

                        # 标记前一个token失败
                        if current_token:
                            transformer.mark_token_failure(current_token, Exception(f"Retry {retry_count}: {last_error}"))

                        # 重新获取令牌
                        debug_log("🔑 重新获取令牌用于重试...")
                        new_token = await transformer.get_token()
                        if not new_token:
                            debug_log("❌ 重试时无法获取有效的认证令牌")
                            raise Exception("重试时无法获取有效的认证令牌")
                        transformed["config"]["headers"]["Authorization"] = f"Bearer {new_token}"
                        current_token = new_token

                    async with httpx.AsyncClient(timeout=60.0) as client:
                        # 发送请求到上游
                        # debug_log(f"🎯 发送请求到 Z.AI: {transformed['config']['url']}")
                        async with client.stream(
                            "POST",
                            transformed["config"]["url"],
                            json=transformed["body"],
                            headers=transformed["config"]["headers"],
                        ) as response:
                            # 检查响应状态码
                            if response.status_code == 400:
                                # 400 错误,触发重试
                                error_text = await response.aread()
                                error_msg = error_text.decode('utf-8', errors='ignore')
                                debug_log(f"❌ 上游返回 400 错误 (尝试 {retry_count + 1}/{settings.MAX_RETRIES + 1})")
                                debug_log(f"上游错误响应: {error_msg}")

                                retry_count += 1
                                last_error = f"400 Bad Request: {error_msg}"

                                # 如果还有重试机会,继续循环
                                if retry_count <= settings.MAX_RETRIES:
                                    continue
                                else:
                                    # 达到最大重试次数,抛出错误
                                    debug_log(f"❌ 达到最大重试次数 ({settings.MAX_RETRIES}),请求失败")
                                    error_response = {
                                        "error": {
                                            "message": f"Request failed after {settings.MAX_RETRIES} retries: {last_error}",
                                            "type": "upstream_error",
                                            "code": 400
                                        }
                                    }
                                    yield f"data: {json.dumps(error_response)}\n\n"
                                    yield "data: [DONE]\n\n"
                                    return

                            elif response.status_code == 401:
                                # 认证错误,可能需要重新获取token
                                debug_log(f"❌ 认证失败 (401),标记token失效")
                                if current_token:
                                    transformer.mark_token_failure(current_token, Exception("401 Unauthorized"))
                                
                                retry_count += 1
                                last_error = "401 Unauthorized - Token may be invalid"
                                
                                if retry_count <= settings.MAX_RETRIES:
                                    continue
                                else:
                                    error_response = {
                                        "error": {
                                            "message": "Authentication failed after retries",
                                            "type": "auth_error",
                                            "code": 401
                                        }
                                    }
                                    yield f"data: {json.dumps(error_response)}\n\n"
                                    yield "data: [DONE]\n\n"
                                    return
                            
                            elif response.status_code == 429:
                                # 速率限制,延长等待时间重试
                                debug_log(f"❌ 速率限制 (429),将延长等待时间重试")
                                retry_count += 1
                                last_error = "429 Rate Limited"
                                
                                if retry_count <= settings.MAX_RETRIES:
                                    continue
                                else:
                                    error_response = {
                                        "error": {
                                            "message": "Rate limit exceeded",
                                            "type": "rate_limit_error", 
                                            "code": 429
                                        }
                                    }
                                    yield f"data: {json.dumps(error_response)}\n\n"
                                    yield "data: [DONE]\n\n"
                                    return
                            
                            elif response.status_code != 200:
                                # 其他错误,检查是否需要重试
                                error_text = await response.aread()
                                error_msg = error_text.decode('utf-8', errors='ignore')
                                debug_log(f"❌ 上游返回错误: {response.status_code}, 详情: {error_msg}")
                                
                                # 某些错误可以重试
                                retryable_codes = [502, 503, 504]
                                if response.status_code in retryable_codes and retry_count < settings.MAX_RETRIES:
                                    retry_count += 1
                                    last_error = f"{response.status_code}: {error_msg}"
                                    debug_log(f"⚠️ 服务器错误 {response.status_code},准备重试")
                                    continue
                                
                                # 不可重试的错误或已达到重试上限
                                error_response = {
                                    "error": {
                                        "message": f"Upstream error: {response.status_code}",
                                        "type": "upstream_error",
                                        "code": response.status_code,
                                        "details": error_msg[:500]  # 限制错误详情长度
                                    }
                                }
                                yield f"data: {json.dumps(error_response)}\n\n"
                                yield "data: [DONE]\n\n"
                                return

                            # 200 成功,处理响应
                            debug_log(f"✅ Z.AI 响应成功,开始处理 SSE 流")
                            if retry_count > 0:
                                debug_log(f"✨ 第 {retry_count} 次重试成功")

                            # 标记token使用成功
                            if current_token:
                                transformer.mark_token_success(current_token)

                            # 初始化工具处理器(如果需要)
                            has_tools = transformed["body"].get("tools") is not None
                            has_mcp_servers = bool(transformed["body"].get("mcp_servers"))
                            tool_handler = None

                            # 如果有工具定义或MCP服务器,都需要工具处理器
                            if has_tools or has_mcp_servers:
                                chat_id = transformed["body"]["chat_id"]
                                model = request.model
                                tool_handler = SSEToolHandler(chat_id, model)

                                if has_tools and has_mcp_servers:
                                    debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('tools', []))} 个OpenAI工具 + {len(transformed['body'].get('mcp_servers', []))} 个MCP服务器")
                                elif has_tools:
                                    debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('tools', []))} 个OpenAI工具")
                                elif has_mcp_servers:
                                    debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('mcp_servers', []))} 个MCP服务器")

                            # 处理状态
                            has_thinking = False
                            thinking_signature = None
                            first_thinking_chunk = True

                            # 处理SSE流 - 优化的buffer处理
                            buffer = bytearray()
                            incomplete_line = ""
                            line_count = 0
                            chunk_count = 0
                            last_activity = time.time()
                            debug_log("📡 开始接收 SSE 流数据...")

                            async for chunk in response.aiter_bytes():
                                chunk_count += 1
                                last_activity = time.time()
                                
                                if not chunk:
                                    continue

                                # 将新数据添加到buffer
                                buffer.extend(chunk)
                                
                                # 尝试解码并处理完整的行
                                try:
                                    # 解码为字符串并处理
                                    text_data = buffer.decode('utf-8')
                                    
                                    # 分割为行
                                    lines = text_data.split('\n')
                                    
                                    # 最后一行可能不完整,保存到incomplete_line
                                    if not text_data.endswith('\n'):
                                        incomplete_line = lines[-1]
                                        lines = lines[:-1]
                                    else:
                                        # 如果有未完成的行,将其与第一行合并
                                        if incomplete_line:
                                            lines[0] = incomplete_line + lines[0]
                                            incomplete_line = ""
                                    
                                    # 清空buffer,开始处理新的数据
                                    buffer = bytearray()
                                    if incomplete_line:
                                        buffer.extend(incomplete_line.encode('utf-8'))
                                    
                                    # 处理完整的行
                                    for current_line in lines:
                                        line_count += 1
                                        if not current_line.strip():
                                            continue

                                        if current_line.startswith("data:"):
                                            chunk_str = current_line[5:].strip()
                                            if not chunk_str or chunk_str == "[DONE]":
                                                if chunk_str == "[DONE]":
                                                    debug_log("📡 收到 [DONE] 信号")
                                                    yield "data: [DONE]\n\n"
                                                continue

                                            # debug_log(f"📦 解析数据块: {chunk_str[:200]}..." if len(chunk_str) > 200 else f"📦 解析数据块: {chunk_str}")

                                            try:
                                                chunk = json.loads(chunk_str)

                                                if chunk.get("type") == "chat:completion":
                                                    data = chunk.get("data", {})
                                                    phase = data.get("phase")

                                                    # 记录每个阶段(只在阶段变化时记录)
                                                    if phase and phase != getattr(stream_response, '_last_phase', None):
                                                        debug_log(f"📈 SSE 阶段: {phase}")
                                                        stream_response._last_phase = phase

                                                    # 处理工具调用
                                                    if phase == "tool_call" and tool_handler:
                                                        for output in tool_handler.process_tool_call_phase(data, True):
                                                            yield output

                                                    # 处理其他阶段(工具结束)
                                                    elif phase == "other" and tool_handler:
                                                        for output in tool_handler.process_other_phase(data, True):
                                                            yield output

                                                    # 处理思考内容
                                                    elif phase == "thinking":
                                                        if not has_thinking:
                                                            has_thinking = True
                                                            # 发送初始角色
                                                            role_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {"role": "assistant"},
                                                                        "finish_reason": None,
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            yield f"data: {json.dumps(role_chunk)}\n\n"

                                                        delta_content = data.get("delta_content", "")
                                                        if delta_content:
                                                            # 处理思考内容格式
                                                            if delta_content.startswith("<details"):
                                                                content = (
                                                                    delta_content.split("</summary>\n>")[-1].strip()
                                                                    if "</summary>\n>" in delta_content
                                                                    else delta_content
                                                                )
                                                            else:
                                                                content = delta_content
                                                            
                                                            # 第一个思考块添加<think>开始标签,其他块保持纯内容
                                                            if first_thinking_chunk:
                                                                formatted_content = f"<think>{content}"
                                                                first_thinking_chunk = False
                                                            else:
                                                                formatted_content = content

                                                            thinking_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {
                                                                            "role": "assistant",
                                                                            "content": formatted_content,
                                                                        },
                                                                        "finish_reason": None,
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            yield f"data: {json.dumps(thinking_chunk)}\n\n"

                                                    # 处理答案内容
                                                    elif phase == "answer":
                                                        edit_content = data.get("edit_content", "")
                                                        delta_content = data.get("delta_content", "")

                                                        # 如果还没有发送角色,先发送角色chunk
                                                        if not has_thinking:
                                                            has_thinking = True  # 设置标志避免重复发送
                                                            role_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {"role": "assistant"},
                                                                        "finish_reason": None,
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            debug_log("➡️ 发送初始角色chunk")
                                                            yield f"data: {json.dumps(role_chunk)}\n\n"

                                                        # 处理思考结束和答案开始
                                                        if edit_content and "</details>\n" in edit_content:
                                                            if has_thinking and not first_thinking_chunk:
                                                                # 发送思考结束标记</think>
                                                                thinking_signature = str(int(time.time() * 1000))
                                                                sig_chunk = {
                                                                    "choices": [
                                                                        {
                                                                            "delta": {
                                                                                "role": "assistant",
                                                                                "content": "</think>",
                                                                            },
                                                                            "finish_reason": None,
                                                                            "index": 0,
                                                                            "logprobs": None,
                                                                        }
                                                                    ],
                                                                    "created": int(time.time()),
                                                                    "id": transformed["body"]["chat_id"],
                                                                    "model": request.model,
                                                                    "object": "chat.completion.chunk",
                                                                    "system_fingerprint": "fp_zai_001",
                                                                }
                                                                yield f"data: {json.dumps(sig_chunk)}\n\n"

                                                            # 提取答案内容
                                                            content_after = edit_content.split("</details>\n")[-1]
                                                            if content_after:
                                                                content_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {
                                                                            "role": "assistant",
                                                                            "content": content_after,
                                                                        },
                                                                        "finish_reason": None,
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            yield f"data: {json.dumps(content_chunk)}\n\n"

                                                        # 处理增量内容
                                                        elif delta_content:
                                                            # 如果还没有发送角色
                                                            if not has_thinking:
                                                                has_thinking = True  # 避免重复发送
                                                                role_chunk = {
                                                                    "choices": [
                                                                        {
                                                                            "delta": {"role": "assistant"},
                                                                            "finish_reason": None,
                                                                            "index": 0,
                                                                            "logprobs": None,
                                                                        }
                                                                    ],
                                                                    "created": int(time.time()),
                                                                    "id": transformed["body"]["chat_id"],
                                                                    "model": request.model,
                                                                    "object": "chat.completion.chunk",
                                                                    "system_fingerprint": "fp_zai_001",
                                                                }
                                                                debug_log("➡️ 发送初始角色chunk")
                                                                yield f"data: {json.dumps(role_chunk)}\n\n"

                                                            content_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {
                                                                            "content": delta_content,
                                                                        },
                                                                        "finish_reason": None,
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            output_data = f"data: {json.dumps(content_chunk)}\n\n"
                                                            # debug_log(f"➡️ 输出内容块到客户端: {delta_content[:50]}...")
                                                            yield output_data

                                                    # 处理完成 - 当收到usage信息时
                                                    if data.get("usage"):
                                                        debug_log(f"📦 完成响应 - 使用统计: {json.dumps(data['usage'])}")

                                                        # 只有在非工具调用模式下才发送普通完成信号
                                                        if not tool_handler or not tool_handler.has_tool_call:
                                                            finish_chunk = {
                                                                "choices": [
                                                                    {
                                                                        "delta": {},  # 空的delta表示结束
                                                                        "finish_reason": "stop",
                                                                        "index": 0,
                                                                        "logprobs": None,
                                                                    }
                                                                ],
                                                                "usage": data["usage"],
                                                                "created": int(time.time()),
                                                                "id": transformed["body"]["chat_id"],
                                                                "model": request.model,
                                                                "object": "chat.completion.chunk",
                                                                "system_fingerprint": "fp_zai_001",
                                                            }
                                                            finish_output = f"data: {json.dumps(finish_chunk)}\n\n"
                                                            debug_log("➡️ 发送完成信号")
                                                            yield finish_output
                                                            debug_log("➡️ 发送 [DONE]")
                                                            yield "data: [DONE]\n\n"

                                            except json.JSONDecodeError as e:
                                                debug_log(f"❌ JSON解析错误: {e}, 内容: {chunk_str[:200]}")
                                            except Exception as e:
                                                debug_log(f"❌ 处理chunk错误: {e}")
                                
                                except UnicodeDecodeError:
                                    # 如果解码失败,可能是数据不完整,继续接收
                                    debug_log(f"⚠️ 数据解码失败,缓冲区大小: {len(buffer)}")
                                    if len(buffer) > 1024 * 1024:  # 1MB限制
                                        debug_log("❌ 缓冲区过大,清空重试")
                                        buffer = bytearray()
                                        incomplete_line = ""
                                except Exception as e:
                                    debug_log(f"❌ Buffer处理异常: {e}")
                                    # 清空buffer继续处理
                                    buffer = bytearray()
                                    incomplete_line = ""
                                
                                # 检查是否长时间没有活动(超时检查)
                                if time.time() - last_activity > 30:  # 30秒超时
                                    debug_log("⚠️ 检测到长时间无活动,可能连接中断")
                                    break

                            # 确保发送结束信号
                            if not tool_handler or not tool_handler.has_tool_call:
                                debug_log("📤 发送最终 [DONE] 信号")
                                yield "data: [DONE]\n\n"

                            debug_log(f"✅ SSE 流处理完成,共处理 {line_count} 行数据,{chunk_count} 个数据块")
                            
                            # 检查处理完整性
                            is_complete = True
                            completion_issues = []
                            
                            if line_count == 0:
                                is_complete = False
                                completion_issues.append("没有处理任何数据行")
                            elif chunk_count == 0:
                                is_complete = False
                                completion_issues.append("没有收到任何数据块")
                            elif chunk_count > 0:
                                debug_log(f"📊 平均每个数据块包含 {line_count/chunk_count:.1f} 行")
                            
                            # 检查工具调用完整性
                            if tool_handler and tool_handler.has_tool_call:
                                if not tool_handler.completed_tools:
                                    completion_issues.append("工具调用未正常完成")
                                else:
                                    debug_log(f"✅ 工具调用完成: {len(tool_handler.completed_tools)} 个工具")
                            
                            # 检查思考内容完整性(只有真正的thinking模式才需要签名)
                            # 注意:普通的answer阶段不需要thinking签名,只有thinking阶段才需要
                            # if has_thinking and not thinking_signature:
                            #     completion_issues.append("思考内容缺少签名")
                            
                            # 报告完整性状态
                            if is_complete and not completion_issues:
                                debug_log("✅ 响应完整性检查通过")
                            else:
                                debug_log(f"⚠️ 响应完整性问题: {', '.join(completion_issues)}")
                                
                                # 如果问题严重且还有重试机会,考虑重试
                                critical_issues = ["没有处理任何数据行", "没有收到任何数据块"]
                                has_critical_issue = any(issue in completion_issues for issue in critical_issues)
                                
                                if has_critical_issue and retry_count < settings.MAX_RETRIES:
                                    debug_log("🔄 检测到严重完整性问题,准备重试")
                                    retry_count += 1
                                    last_error = f"Incomplete response: {', '.join(completion_issues)}"
                                    continue
                            
                            # 成功处理完成,退出重试循环
                            return

                except Exception as e:
                    debug_log(f"❌ 流处理错误: {e}")
                    import traceback
                    debug_log(traceback.format_exc())

                    # 标记token失败
                    if current_token:
                        transformer.mark_token_failure(current_token, e)

                    # 检查是否还可以重试
                    retry_count += 1
                    last_error = str(e)

                    if retry_count > settings.MAX_RETRIES:
                        # 达到最大重试次数,返回错误
                        debug_log(f"❌ 达到最大重试次数 ({settings.MAX_RETRIES}),流处理失败")
                        error_response = {
                            "error": {
                                "message": f"Stream processing failed after {settings.MAX_RETRIES} retries: {last_error}",
                                "type": "stream_error"
                            }
                        }
                        yield f"data: {json.dumps(error_response)}\n\n"
                        yield "data: [DONE]\n\n"
                        return

        # 返回流式响应
        debug_log("🚀 启动 SSE 流式响应")
        
        # 创建一个包装的生成器来追踪数据流
        async def logged_stream():
            """Relay every chunk produced by ``stream_response()`` to the caller.

            Acts as a thin pass-through wrapper: it logs when streaming
            starts, counts the chunks it forwards, logs the total once the
            inner generator is exhausted, and logs then re-raises any
            ``Exception`` that interrupts the stream.

            Yields:
                Each chunk from ``stream_response()``, unchanged and in order.

            Raises:
                Exception: Whatever the inner generator (or logging) raised,
                    re-raised after the interruption has been logged.
            """
            sent = 0  # number of chunks forwarded so far
            try:
                debug_log("📤 开始向客户端流式传输数据...")
                source = stream_response()
                async for piece in source:
                    sent += 1
                    yield piece
                # Only reached when the inner generator finished normally.
                debug_log(f"✅ 流式传输完成,共发送 {sent} 个数据块")
            except Exception as exc:
                # Record the interruption, then let the framework see the error.
                debug_log(f"❌ 流式传输中断: {exc}")
                raise
        
        return StreamingResponse(
            logged_stream(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            },
        )

    except HTTPException:
        raise
    except Exception as e:
        debug_log(f"❌ 处理请求时发生错误: {str(e)}")
        import traceback

        debug_log(f"❌ 错误堆栈: {traceback.format_exc()}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")