Spaces:
Sleeping
Sleeping
File size: 42,116 Bytes
f4baae1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 |
"""
OpenAI API endpoints
"""
import time
import json
import asyncio
from datetime import datetime
from typing import List, Dict, Any
from fastapi import APIRouter, Header, HTTPException
from fastapi.responses import StreamingResponse
import httpx
from app.core.config import settings
from app.models.schemas import OpenAIRequest, Message, ModelsResponse, Model
from app.utils.helpers import debug_log
from app.core.zai_transformer import ZAITransformer, generate_uuid
from app.utils.sse_tool_handler import SSEToolHandler
router = APIRouter()
# 全局转换器实例
transformer = ZAITransformer()
@router.get("/v1/models")
async def list_models():
"""List available models"""
current_time = int(time.time())
response = ModelsResponse(
data=[
Model(id=settings.PRIMARY_MODEL, created=current_time, owned_by="z.ai"),
Model(id=settings.THINKING_MODEL, created=current_time, owned_by="z.ai"),
Model(id=settings.SEARCH_MODEL, created=current_time, owned_by="z.ai"),
Model(id=settings.AIR_MODEL, created=current_time, owned_by="z.ai"),
Model(id=settings.GLM_46_MODEL, created=current_time, owned_by="z.ai"),
Model(id=settings.GLM_46_THINKING_MODEL, created=current_time, owned_by="z.ai"),
]
)
return response
@router.post("/v1/chat/completions")
async def chat_completions(request: OpenAIRequest, authorization: str = Header(...)):
"""Handle chat completion requests with ZAI transformer"""
role = request.messages[0].role if request.messages else "unknown"
debug_log(f"😶🌫️ 收到 客户端 请求 - 模型: {request.model}, 流式: {request.stream}, 消息数: {len(request.messages)}, 角色: {role}, 工具数: {len(request.tools) if request.tools else 0}")
try:
# Validate API key (skip if SKIP_AUTH_TOKEN is enabled)
if not settings.SKIP_AUTH_TOKEN:
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
api_key = authorization[7:]
if api_key != settings.AUTH_TOKEN:
raise HTTPException(status_code=401, detail="Invalid API key")
# 使用新的转换器转换请求
request_dict = request.model_dump()
debug_log("🔄 开始转换请求格式: OpenAI -> Z.AI")
transformed = await transformer.transform_request_in(request_dict)
# 调用上游API
async def stream_response():
"""流式响应生成器(包含重试机制)"""
retry_count = 0
last_error = None
current_token = transformed.get("token", "") # 获取当前使用的token
while retry_count <= settings.MAX_RETRIES:
try:
# 如果是重试,重新获取令牌并更新请求
if retry_count > 0:
delay = 2.0
debug_log(f"重试请求 ({retry_count}/{settings.MAX_RETRIES}) - 等待 {delay:.1f}s")
await asyncio.sleep(delay)
# 标记前一个token失败
if current_token:
transformer.mark_token_failure(current_token, Exception(f"Retry {retry_count}: {last_error}"))
# 重新获取令牌
debug_log("🔑 重新获取令牌用于重试...")
new_token = await transformer.get_token()
if not new_token:
debug_log("❌ 重试时无法获取有效的认证令牌")
raise Exception("重试时无法获取有效的认证令牌")
transformed["config"]["headers"]["Authorization"] = f"Bearer {new_token}"
current_token = new_token
async with httpx.AsyncClient(timeout=60.0) as client:
# 发送请求到上游
# debug_log(f"🎯 发送请求到 Z.AI: {transformed['config']['url']}")
async with client.stream(
"POST",
transformed["config"]["url"],
json=transformed["body"],
headers=transformed["config"]["headers"],
) as response:
# 检查响应状态码
if response.status_code == 400:
# 400 错误,触发重试
error_text = await response.aread()
error_msg = error_text.decode('utf-8', errors='ignore')
debug_log(f"❌ 上游返回 400 错误 (尝试 {retry_count + 1}/{settings.MAX_RETRIES + 1})")
debug_log(f"上游错误响应: {error_msg}")
retry_count += 1
last_error = f"400 Bad Request: {error_msg}"
# 如果还有重试机会,继续循环
if retry_count <= settings.MAX_RETRIES:
continue
else:
# 达到最大重试次数,抛出错误
debug_log(f"❌ 达到最大重试次数 ({settings.MAX_RETRIES}),请求失败")
error_response = {
"error": {
"message": f"Request failed after {settings.MAX_RETRIES} retries: {last_error}",
"type": "upstream_error",
"code": 400
}
}
yield f"data: {json.dumps(error_response)}\n\n"
yield "data: [DONE]\n\n"
return
elif response.status_code == 401:
# 认证错误,可能需要重新获取token
debug_log(f"❌ 认证失败 (401),标记token失效")
if current_token:
transformer.mark_token_failure(current_token, Exception("401 Unauthorized"))
retry_count += 1
last_error = "401 Unauthorized - Token may be invalid"
if retry_count <= settings.MAX_RETRIES:
continue
else:
error_response = {
"error": {
"message": "Authentication failed after retries",
"type": "auth_error",
"code": 401
}
}
yield f"data: {json.dumps(error_response)}\n\n"
yield "data: [DONE]\n\n"
return
elif response.status_code == 429:
# 速率限制,延长等待时间重试
debug_log(f"❌ 速率限制 (429),将延长等待时间重试")
retry_count += 1
last_error = "429 Rate Limited"
if retry_count <= settings.MAX_RETRIES:
continue
else:
error_response = {
"error": {
"message": "Rate limit exceeded",
"type": "rate_limit_error",
"code": 429
}
}
yield f"data: {json.dumps(error_response)}\n\n"
yield "data: [DONE]\n\n"
return
elif response.status_code != 200:
# 其他错误,检查是否需要重试
error_text = await response.aread()
error_msg = error_text.decode('utf-8', errors='ignore')
debug_log(f"❌ 上游返回错误: {response.status_code}, 详情: {error_msg}")
# 某些错误可以重试
retryable_codes = [502, 503, 504]
if response.status_code in retryable_codes and retry_count < settings.MAX_RETRIES:
retry_count += 1
last_error = f"{response.status_code}: {error_msg}"
debug_log(f"⚠️ 服务器错误 {response.status_code},准备重试")
continue
# 不可重试的错误或已达到重试上限
error_response = {
"error": {
"message": f"Upstream error: {response.status_code}",
"type": "upstream_error",
"code": response.status_code,
"details": error_msg[:500] # 限制错误详情长度
}
}
yield f"data: {json.dumps(error_response)}\n\n"
yield "data: [DONE]\n\n"
return
# 200 成功,处理响应
debug_log(f"✅ Z.AI 响应成功,开始处理 SSE 流")
if retry_count > 0:
debug_log(f"✨ 第 {retry_count} 次重试成功")
# 标记token使用成功
if current_token:
transformer.mark_token_success(current_token)
# 初始化工具处理器(如果需要)
has_tools = transformed["body"].get("tools") is not None
has_mcp_servers = bool(transformed["body"].get("mcp_servers"))
tool_handler = None
# 如果有工具定义或MCP服务器,都需要工具处理器
if has_tools or has_mcp_servers:
chat_id = transformed["body"]["chat_id"]
model = request.model
tool_handler = SSEToolHandler(chat_id, model)
if has_tools and has_mcp_servers:
debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('tools', []))} 个OpenAI工具 + {len(transformed['body'].get('mcp_servers', []))} 个MCP服务器")
elif has_tools:
debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('tools', []))} 个OpenAI工具")
elif has_mcp_servers:
debug_log(f"🔧 初始化工具处理器: {len(transformed['body'].get('mcp_servers', []))} 个MCP服务器")
# 处理状态
has_thinking = False
thinking_signature = None
first_thinking_chunk = True
# 处理SSE流 - 优化的buffer处理
buffer = bytearray()
incomplete_line = ""
line_count = 0
chunk_count = 0
last_activity = time.time()
debug_log("📡 开始接收 SSE 流数据...")
async for chunk in response.aiter_bytes():
chunk_count += 1
last_activity = time.time()
if not chunk:
continue
# 将新数据添加到buffer
buffer.extend(chunk)
# 尝试解码并处理完整的行
try:
# 解码为字符串并处理
text_data = buffer.decode('utf-8')
# 分割为行
lines = text_data.split('\n')
# 最后一行可能不完整,保存到incomplete_line
if not text_data.endswith('\n'):
incomplete_line = lines[-1]
lines = lines[:-1]
else:
# 如果有未完成的行,将其与第一行合并
if incomplete_line:
lines[0] = incomplete_line + lines[0]
incomplete_line = ""
# 清空buffer,开始处理新的数据
buffer = bytearray()
if incomplete_line:
buffer.extend(incomplete_line.encode('utf-8'))
# 处理完整的行
for current_line in lines:
line_count += 1
if not current_line.strip():
continue
if current_line.startswith("data:"):
chunk_str = current_line[5:].strip()
if not chunk_str or chunk_str == "[DONE]":
if chunk_str == "[DONE]":
debug_log("📡 收到 [DONE] 信号")
yield "data: [DONE]\n\n"
continue
# debug_log(f"📦 解析数据块: {chunk_str[:200]}..." if len(chunk_str) > 200 else f"📦 解析数据块: {chunk_str}")
try:
chunk = json.loads(chunk_str)
if chunk.get("type") == "chat:completion":
data = chunk.get("data", {})
phase = data.get("phase")
# 记录每个阶段(只在阶段变化时记录)
if phase and phase != getattr(stream_response, '_last_phase', None):
debug_log(f"📈 SSE 阶段: {phase}")
stream_response._last_phase = phase
# 处理工具调用
if phase == "tool_call" and tool_handler:
for output in tool_handler.process_tool_call_phase(data, True):
yield output
# 处理其他阶段(工具结束)
elif phase == "other" and tool_handler:
for output in tool_handler.process_other_phase(data, True):
yield output
# 处理思考内容
elif phase == "thinking":
if not has_thinking:
has_thinking = True
# 发送初始角色
role_chunk = {
"choices": [
{
"delta": {"role": "assistant"},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
yield f"data: {json.dumps(role_chunk)}\n\n"
delta_content = data.get("delta_content", "")
if delta_content:
# 处理思考内容格式
if delta_content.startswith("<details"):
content = (
delta_content.split("</summary>\n>")[-1].strip()
if "</summary>\n>" in delta_content
else delta_content
)
else:
content = delta_content
# 第一个思考块添加<think>开始标签,其他块保持纯内容
if first_thinking_chunk:
formatted_content = f"<think>{content}"
first_thinking_chunk = False
else:
formatted_content = content
thinking_chunk = {
"choices": [
{
"delta": {
"role": "assistant",
"content": formatted_content,
},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
yield f"data: {json.dumps(thinking_chunk)}\n\n"
# 处理答案内容
elif phase == "answer":
edit_content = data.get("edit_content", "")
delta_content = data.get("delta_content", "")
# 如果还没有发送角色,先发送角色chunk
if not has_thinking:
has_thinking = True # 设置标志避免重复发送
role_chunk = {
"choices": [
{
"delta": {"role": "assistant"},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
debug_log("➡️ 发送初始角色chunk")
yield f"data: {json.dumps(role_chunk)}\n\n"
# 处理思考结束和答案开始
if edit_content and "</details>\n" in edit_content:
if has_thinking and not first_thinking_chunk:
# 发送思考结束标记</think>
thinking_signature = str(int(time.time() * 1000))
sig_chunk = {
"choices": [
{
"delta": {
"role": "assistant",
"content": "</think>",
},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
yield f"data: {json.dumps(sig_chunk)}\n\n"
# 提取答案内容
content_after = edit_content.split("</details>\n")[-1]
if content_after:
content_chunk = {
"choices": [
{
"delta": {
"role": "assistant",
"content": content_after,
},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
yield f"data: {json.dumps(content_chunk)}\n\n"
# 处理增量内容
elif delta_content:
# 如果还没有发送角色
if not has_thinking:
has_thinking = True # 避免重复发送
role_chunk = {
"choices": [
{
"delta": {"role": "assistant"},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
debug_log("➡️ 发送初始角色chunk")
yield f"data: {json.dumps(role_chunk)}\n\n"
content_chunk = {
"choices": [
{
"delta": {
"content": delta_content,
},
"finish_reason": None,
"index": 0,
"logprobs": None,
}
],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
output_data = f"data: {json.dumps(content_chunk)}\n\n"
# debug_log(f"➡️ 输出内容块到客户端: {delta_content[:50]}...")
yield output_data
# 处理完成 - 当收到usage信息时
if data.get("usage"):
debug_log(f"📦 完成响应 - 使用统计: {json.dumps(data['usage'])}")
# 只有在非工具调用模式下才发送普通完成信号
if not tool_handler or not tool_handler.has_tool_call:
finish_chunk = {
"choices": [
{
"delta": {}, # 空的delta表示结束
"finish_reason": "stop",
"index": 0,
"logprobs": None,
}
],
"usage": data["usage"],
"created": int(time.time()),
"id": transformed["body"]["chat_id"],
"model": request.model,
"object": "chat.completion.chunk",
"system_fingerprint": "fp_zai_001",
}
finish_output = f"data: {json.dumps(finish_chunk)}\n\n"
debug_log("➡️ 发送完成信号")
yield finish_output
debug_log("➡️ 发送 [DONE]")
yield "data: [DONE]\n\n"
except json.JSONDecodeError as e:
debug_log(f"❌ JSON解析错误: {e}, 内容: {chunk_str[:200]}")
except Exception as e:
debug_log(f"❌ 处理chunk错误: {e}")
except UnicodeDecodeError:
# 如果解码失败,可能是数据不完整,继续接收
debug_log(f"⚠️ 数据解码失败,缓冲区大小: {len(buffer)}")
if len(buffer) > 1024 * 1024: # 1MB限制
debug_log("❌ 缓冲区过大,清空重试")
buffer = bytearray()
incomplete_line = ""
except Exception as e:
debug_log(f"❌ Buffer处理异常: {e}")
# 清空buffer继续处理
buffer = bytearray()
incomplete_line = ""
# 检查是否长时间没有活动(超时检查)
if time.time() - last_activity > 30: # 30秒超时
debug_log("⚠️ 检测到长时间无活动,可能连接中断")
break
# 确保发送结束信号
if not tool_handler or not tool_handler.has_tool_call:
debug_log("📤 发送最终 [DONE] 信号")
yield "data: [DONE]\n\n"
debug_log(f"✅ SSE 流处理完成,共处理 {line_count} 行数据,{chunk_count} 个数据块")
# 检查处理完整性
is_complete = True
completion_issues = []
if line_count == 0:
is_complete = False
completion_issues.append("没有处理任何数据行")
elif chunk_count == 0:
is_complete = False
completion_issues.append("没有收到任何数据块")
elif chunk_count > 0:
debug_log(f"📊 平均每个数据块包含 {line_count/chunk_count:.1f} 行")
# 检查工具调用完整性
if tool_handler and tool_handler.has_tool_call:
if not tool_handler.completed_tools:
completion_issues.append("工具调用未正常完成")
else:
debug_log(f"✅ 工具调用完成: {len(tool_handler.completed_tools)} 个工具")
# 检查思考内容完整性(只有真正的thinking模式才需要签名)
# 注意:普通的answer阶段不需要thinking签名,只有thinking阶段才需要
# if has_thinking and not thinking_signature:
# completion_issues.append("思考内容缺少签名")
# 报告完整性状态
if is_complete and not completion_issues:
debug_log("✅ 响应完整性检查通过")
else:
debug_log(f"⚠️ 响应完整性问题: {', '.join(completion_issues)}")
# 如果问题严重且还有重试机会,考虑重试
critical_issues = ["没有处理任何数据行", "没有收到任何数据块"]
has_critical_issue = any(issue in completion_issues for issue in critical_issues)
if has_critical_issue and retry_count < settings.MAX_RETRIES:
debug_log("🔄 检测到严重完整性问题,准备重试")
retry_count += 1
last_error = f"Incomplete response: {', '.join(completion_issues)}"
continue
# 成功处理完成,退出重试循环
return
except Exception as e:
debug_log(f"❌ 流处理错误: {e}")
import traceback
debug_log(traceback.format_exc())
# 标记token失败
if current_token:
transformer.mark_token_failure(current_token, e)
# 检查是否还可以重试
retry_count += 1
last_error = str(e)
if retry_count > settings.MAX_RETRIES:
# 达到最大重试次数,返回错误
debug_log(f"❌ 达到最大重试次数 ({settings.MAX_RETRIES}),流处理失败")
error_response = {
"error": {
"message": f"Stream processing failed after {settings.MAX_RETRIES} retries: {last_error}",
"type": "stream_error"
}
}
yield f"data: {json.dumps(error_response)}\n\n"
yield "data: [DONE]\n\n"
return
# 返回流式响应
debug_log("🚀 启动 SSE 流式响应")
# 创建一个包装的生成器来追踪数据流
async def logged_stream():
chunk_count = 0
try:
debug_log("📤 开始向客户端流式传输数据...")
async for chunk in stream_response():
chunk_count += 1
# debug_log(f"📤 发送块[{chunk_count}]: {chunk[:200]}..." if len(chunk) > 200 else f" 📤 发送块[{chunk_count}]: {chunk}")
yield chunk
debug_log(f"✅ 流式传输完成,共发送 {chunk_count} 个数据块")
except Exception as e:
debug_log(f"❌ 流式传输中断: {e}")
raise
return StreamingResponse(
logged_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
},
)
except HTTPException:
raise
except Exception as e:
debug_log(f"❌ 处理请求时发生错误: {str(e)}")
import traceback
debug_log(f"❌ 错误堆栈: {traceback.format_exc()}")
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") |