gaila commited on
Commit
51cb8bb
·
verified ·
1 Parent(s): b093d94

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +15 -0
  2. claude_compat.py +250 -0
  3. docker-compose.yml +26 -0
  4. main.py +504 -0
  5. openai.py +1761 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Keep the image lean: no .pyc files; unbuffered stdout so container logs
# appear immediately.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

COPY . /app

# Runtime dependencies only; pip cache disabled to keep the layer small.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir fastapi uvicorn httpx httpcore

# OpenAI-compatible proxy port (matches docker-compose.yml mapping).
EXPOSE 30016

CMD ["python", "openai.py"]
claude_compat.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Anthropic Claude Messages API (/v1/messages) helpers.
2
+
3
+ Converts between Anthropic Claude native format and the internal
4
+ OpenAI-like format already used by openai.py.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import uuid
11
+ from typing import Any
12
+
13
+
14
+ # ── Message conversion ───────────────────────────────────────────────
15
+
16
+
17
def claude_messages_to_openai(system: Any, messages: list[dict]) -> list[dict]:
    """Translate Claude-native (system, messages) into OpenAI chat messages.

    Handles the Claude-specific shapes: the top-level ``system`` field,
    assistant turns carrying ``tool_use`` blocks, and user turns carrying
    ``tool_result`` blocks.  Anything else degrades to plain text via
    ``_extract_text``.
    """
    converted: list[dict] = []

    # Claude keeps the system prompt outside the messages array.
    if system:
        if isinstance(system, str):
            converted.append({"role": "system", "content": system})
        elif isinstance(system, list):
            chunks = [
                blk.get("text", "")
                for blk in system
                if isinstance(blk, dict) and blk.get("type") == "text"
            ]
            if chunks:
                converted.append({"role": "system", "content": "\n".join(chunks)})

    for message in messages:
        role = message.get("role", "user")
        content = message.get("content", "")
        blocks = content if isinstance(content, list) else None

        # Assistant turn made of content blocks (text and/or tool_use).
        if role == "assistant" and blocks is not None:
            texts: list[str] = []
            calls: list[dict] = []
            for blk in blocks:
                if not isinstance(blk, dict):
                    continue
                kind = blk.get("type")
                if kind == "text":
                    texts.append(blk.get("text", ""))
                elif kind == "tool_use":
                    calls.append({
                        "id": blk.get("id", f"call_{uuid.uuid4().hex[:24]}"),
                        "type": "function",
                        "function": {
                            "name": blk.get("name", ""),
                            "arguments": json.dumps(blk.get("input", {}), ensure_ascii=False),
                        },
                    })
            # OpenAI uses None content for pure tool-call turns.
            entry: dict = {"role": "assistant", "content": " ".join(texts).strip() or None}
            if calls:
                entry["tool_calls"] = calls
            converted.append(entry)
            continue

        # User turn that feeds tool results back: each tool_result becomes
        # a "tool" role message; interleaved text blocks stay "user".
        if role == "user" and blocks is not None and any(
            isinstance(b, dict) and b.get("type") == "tool_result" for b in blocks
        ):
            for blk in blocks:
                if not isinstance(blk, dict):
                    continue
                kind = blk.get("type")
                if kind == "tool_result":
                    result = blk.get("content", "")
                    if isinstance(result, str):
                        text = result
                    elif isinstance(result, list):
                        text = " ".join(
                            part.get("text", "")
                            for part in result
                            if isinstance(part, dict) and part.get("type") == "text"
                        )
                    else:
                        text = str(result)
                    converted.append({
                        "role": "tool",
                        "tool_call_id": blk.get("tool_use_id", ""),
                        "content": text,
                    })
                elif kind == "text":
                    converted.append({"role": "user", "content": blk.get("text", "")})
            continue

        # Default: flatten whatever is left into plain text.
        converted.append({"role": role, "content": _extract_text(content)})

    return converted
93
+
94
+
95
def claude_tools_to_openai(tools: list[dict] | None) -> list[dict] | None:
    """Map Anthropic tool definitions onto OpenAI function-tool entries.

    Returns ``None`` (rather than an empty list) when there is nothing to
    convert, so callers can simply omit the field.
    """
    if not tools:
        return None
    converted = []
    for tool in tools:
        if not isinstance(tool, dict):
            continue
        converted.append({
            "type": "function",
            "function": {
                "name": tool.get("name", ""),
                "description": tool.get("description", ""),
                # Anthropic's "input_schema" is OpenAI's "parameters".
                "parameters": tool.get("input_schema", {}),
            },
        })
    return converted or None
112
+
113
+
114
def claude_tool_choice_prompt(tool_choice: Any) -> str:
    """Render a Claude ``tool_choice`` constraint as a prompt suffix.

    Returns the empty string when no forcing instruction applies
    (non-dict input, ``auto`` mode, or a ``tool`` choice without a name).
    """
    if not isinstance(tool_choice, dict):
        return ""
    mode = tool_choice.get("type", "auto")
    if mode == "any":
        return "\nIMPORTANT: You MUST call at least one tool in your next response."
    if mode == "tool":
        forced = tool_choice.get("name", "")
        if forced:
            return f"\nIMPORTANT: You MUST call this tool: {forced}"
    return ""
125
+
126
+
127
+ # ── Response builders ────────────────────────────────────────────────
128
+
129
+
130
def make_claude_id() -> str:
    """Generate a fresh Anthropic-style message id (``msg_`` + 24 hex chars)."""
    suffix = uuid.uuid4().hex[:24]
    return "msg_" + suffix
132
+
133
+
134
def build_tool_call_blocks(
    tool_calls: list[dict],
) -> list[dict]:
    """Convert internal (OpenAI-style) tool call dicts to Claude ``tool_use`` blocks.

    Arguments that fail to parse as JSON degrade to an empty input dict
    rather than raising.  An OpenAI ``call_`` id *prefix* is rewritten to
    Anthropic's ``toolu_`` prefix; a missing/empty id gets a fresh one.
    """
    blocks = []
    for tc in tool_calls:
        fn = tc.get("function", {}) if isinstance(tc.get("function"), dict) else {}
        args_str = fn.get("arguments", "{}")
        try:
            args_obj = json.loads(args_str) if isinstance(args_str, str) else args_str
        except Exception:
            # Malformed arguments from upstream: fall back to no input.
            args_obj = {}
        block_id = tc.get("id") or f"toolu_{uuid.uuid4().hex[:20]}"
        # BUG FIX: the original used str.replace("call_", "toolu_"), which
        # rewrites EVERY occurrence of "call_" inside the id, not just the
        # prefix; only the prefix is the OpenAI convention.
        if block_id.startswith("call_"):
            block_id = "toolu_" + block_id[len("call_"):]
        blocks.append({
            "type": "tool_use",
            "id": block_id,
            "name": fn.get("name", ""),
            "input": args_obj,
        })
    return blocks
153
+
154
+
155
def build_non_stream_response(
    msg_id: str,
    model: str,
    reasoning_parts: list[str],
    answer_text: str,
    tool_calls: list[dict] | None,
    input_tokens: int,
    output_tokens: int,
) -> dict:
    """Assemble a full (non-streaming) Anthropic Messages API response body.

    Content order is: optional thinking block, text block, then any
    tool_use blocks.  A text-less, tool-less reply still gets one empty
    text block so the content array is never empty.
    """
    blocks: list[dict] = []
    if reasoning_parts:
        blocks.append({"type": "thinking", "thinking": "".join(reasoning_parts)})
    if answer_text:
        blocks.append({"type": "text", "text": answer_text})
    elif not tool_calls:
        blocks.append({"type": "text", "text": ""})
    if tool_calls:
        blocks.extend(build_tool_call_blocks(tool_calls))

    stop = "tool_use" if tool_calls else "end_turn"
    return {
        "id": msg_id,
        "type": "message",
        "role": "assistant",
        "content": blocks,
        "model": model,
        "stop_reason": stop,
        "stop_sequence": None,
        "usage": {"input_tokens": input_tokens, "output_tokens": output_tokens},
    }
185
+
186
+
187
+ # ── SSE event helpers ────────────────────────────────────────────────
188
+
189
+
190
def sse(event: str, data: dict) -> str:
    """Serialize one server-sent-event frame: event line + JSON data line."""
    payload = json.dumps(data, ensure_ascii=False)
    return f"event: {event}\ndata: {payload}\n\n"


def sse_message_start(msg_id: str, model: str, input_tokens: int) -> str:
    """Opening frame: an empty assistant message shell with input token count."""
    message = {
        "id": msg_id,
        "type": "message",
        "role": "assistant",
        "content": [],
        "model": model,
        "stop_reason": None,
        "stop_sequence": None,
        "usage": {"input_tokens": input_tokens, "output_tokens": 0},
    }
    return sse("message_start", {"type": "message_start", "message": message})


def sse_ping() -> str:
    """Keep-alive frame."""
    return sse("ping", {"type": "ping"})


def sse_content_block_start(index: int, block: dict) -> str:
    """Announce a new content block at *index*."""
    payload = {"type": "content_block_start", "index": index, "content_block": block}
    return sse("content_block_start", payload)


def sse_content_block_delta(index: int, delta: dict) -> str:
    """Incremental update for the content block at *index*."""
    payload = {"type": "content_block_delta", "index": index, "delta": delta}
    return sse("content_block_delta", payload)


def sse_content_block_stop(index: int) -> str:
    """Close the content block at *index*."""
    return sse("content_block_stop", {"type": "content_block_stop", "index": index})


def sse_message_delta(stop_reason: str, output_tokens: int) -> str:
    """Trailing metadata frame: stop reason plus output token usage."""
    payload = {
        "type": "message_delta",
        "delta": {"stop_reason": stop_reason, "stop_sequence": None},
        "usage": {"output_tokens": output_tokens},
    }
    return sse("message_delta", payload)


def sse_message_stop() -> str:
    """Final frame of a message stream."""
    return sse("message_stop", {"type": "message_stop"})


def sse_error(error_type: str, message: str) -> str:
    """Error frame in Anthropic's error envelope shape."""
    detail = {"type": error_type, "message": message}
    return sse("error", {"type": "error", "error": detail})
236
+
237
+
238
+ # ── Private ──────────────────────────────────────────────────────────
239
+
240
+
241
+ def _extract_text(content: object) -> str:
242
+ if isinstance(content, str):
243
+ return content
244
+ if isinstance(content, list):
245
+ return " ".join(
246
+ str(b.get("text", ""))
247
+ for b in content
248
+ if isinstance(b, dict) and b.get("type") == "text"
249
+ ).strip()
250
+ return str(content) if content else ""
docker-compose.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
services:
  zai-openai:
    build:
      context: .
      dockerfile: Dockerfile
    image: zai-openai:local
    container_name: zai-openai
    working_dir: /app
    restart: unless-stopped
    environment:
      # Logging / debugging
      - LOG_LEVEL=DEBUG
      - HTTP_DEBUG=0
      # Upstream behaviour
      - ENABLE_THINKING=1
      - UPSTREAM_FIRST_EVENT_TIMEOUT=60
      - UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX=2
      - TOKEN_MAX_AGE=480
      # Guest-account pool sizing
      - POOL_MIN_SIZE=5
      - POOL_MAX_SIZE=24
      - POOL_TARGET_INFLIGHT_PER_ACCOUNT=2
      - POOL_MAINTAIN_INTERVAL=10
      - POOL_SCALE_DOWN_IDLE_ROUNDS=3
    volumes:
      # Live-mount the source so edits don't need an image rebuild.
      - ./:/app
    command: ["python", "openai.py"]
    ports:
      - "30016:30016"
main.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """chat.z.ai reverse-engineered Python client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import base64
7
+ import hashlib
8
+ import hmac
9
+ import json
10
+ import logging
11
+ import os
12
+ import time
13
+ import uuid
14
+ from datetime import datetime, timezone, timedelta
15
+ from urllib.parse import urlencode
16
+
17
+ import httpx
18
+
19
+ logger = logging.getLogger("zai.client")
20
+
21
+ BASE_URL = "https://chat.z.ai"
22
+ HMAC_SECRET = "key-@@@@)))()((9))-xxxx&&&%%%%%"
23
+ FE_VERSION = "prod-fe-1.0.231"
24
+ CLIENT_VERSION = "0.0.1"
25
+ DEFAULT_MODEL = "glm-5"
26
+ ENABLE_THINKING_DEFAULT = os.getenv("ENABLE_THINKING", "1") == "1"
27
+ USER_AGENT = (
28
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
29
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
30
+ "Chrome/144.0.0.0 Safari/537.36"
31
+ )
32
+
33
+
34
class ZaiClient:
    """Async client for the chat.z.ai private web API (guest sessions)."""

    def __init__(self) -> None:
        # Split timeouts: fail fast on connect, allow long reads for
        # streamed generations.
        timeouts = httpx.Timeout(
            connect=5.0,   # connection establishment
            read=180.0,    # long streaming responses (3 min)
            write=10.0,
            pool=5.0,      # waiting for a pooled connection
        )
        self.client = httpx.AsyncClient(
            base_url=BASE_URL,
            timeout=timeouts,
            headers={
                "User-Agent": USER_AGENT,
                "Accept-Language": "zh-CN",
                "Referer": f"{BASE_URL}/",
                "Origin": BASE_URL,
            },
            # Bounded pool so connections can't leak without limit.
            limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
        )
        self.token: str | None = None
        self.user_id: str | None = None
        self.username: str | None = None

    async def close(self) -> None:
        await self.client.aclose()

    def _json_headers(self) -> dict:
        """Common JSON request headers, with bearer auth when a token is held."""
        headers = {"Content-Type": "application/json", "Accept": "application/json"}
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"
        return headers

    # ── auth ────────────────────────────────────────────────────────

    async def auth_as_guest(self) -> dict:
        """GET /api/v1/auths/ — creates a guest session and returns user info."""
        resp = await self.client.get(
            "/api/v1/auths/",
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
        data = resp.json()
        self.token = data["token"]
        self.user_id = data["id"]
        # Fall back to the mailbox local-part when no display name exists.
        self.username = data.get("name") or data.get("email", "").split("@")[0]
        return data

    # ── models ──────────────────────────────────────────────────────

    async def get_models(self) -> list:
        """GET /api/models — returns available model list."""
        resp = await self.client.get("/api/models", headers=self._json_headers())
        resp.raise_for_status()
        return resp.json()

    # ── chat CRUD ───────────────────────────────────────────────────

    async def create_chat(
        self,
        user_message: str,
        model: str = DEFAULT_MODEL,
        *,
        enable_thinking: bool | None = None,
    ) -> dict:
        """POST /api/v1/chats/new — creates a new chat session.

        The content placed in history only initializes the session; the
        real conversation is sent through chat_completions later.  Long
        prompts are truncated here to avoid 400s from the upstream API.
        """
        # Huge prompts (tool definitions, multi-turn history, big system
        # prompts) make the chat-creation endpoint reject with 400.
        MAX_INIT_CONTENT_LEN = 500
        init_content = user_message
        if len(init_content) > MAX_INIT_CONTENT_LEN:
            init_content = init_content[:MAX_INIT_CONTENT_LEN] + "..."

        msg_id = str(uuid.uuid4())
        ts = int(time.time())
        thinking = (
            ENABLE_THINKING_DEFAULT
            if enable_thinking is None
            else bool(enable_thinking)
        )
        body = {
            "chat": {
                "id": "",
                "title": "新聊天",
                "models": [model],
                "params": {},
                "history": {
                    "messages": {
                        msg_id: {
                            "id": msg_id,
                            "parentId": None,
                            "childrenIds": [],
                            "role": "user",
                            "content": init_content,
                            "timestamp": ts,
                            "models": [model],
                        }
                    },
                    "currentId": msg_id,
                },
                "tags": [],
                "flags": [],
                "features": [
                    {
                        "type": "tool_selector",
                        "server": "tool_selector_h",
                        "status": "hidden",
                    }
                ],
                "mcp_servers": [],
                "enable_thinking": thinking,
                "auto_web_search": False,
                "message_version": 1,
                "extra": {},
                "timestamp": int(time.time() * 1000),
            }
        }
        resp = await self.client.post(
            "/api/v1/chats/new",
            headers=self._json_headers(),
            json=body,
        )
        if resp.status_code != 200:
            # Log enough context to diagnose prompt-length-related 400s.
            logger.warning(
                "create_chat failed: status=%d body=%s (prompt_len=%d, truncated_len=%d)",
                resp.status_code, resp.text[:500], len(user_message), len(init_content),
            )
        resp.raise_for_status()
        return resp.json()

    # ── chat cleanup ─────────────────────────────────────────────────

    async def delete_chat(self, chat_id: str) -> bool:
        """DELETE /api/v1/chats/{chat_id} — deletes a chat session.

        Returns True if deleted successfully, False otherwise.  Called
        after each request to free up upstream concurrency slots; never
        raises (cleanup is best-effort).
        """
        try:
            resp = await self.client.delete(
                f"/api/v1/chats/{chat_id}",
                headers=self._json_headers(),
            )
            if resp.status_code == 200:
                return True
            logger.debug(
                "delete_chat %s: status=%d body=%s",
                chat_id, resp.status_code, resp.text[:200],
            )
            return False
        except Exception as e:
            logger.debug("delete_chat %s failed: %s", chat_id, e)
            return False

    async def delete_all_chats(self) -> bool:
        """DELETE /api/v1/chats/ — deletes all chats for the current user.

        Useful for cleaning up accumulated chats when hitting concurrency
        limits.  Best-effort: never raises.
        """
        try:
            resp = await self.client.delete(
                "/api/v1/chats/",
                headers=self._json_headers(),
            )
            if resp.status_code == 200:
                logger.info("delete_all_chats: success")
                return True
            logger.warning(
                "delete_all_chats: status=%d body=%s",
                resp.status_code, resp.text[:200],
            )
            return False
        except Exception as e:
            logger.warning("delete_all_chats failed: %s", e)
            return False

    # ── signature ───────────────────────────────────────────────────

    @staticmethod
    def _generate_signature(
        sorted_payload: str, prompt: str, timestamp: str
    ) -> str:
        """Two-layer HMAC-SHA256 matching DLHfQWwv.js.

        1. b64_prompt = base64(utf8(prompt))
        2. message = "{sorted_payload}|{b64_prompt}|{timestamp}"
        3. time_bucket = floor(int(timestamp) / 300_000)
        4. derived_key = HMAC-SHA256(HMAC_SECRET, str(time_bucket)) → hex string
        5. signature = HMAC-SHA256(derived_key_hex_bytes, message) → hex
        """
        encoded_prompt = base64.b64encode(prompt.encode("utf-8")).decode("ascii")
        message = f"{sorted_payload}|{encoded_prompt}|{timestamp}"
        # Key rotates every 5 minutes (timestamp is in milliseconds).
        bucket = int(timestamp) // (5 * 60 * 1000)

        derived_key = hmac.new(
            HMAC_SECRET.encode("utf-8"),
            str(bucket).encode("utf-8"),
            hashlib.sha256,
        ).hexdigest()

        return hmac.new(
            derived_key.encode("utf-8"),
            message.encode("utf-8"),
            hashlib.sha256,
        ).hexdigest()

    def _build_query_and_signature(
        self, prompt: str, chat_id: str
    ) -> tuple[str, str]:
        """Build the full URL query string and X-Signature header.

        Returns (full_query_string, signature).
        """
        timestamp_ms = str(int(time.time() * 1000))
        request_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc)

        # Core params — the only ones covered by sortedPayload.
        core = {
            "timestamp": timestamp_ms,
            "requestId": request_id,
            "user_id": self.user_id,
        }

        # sortedPayload: Object.entries(core).sort(by key).join(",")
        pairs = [f"{key},{core[key]}" for key in sorted(core)]
        sorted_payload = ",".join(pairs)

        signature = self._generate_signature(sorted_payload, prompt, timestamp_ms)

        # Browser/device fingerprint params mimicking a desktop Chrome.
        extra = {
            "version": CLIENT_VERSION,
            "platform": "web",
            "token": self.token or "",
            "user_agent": USER_AGENT,
            "language": "zh-CN",
            "languages": "zh-CN",
            "timezone": "Asia/Shanghai",
            "cookie_enabled": "true",
            "screen_width": "1920",
            "screen_height": "1080",
            "screen_resolution": "1920x1080",
            "viewport_height": "919",
            "viewport_width": "944",
            "viewport_size": "944x919",
            "color_depth": "24",
            "pixel_ratio": "1.25",
            "current_url": f"{BASE_URL}/c/{chat_id}",
            "pathname": f"/c/{chat_id}",
            "search": "",
            "hash": "",
            "host": "chat.z.ai",
            "hostname": "chat.z.ai",
            "protocol": "https:",
            "referrer": "",
            "title": "Z.ai - Free AI Chatbot & Agent powered by GLM-5 & GLM-4.7",
            "timezone_offset": "-480",
            "local_time": now.strftime("%Y-%m-%dT%H:%M:%S.")
            + f"{now.microsecond // 1000:03d}Z",
            "utc_time": now.strftime("%a, %d %b %Y %H:%M:%S GMT"),
            "is_mobile": "false",
            "is_touch": "false",
            "max_touch_points": "10",
            "browser_name": "Chrome",
            "os_name": "Windows",
            "signature_timestamp": timestamp_ms,
        }

        query_string = urlencode({**core, **extra})
        return query_string, signature

    # ── chat completions (SSE) ──────────────────────────────────────

    async def chat_completions(
        self,
        chat_id: str,
        messages: list[dict],
        prompt: str,
        *,
        model: str = DEFAULT_MODEL,
        parent_message_id: str | None = None,
        tools: list[dict] | None = None,
        enable_thinking: bool | None = None,
    ):
        """POST /api/v2/chat/completions — streams SSE response.

        Yields the full event ``data`` dict for each SSE frame.
        """
        query_string, signature = self._build_query_and_signature(prompt, chat_id)

        msg_id = str(uuid.uuid4())
        user_msg_id = str(uuid.uuid4())

        # Template variables are rendered in UTC+8 (site's home timezone).
        now = datetime.now(timezone(timedelta(hours=8)))
        variables = {
            "{{USER_NAME}}": self.username or "Guest",
            "{{USER_LOCATION}}": "Unknown",
            "{{CURRENT_DATETIME}}": now.strftime("%Y-%m-%d %H:%M:%S"),
            "{{CURRENT_DATE}}": now.strftime("%Y-%m-%d"),
            "{{CURRENT_TIME}}": now.strftime("%H:%M:%S"),
            "{{CURRENT_WEEKDAY}}": now.strftime("%A"),
            "{{CURRENT_TIMEZONE}}": "Asia/Shanghai",
            "{{USER_LANGUAGE}}": "zh-CN",
        }

        body = {
            "stream": True,
            "model": model,
            "messages": messages,
            "signature_prompt": prompt,
            "params": {},
            "extra": {},
            "features": {
                "image_generation": False,
                "web_search": False,
                "auto_web_search": False,
                "preview_mode": True,
                "flags": [],
                "enable_thinking": (
                    ENABLE_THINKING_DEFAULT
                    if enable_thinking is None
                    else bool(enable_thinking)
                ),
            },
            "variables": variables,
            "chat_id": chat_id,
            "id": msg_id,
            "current_user_message_id": user_msg_id,
            "current_user_message_parent_id": parent_message_id,
            "background_tasks": {
                "title_generation": True,
                "tags_generation": True,
            },
        }
        if tools:
            body["tools"] = tools

        headers = {
            "Content-Type": "application/json",
            "Accept": "*/*",
            "Accept-Language": "zh-CN",
            "X-FE-Version": FE_VERSION,
            "X-Signature": signature,
        }
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"

        url = f"{BASE_URL}/api/v2/chat/completions?{query_string}"

        async with self.client.stream(
            "POST", url, headers=headers, json=body,
        ) as resp:
            if resp.status_code != 200:
                error_body = (await resp.aread()).decode("utf-8", errors="replace")
                raise httpx.HTTPStatusError(
                    f"chat/completions {resp.status_code}: {error_body[:500]}",
                    request=resp.request,
                    response=resp,
                )
            async for line in resp.aiter_lines():
                # Only SSE data frames matter; everything else is noise.
                if not line.startswith("data: "):
                    continue
                raw = line[6:]
                if raw.strip() == "[DONE]":
                    return
                try:
                    event = json.loads(raw)
                except json.JSONDecodeError:
                    continue
                data = event.get("data", {})
                yield data
                if data.get("done"):
                    return
439
+
440
+
441
async def main() -> None:
    """Smoke-test driver: guest auth → model list → chat creation → streamed reply."""
    client = ZaiClient()
    try:
        # 1. Authenticate as guest
        print("[1] Authenticating as guest...")
        auth = await client.auth_as_guest()
        print(f" user_id : {auth['id']}")
        print(f" email : {auth.get('email', 'N/A')}")
        print(f" token : {auth['token'][:40]}...")

        # 2. Fetch models (response shape varies: dict-with-data or bare list)
        print("\n[2] Fetching models...")
        models_resp = await client.get_models()
        if isinstance(models_resp, dict) and "data" in models_resp:
            names = [m.get("id", m.get("name", "?")) for m in models_resp["data"]]
        elif isinstance(models_resp, list):
            names = [m.get("id", m.get("name", "?")) for m in models_resp]
        else:
            names = [str(models_resp)[:80]]
        print(f" models : {', '.join(names[:10])}")

        # 3. Create chat
        user_message = "Hello"
        print(f"\n[3] Creating chat with first message: {user_message!r}")
        chat = await client.create_chat(user_message)
        chat_id = chat["id"]
        print(f" chat_id : {chat_id}")

        # 4. Stream chat completions
        # (The original had a duplicate, unused `messages` assignment before
        # step 4 — removed; this single assignment is the one actually used.)
        print(f"\n[4] Streaming chat completions (model={DEFAULT_MODEL})...\n")
        messages = [{"role": "user", "content": user_message}]

        thinking_started = False
        answer_started = False
        async for data in client.chat_completions(
            chat_id=chat_id,
            messages=messages,
            prompt=user_message,
        ):
            phase = data.get("phase", "")
            delta = data.get("delta_content", "")
            if phase == "thinking":
                if not thinking_started:
                    print("[thinking] ", end="", flush=True)
                    thinking_started = True
                print(delta, end="", flush=True)
            elif phase == "answer":
                if not answer_started:
                    if thinking_started:
                        print("\n")
                    print("[answer] ", end="", flush=True)
                    answer_started = True
                print(delta, end="", flush=True)
            elif phase == "done":
                break
        print("\n\n[done]")

    finally:
        # Always release the underlying HTTP connection pool.
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())
openai.py ADDED
@@ -0,0 +1,1761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenAI-compatible proxy server for chat.z.ai + Toolify-style function calling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import math
9
+ import os
10
+ import re
11
+ import secrets
12
+ import string
13
+ import time
14
+ import uuid
15
+ from contextlib import asynccontextmanager
16
+ from typing import Any
17
+
18
+ import httpcore
19
+ import httpx
20
+ import uvicorn
21
+ from fastapi import FastAPI, Request
22
+ from fastapi.responses import JSONResponse, StreamingResponse
23
+
24
+ from main import ZaiClient
25
+ from claude_compat import (
26
+ claude_messages_to_openai,
27
+ claude_tools_to_openai,
28
+ claude_tool_choice_prompt,
29
+ make_claude_id,
30
+ build_tool_call_blocks,
31
+ build_non_stream_response,
32
+ sse_message_start,
33
+ sse_ping,
34
+ sse_content_block_start,
35
+ sse_content_block_delta,
36
+ sse_content_block_stop,
37
+ sse_message_delta,
38
+ sse_message_stop,
39
+ sse_error,
40
+ )
41
+
42
+ # ── Logging ──────────────────────────────────────────────────────────
43
+
44
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
45
+ HTTP_DEBUG = os.getenv("HTTP_DEBUG", "0") == "1"
46
+ logging.basicConfig(
47
+ level=getattr(logging, LOG_LEVEL, logging.INFO),
48
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
49
+ )
50
+ logger = logging.getLogger("zai.openai")
51
+ if not HTTP_DEBUG:
52
+ logging.getLogger("httpx").setLevel(logging.WARNING)
53
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
54
+
55
+
56
# ── Multi-Account Pool ───────────────────────────────────────────────

# Baseline pool size; also the default for POOL_MIN_SIZE.
POOL_SIZE = int(os.getenv("POOL_SIZE", "3"))
# Hard floor / ceiling on the number of pooled guest accounts.
POOL_MIN_SIZE = max(1, int(os.getenv("POOL_MIN_SIZE", str(POOL_SIZE))))
POOL_MAX_SIZE = max(POOL_MIN_SIZE, int(os.getenv("POOL_MAX_SIZE", str(max(POOL_MIN_SIZE, POOL_MIN_SIZE * 3)))))
# Desired concurrent requests per account; drives load-based scaling.
POOL_TARGET_INFLIGHT_PER_ACCOUNT = max(1, int(os.getenv("POOL_TARGET_INFLIGHT_PER_ACCOUNT", "2")))
# Seconds between background maintenance rounds (lower bound 5s).
POOL_MAINTAIN_INTERVAL = max(5, int(os.getenv("POOL_MAINTAIN_INTERVAL", "10")))
# Consecutive idle maintenance rounds required before scaling down.
POOL_SCALE_DOWN_IDLE_ROUNDS = max(1, int(os.getenv("POOL_SCALE_DOWN_IDLE_ROUNDS", "3")))
TOKEN_MAX_AGE = int(os.getenv("TOKEN_MAX_AGE", "480"))  # seconds before a guest token is considered stale
# Max seconds to wait for the FIRST upstream event, and how many times to
# retry on an empty/slow first event.
UPSTREAM_FIRST_EVENT_TIMEOUT = max(1.0, float(os.getenv("UPSTREAM_FIRST_EVENT_TIMEOUT", "60")))
UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX = max(0, int(os.getenv("UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX", "2")))
67
+
68
+
69
def _compute_pool_target_by_load(in_flight: int) -> int:
    """Estimate the desired pool size for the current concurrency load."""
    if in_flight <= 0:
        return POOL_MIN_SIZE
    # One extra account of headroom so requests don't queue when every
    # account is already saturated.
    wanted = 1 + math.ceil(in_flight / POOL_TARGET_INFLIGHT_PER_ACCOUNT)
    floored = max(POOL_MIN_SIZE, wanted)
    return min(POOL_MAX_SIZE, floored)
76
+
77
+
78
class AccountInfo:
    """A single guest auth session."""

    __slots__ = ("token", "user_id", "username", "created_at", "active", "valid")

    def __init__(self, token: str, user_id: str, username: str) -> None:
        # Credentials for one upstream guest account.
        self.token = token
        self.user_id = user_id
        self.username = username
        # Bookkeeping used by the pool for rotation and load balancing.
        self.created_at = time.time()
        self.active = 0  # number of in-flight requests on this account
        self.valid = True  # cleared once the account starts failing

    def snapshot(self) -> dict[str, str]:
        """Return the auth fields as a plain dict (safe to hand out)."""
        return {
            "token": self.token,
            "user_id": self.user_id,
            "username": self.username,
        }

    @property
    def age(self) -> float:
        """Seconds elapsed since this account was created."""
        return time.time() - self.created_at
96
+
97
+
98
class SessionPool:
    """Pool of guest accounts for concurrent, seamless use.

    Accounts are created on demand, load-balanced by in-flight request
    count, aged out after TOKEN_MAX_AGE seconds, and replaced when they
    fail.  A background task (``_maintain``) grows and shrinks the pool
    between POOL_MIN_SIZE and POOL_MAX_SIZE based on observed load.
    """

    def __init__(self) -> None:
        # Guards structural mutation of self._accounts.
        self._lock = asyncio.Lock()
        self._accounts: list[AccountInfo] = []
        # Handle for the background _maintain() loop (set in initialize()).
        self._bg_task: asyncio.Task | None = None
        # Set by other methods to wake _maintain() before its timer fires.
        self._maintain_event = asyncio.Event()
        # Requested pool size; acts only as temporary upward pressure.
        self._target_size = POOL_MIN_SIZE
        # Consecutive idle maintenance rounds (gates scale-down).
        self._idle_rounds = 0

    # ── internal ─────────────────────────────────────────────────────

    def _valid_accounts(self, *, include_expired: bool = False) -> list[AccountInfo]:
        """Return usable accounts; optionally include ones past TOKEN_MAX_AGE."""
        if include_expired:
            return [a for a in self._accounts if a.valid]
        return [a for a in self._accounts if a.valid and a.age < TOKEN_MAX_AGE]

    def _raise_target_size(self, target_size: int) -> None:
        """Request a (clamped) larger pool and wake the maintenance loop."""
        clamped = min(POOL_MAX_SIZE, max(POOL_MIN_SIZE, target_size))
        if clamped > self._target_size:
            self._target_size = clamped
            self._maintain_event.set()

    async def _new_account(self) -> AccountInfo:
        """Create a fresh upstream guest account (client is always closed)."""
        c = ZaiClient()
        try:
            d = await c.auth_as_guest()
            acc = AccountInfo(d["token"], d["id"], d.get("name") or d.get("email", "").split("@")[0])
            logger.info("Pool: +account uid=%s (total=%d)", acc.user_id, len(self._accounts) + 1)
            return acc
        finally:
            await c.close()

    async def _del_account(self, acc: AccountInfo) -> None:
        """Best-effort upstream cleanup of an account's chats; never raises."""
        try:
            c = ZaiClient()
            c.token, c.user_id, c.username = acc.token, acc.user_id, acc.username
            await c.delete_all_chats()
            await c.close()
        except Exception:
            pass

    async def _maintain(self) -> None:
        """Background maintenance: scale with load and purge dead accounts."""
        while True:
            try:
                # Wait for either the interval or an explicit wake-up.
                try:
                    await asyncio.wait_for(self._maintain_event.wait(), timeout=POOL_MAINTAIN_INTERVAL)
                except asyncio.TimeoutError:
                    pass
                self._maintain_event.clear()

                to_delete: list[AccountInfo] = []
                to_add = 0
                cycle_target = POOL_MIN_SIZE

                async with self._lock:
                    # Remove invalid/expired accounts that are no longer busy.
                    dead = [a for a in self._accounts if (not a.valid or a.age > TOKEN_MAX_AGE) and a.active == 0]
                    for a in dead:
                        self._accounts.remove(a)
                        to_delete.append(a)

                    valid = self._valid_accounts()
                    valid_count = len(valid)
                    in_flight = sum(a.active for a in valid)

                    load_target = _compute_pool_target_by_load(in_flight)
                    desired = min(POOL_MAX_SIZE, max(POOL_MIN_SIZE, max(load_target, self._target_size)))

                    # Shrink only after consecutive idle rounds to avoid
                    # flapping on short load dips.
                    if in_flight == 0 and valid_count > desired:
                        self._idle_rounds += 1
                    else:
                        self._idle_rounds = 0

                    if self._idle_rounds >= POOL_SCALE_DOWN_IDLE_ROUNDS and valid_count > desired:
                        # Retire the oldest idle accounts first.
                        removable = [a for a in valid if a.active == 0]
                        removable.sort(key=lambda a: a.created_at)
                        shrink_by = min(valid_count - desired, len(removable))
                        for a in removable[:shrink_by]:
                            self._accounts.remove(a)
                            to_delete.append(a)
                        valid_count -= shrink_by
                        if valid_count <= desired:
                            self._idle_rounds = 0
                    else:
                        # Not shrinking: keep at least the current valid count.
                        desired = max(desired, valid_count)

                    cycle_target = desired
                    # _target_size is only a transient "raise" request; the
                    # next round recomputes purely from observed load.
                    self._target_size = load_target
                    to_add = max(0, desired - valid_count)

                # Upstream cleanup runs outside the lock, fire-and-forget.
                for a in to_delete:
                    asyncio.create_task(self._del_account(a))

                for _ in range(to_add):
                    try:
                        new_acc = await self._new_account()
                    except Exception as e:
                        logger.warning("Pool maintain add failed: %s", e)
                        break

                    async with self._lock:
                        # Re-check under the lock: another path may have
                        # filled the pool while we were creating.
                        valid_now = len(self._valid_accounts())
                        if valid_now >= cycle_target:
                            asyncio.create_task(self._del_account(new_acc))
                            continue
                        self._accounts.append(new_acc)
            except asyncio.CancelledError:
                return
            except Exception as e:
                logger.warning("Pool maintain loop error: %s", e)

    # ── public API ───────────────────────────────────────────────────

    async def initialize(self) -> None:
        """Create the initial accounts and start the maintenance task."""
        self._target_size = POOL_MIN_SIZE
        async with self._lock:
            results = await asyncio.gather(
                *[self._new_account() for _ in range(POOL_MIN_SIZE)],
                return_exceptions=True,
            )
            for r in results:
                if isinstance(r, AccountInfo):
                    self._accounts.append(r)
                else:
                    logger.warning("Pool init failed: %s", r)
            if not self._accounts:
                # All parallel creations failed; one last (raising) attempt.
                self._accounts.append(await self._new_account())
        logger.info("Pool: ready with %d accounts", len(self._accounts))
        self._bg_task = asyncio.create_task(self._maintain())
        self._maintain_event.set()

    async def close(self) -> None:
        """Stop maintenance and delete every pooled account."""
        if self._bg_task:
            self._bg_task.cancel()
            try:
                await self._bg_task
            except asyncio.CancelledError:
                pass
        for a in list(self._accounts):
            await self._del_account(a)
        self._accounts.clear()

    async def acquire(self) -> AccountInfo:
        """Get the least-busy valid account (creates one if needed)."""
        good = self._valid_accounts()
        if not good:
            async with self._lock:
                good = self._valid_accounts()
                if not good:
                    acc = await self._new_account()
                    self._accounts.append(acc)
                    good = [acc]
        acc = min(good, key=lambda a: a.active)
        acc.active += 1
        if acc.active >= POOL_TARGET_INFLIGHT_PER_ACCOUNT:
            # Saturated: ask maintenance for one more account.
            self._raise_target_size(len(good) + 1)
        return acc

    def release(self, acc: AccountInfo) -> None:
        """Return an account acquired via acquire()/get_auth_snapshot()."""
        acc.active = max(0, acc.active - 1)
        if acc.active == 0:
            self._maintain_event.set()

    async def report_failure(self, acc: AccountInfo) -> None:
        """Mark account invalid, schedule cleanup, add replacement."""
        acc.valid = False
        acc.active = max(0, acc.active - 1)
        self._raise_target_size(len(self._valid_accounts()) + 1)
        asyncio.create_task(self._del_account(acc))
        try:
            new = await self._new_account()
            async with self._lock:
                if len(self._valid_accounts(include_expired=True)) < POOL_MAX_SIZE:
                    self._accounts.append(new)
                else:
                    asyncio.create_task(self._del_account(new))
        except Exception as e:
            logger.warning("Pool replace failed: %s", e)
        self._maintain_event.set()

    async def get_models(self) -> list | dict:
        """Fetch the upstream model list using a pooled account."""
        acc = await self.acquire()
        c = ZaiClient()
        try:
            c.token, c.user_id, c.username = acc.token, acc.user_id, acc.username
            return await c.get_models()
        finally:
            self.release(acc)
            await c.close()

    # ── compat methods (called by request handlers) ──────────────────

    async def ensure_auth(self) -> None:
        """Ensure at least one valid account exists in the pool."""
        good = self._valid_accounts(include_expired=True)
        if not good:
            async with self._lock:
                good = self._valid_accounts(include_expired=True)
                if not good:
                    self._accounts.append(await self._new_account())
        if len(good) < POOL_MIN_SIZE:
            self._raise_target_size(POOL_MIN_SIZE)

    def get_auth_snapshot(self) -> dict[str, str]:
        """Get auth snapshot from the least-busy valid account.

        Increments the account's active count; the caller must balance it
        via _release_by_user_id()/refresh_auth().
        """
        good = self._valid_accounts()
        if not good:
            # Fall back to stale-but-valid accounts rather than failing.
            good = self._valid_accounts(include_expired=True)
        if not good:
            raise RuntimeError("No valid accounts in pool")
        acc = min(good, key=lambda a: a.active)
        acc.active += 1
        if acc.active >= POOL_TARGET_INFLIGHT_PER_ACCOUNT:
            self._raise_target_size(len(good) + 1)
        return acc.snapshot()

    def _release_by_user_id(self, user_id: str) -> None:
        """Release (decrement active) for the account matching user_id."""
        for a in self._accounts:
            if a.user_id == user_id:
                a.active = max(0, a.active - 1)
                if a.active == 0:
                    self._maintain_event.set()
                return

    async def refresh_auth(self, failed_user_id: str | None = None) -> None:
        """Invalidate the failed account (if given) and create a fresh one."""
        if failed_user_id:
            for a in self._accounts:
                if a.user_id == failed_user_id:
                    a.valid = False
                    a.active = max(0, a.active - 1)
                    asyncio.create_task(self._del_account(a))
                    logger.info("SessionPool: invalidated failed account uid=%s", failed_user_id)
                    break
        self._raise_target_size(len(self._valid_accounts()) + 1)
        try:
            acc = await self._new_account()
            async with self._lock:
                if len(self._valid_accounts(include_expired=True)) < POOL_MAX_SIZE:
                    self._accounts.append(acc)
                else:
                    asyncio.create_task(self._del_account(acc))
            logger.info("SessionPool: auth refreshed, new user_id=%s", acc.user_id)
        except Exception as e:
            logger.warning("SessionPool: refresh_auth failed: %s", e)
        self._maintain_event.set()

    async def cleanup_chats(self) -> None:
        """Clean up chats for idle accounts to free concurrency slots."""
        for a in list(self._accounts):
            if a.valid and a.active == 0:
                try:
                    c = ZaiClient()
                    c.token, c.user_id, c.username = a.token, a.user_id, a.username
                    await c.delete_all_chats()
                    await c.close()
                except Exception:
                    pass
362
+
363
+
364
# Process-wide account pool shared by all request handlers.
pool = SessionPool()
365
+
366
+
367
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """App lifespan: warm the account pool on startup, drain it on shutdown."""
    await pool.initialize()
    yield
    await pool.close()
372
+
373
+
374
# FastAPI app wired to the pool's startup/shutdown lifecycle.
app = FastAPI(lifespan=lifespan)
375
+
376
+
377
+ # ── Toolify-style helpers ─────────���──────────────────────────────────
378
+
379
+
380
+ def _generate_trigger_signal() -> str:
381
+ chars = string.ascii_letters + string.digits
382
+ rand = "".join(secrets.choice(chars) for _ in range(4))
383
+ return f"<Function_{rand}_Start/>"
384
+
385
+
386
# One random trigger signal per process; it is embedded into prompts and
# parsed back out of model output to detect tool calls.
GLOBAL_TRIGGER_SIGNAL = _generate_trigger_signal()
387
+
388
+
389
+ def _extract_text_from_content(content: object) -> str:
390
+ if isinstance(content, str):
391
+ return content
392
+ if isinstance(content, list):
393
+ parts: list[str] = []
394
+ for p in content:
395
+ if isinstance(p, dict) and p.get("type") == "text":
396
+ parts.append(str(p.get("text", "")))
397
+ return " ".join(parts).strip()
398
+ if content is None:
399
+ return ""
400
+ try:
401
+ return json.dumps(content, ensure_ascii=False)
402
+ except Exception:
403
+ return str(content)
404
+
405
+
406
+ def _build_tool_call_index_from_messages(messages: list[dict]) -> dict[str, dict[str, str]]:
407
+ idx: dict[str, dict[str, str]] = {}
408
+ for msg in messages:
409
+ if msg.get("role") != "assistant":
410
+ continue
411
+ tcs = msg.get("tool_calls")
412
+ if not isinstance(tcs, list):
413
+ continue
414
+ for tc in tcs:
415
+ if not isinstance(tc, dict):
416
+ continue
417
+ tc_id = tc.get("id")
418
+ fn = tc.get("function", {}) if isinstance(tc.get("function"), dict) else {}
419
+ name = str(fn.get("name", ""))
420
+ args = fn.get("arguments", "{}")
421
+ if not isinstance(args, str):
422
+ try:
423
+ args = json.dumps(args, ensure_ascii=False)
424
+ except Exception:
425
+ args = "{}"
426
+ if isinstance(tc_id, str) and name:
427
+ idx[tc_id] = {"name": name, "arguments": args}
428
+ return idx
429
+
430
+
431
+ def _format_tool_result_for_ai(tool_name: str, tool_arguments: str, result_content: str) -> str:
432
+ return (
433
+ "<tool_execution_result>\n"
434
+ f"<tool_name>{tool_name}</tool_name>\n"
435
+ f"<tool_arguments>{tool_arguments}</tool_arguments>\n"
436
+ f"<tool_output>{result_content}</tool_output>\n"
437
+ "</tool_execution_result>"
438
+ )
439
+
440
+
441
+ def _format_assistant_tool_calls_for_ai(tool_calls: list[dict], trigger_signal: str) -> str:
442
+ blocks: list[str] = []
443
+ for tc in tool_calls:
444
+ if not isinstance(tc, dict):
445
+ continue
446
+ fn = tc.get("function", {}) if isinstance(tc.get("function"), dict) else {}
447
+ name = str(fn.get("name", "")).strip()
448
+ if not name:
449
+ continue
450
+ args = fn.get("arguments", "{}")
451
+ if isinstance(args, str):
452
+ args_text = args
453
+ else:
454
+ try:
455
+ args_text = json.dumps(args, ensure_ascii=False)
456
+ except Exception:
457
+ args_text = "{}"
458
+ blocks.append(
459
+ "<function_call>\n"
460
+ f"<name>{name}</name>\n"
461
+ f"<args_json>{args_text}</args_json>\n"
462
+ "</function_call>"
463
+ )
464
+ if not blocks:
465
+ return ""
466
+ return f"{trigger_signal}\n<function_calls>\n" + "\n".join(blocks) + "\n</function_calls>"
467
+
468
+
469
def _preprocess_messages(messages: list[dict]) -> list[dict]:
    """Rewrite tool/developer messages into plain user/system/assistant turns."""
    call_index = _build_tool_call_index_from_messages(messages)
    converted: list[dict] = []

    for message in messages:
        if not isinstance(message, dict):
            continue
        role = message.get("role")

        if role == "tool":
            # Tool results become user turns wrapped in the result envelope.
            call_id = message.get("tool_call_id")
            body = _extract_text_from_content(message.get("content", ""))
            fallback = {"name": message.get("name", "unknown_tool"), "arguments": "{}"}
            info = call_index.get(call_id, fallback)
            converted.append(
                {
                    "role": "user",
                    "content": _format_tool_result_for_ai(info["name"], info["arguments"], body),
                }
            )
        elif role == "assistant" and isinstance(message.get("tool_calls"), list):
            # Re-emit assistant tool calls as trigger-signal XML after any text.
            xml = _format_assistant_tool_calls_for_ai(message["tool_calls"], GLOBAL_TRIGGER_SIGNAL)
            body = (_extract_text_from_content(message.get("content", "")) + "\n" + xml).strip()
            converted.append({"role": "assistant", "content": body})
        elif role == "developer":
            # OpenAI "developer" role maps onto a plain system message.
            as_system = dict(message)
            as_system["role"] = "system"
            converted.append(as_system)
        else:
            converted.append(message)

    return converted
505
+
506
+
507
+ def _generate_function_prompt(tools: list[dict], trigger_signal: str) -> str:
508
+ tool_lines: list[str] = []
509
+ for i, t in enumerate(tools):
510
+ if not isinstance(t, dict) or t.get("type") != "function":
511
+ continue
512
+ fn = t.get("function", {}) if isinstance(t.get("function"), dict) else {}
513
+ name = str(fn.get("name", "")).strip()
514
+ if not name:
515
+ continue
516
+ desc = str(fn.get("description", "")).strip() or "None"
517
+ params = fn.get("parameters", {})
518
+ required = params.get("required", []) if isinstance(params, dict) else []
519
+ try:
520
+ params_json = json.dumps(params, ensure_ascii=False)
521
+ except Exception:
522
+ params_json = "{}"
523
+
524
+ tool_lines.append(
525
+ f"{i+1}. <tool name=\"{name}\">\n"
526
+ f" Description: {desc}\n"
527
+ f" Required: {', '.join(required) if isinstance(required, list) and required else 'None'}\n"
528
+ f" Parameters JSON Schema: {params_json}"
529
+ )
530
+
531
+ tools_block = "\n\n".join(tool_lines) if tool_lines else "(no tools)"
532
+
533
+ return (
534
+ "You have access to tools.\n\n"
535
+ "When you need to call tools, you MUST output exactly:\n"
536
+ f"{trigger_signal}\n"
537
+ "<function_calls>\n"
538
+ " <function_call>\n"
539
+ " <name>tool_name</name>\n"
540
+ " <args_json>{\"arg\":\"value\"}</args_json>\n"
541
+ " </function_call>\n"
542
+ "</function_calls>\n\n"
543
+ "Rules:\n"
544
+ "1) args_json MUST be valid JSON object\n"
545
+ "2) For multiple calls, output one <function_calls> with multiple <function_call> children\n"
546
+ "3) If no tool is needed, answer normally\n\n"
547
+ f"Available tools:\n{tools_block}"
548
+ )
549
+
550
+
551
+ def _safe_process_tool_choice(tool_choice: Any, tools: list[dict]) -> str:
552
+ if tool_choice is None:
553
+ return ""
554
+
555
+ if isinstance(tool_choice, str):
556
+ if tool_choice == "required":
557
+ return "\nIMPORTANT: You MUST call at least one tool in your next response."
558
+ if tool_choice == "none":
559
+ return "\nIMPORTANT: Do not call tools. Answer directly."
560
+ return ""
561
+
562
+ if isinstance(tool_choice, dict):
563
+ fn = tool_choice.get("function", {}) if isinstance(tool_choice.get("function"), dict) else {}
564
+ name = fn.get("name")
565
+ if isinstance(name, str) and name:
566
+ return f"\nIMPORTANT: You MUST call this tool: {name}"
567
+
568
+ return ""
569
+
570
+
571
def _flatten_messages_for_zai(messages: list[dict]) -> list[dict]:
    """Fold the whole conversation into one pseudo-XML user message for Z.ai."""
    rendered = []
    for message in messages:
        tag = str(message.get("role", "user")).upper()
        text = _extract_text_from_content(message.get("content", ""))
        rendered.append(f"<{tag}>{text}</{tag}>")
    return [{"role": "user", "content": "\n".join(rendered)}]
578
+
579
+
580
+ def _remove_think_blocks(text: str) -> str:
581
+ while "<think>" in text and "</think>" in text:
582
+ start = text.find("<think>")
583
+ if start == -1:
584
+ break
585
+ pos = start + 7
586
+ depth = 1
587
+ while pos < len(text) and depth > 0:
588
+ if text[pos : pos + 7] == "<think>":
589
+ depth += 1
590
+ pos += 7
591
+ elif text[pos : pos + 8] == "</think>":
592
+ depth -= 1
593
+ pos += 8
594
+ else:
595
+ pos += 1
596
+ if depth == 0:
597
+ text = text[:start] + text[pos:]
598
+ else:
599
+ break
600
+ return text
601
+
602
+
603
+ def _find_last_trigger_signal_outside_think(text: str, trigger_signal: str) -> int:
604
+ if not text or not trigger_signal:
605
+ return -1
606
+ i = 0
607
+ depth = 0
608
+ last = -1
609
+ while i < len(text):
610
+ if text.startswith("<think>", i):
611
+ depth += 1
612
+ i += 7
613
+ continue
614
+ if text.startswith("</think>", i):
615
+ depth = max(0, depth - 1)
616
+ i += 8
617
+ continue
618
+ if depth == 0 and text.startswith(trigger_signal, i):
619
+ last = i
620
+ i += 1
621
+ continue
622
+ i += 1
623
+ return last
624
+
625
+
626
def _drain_safe_answer_delta(
    answer_text: str,
    emitted_chars: int,
    *,
    has_fc: bool,
    trigger_signal: str,
) -> tuple[str, int, bool]:
    """Return ``(delta, new_emitted_count, trigger_seen)`` for streamed text.

    Without function calling every new character may be emitted.  With
    function calling the tail is held back by ``len(trigger_signal) - 1``
    characters so a trigger split across chunks never leaks; once the
    trigger is seen, only text before it may be emitted.
    """
    total = len(answer_text)
    if emitted_chars >= total:
        return "", emitted_chars, False

    if not has_fc:
        return answer_text[emitted_chars:], total, False

    marker = _find_last_trigger_signal_outside_think(answer_text, trigger_signal)
    seen = marker >= 0
    if seen:
        boundary = marker
    else:
        holdback = max(0, len(trigger_signal) - 1)
        boundary = max(0, total - holdback)

    if boundary <= emitted_chars:
        return "", emitted_chars, seen

    return answer_text[emitted_chars:boundary], boundary, seen
658
+
659
+
660
def _parse_function_calls_xml(xml_string: str, trigger_signal: str) -> list[dict]:
    """Parse trigger-signal XML into OpenAI-style tool_call dicts."""
    if not xml_string or trigger_signal not in xml_string:
        return []

    # Only text outside <think> blocks counts; take the LAST trigger.
    visible = _remove_think_blocks(xml_string)
    trigger_at = visible.rfind(trigger_signal)
    if trigger_at == -1:
        return []

    block_match = re.search(r"<function_calls>([\s\S]*?)</function_calls>", visible[trigger_at:])
    if block_match is None:
        return []

    calls: list[dict] = []
    for chunk in re.findall(r"<function_call>([\s\S]*?)</function_call>", block_match.group(1)):
        name_match = re.search(r"<name>([\s\S]*?)</name>", chunk)
        if name_match is None:
            continue
        args_match = re.search(r"<args_json>([\s\S]*?)</args_json>", chunk)
        raw = args_match.group(1).strip() if args_match else "{}"
        # Arguments must end up as a JSON *object* string; wrap scalars and
        # keep unparseable input under a "raw" key.
        try:
            args_obj = json.loads(raw) if raw else {}
            if not isinstance(args_obj, dict):
                args_obj = {"value": args_obj}
        except Exception:
            args_obj = {"raw": raw}

        calls.append(
            {
                "id": f"call_{uuid.uuid4().hex[:24]}",
                "type": "function",
                "function": {
                    "name": name_match.group(1).strip(),
                    "arguments": json.dumps(args_obj, ensure_ascii=False),
                },
            }
        )

    return calls
701
+
702
+
703
+ # ── OpenAI response helpers ──────────────────────────────────────────
704
+
705
+
706
+ def _make_id() -> str:
707
+ return f"chatcmpl-{uuid.uuid4().hex[:29]}"
708
+
709
+
710
+ def _estimate_tokens(text: str) -> int:
711
+ if not text:
712
+ return 0
713
+ return max(1, math.ceil(len(text) / 2))
714
+
715
+
716
+ def _to_optional_bool(value: Any) -> bool | None:
717
+ if value is None:
718
+ return None
719
+ if isinstance(value, bool):
720
+ return value
721
+ if isinstance(value, (int, float)):
722
+ return bool(value)
723
+ if isinstance(value, str):
724
+ v = value.strip().lower()
725
+ if v in {"1", "true", "yes", "on"}:
726
+ return True
727
+ if v in {"0", "false", "no", "off"}:
728
+ return False
729
+ return None
730
+
731
+
732
def _build_usage(prompt_text: str, completion_text: str) -> dict:
    """Build an OpenAI-style ``usage`` dict from estimated token counts."""
    prompt_tokens = _estimate_tokens(prompt_text)
    completion_tokens = _estimate_tokens(completion_text)
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
736
+
737
+
738
+ def _openai_chunk(
739
+ completion_id: str,
740
+ model: str,
741
+ *,
742
+ content: str | None = None,
743
+ reasoning_content: str | None = None,
744
+ finish_reason: str | None = None,
745
+ ) -> dict:
746
+ delta: dict = {}
747
+ if content is not None:
748
+ delta["content"] = content
749
+ if reasoning_content is not None:
750
+ delta["reasoning_content"] = reasoning_content
751
+ return {
752
+ "id": completion_id,
753
+ "object": "chat.completion.chunk",
754
+ "created": int(time.time()),
755
+ "model": model,
756
+ "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
757
+ }
758
+
759
+
760
+ def _extract_upstream_tool_calls(data: dict) -> list[dict]:
761
+ # Native Toolify/Z.ai style
762
+ tcs = data.get("tool_calls")
763
+ if isinstance(tcs, list):
764
+ return tcs
765
+
766
+ # OpenAI-like style: choices[0].delta.tool_calls or choices[0].message.tool_calls
767
+ choices = data.get("choices")
768
+ if isinstance(choices, list) and choices:
769
+ c0 = choices[0] if isinstance(choices[0], dict) else {}
770
+ delta = c0.get("delta") if isinstance(c0.get("delta"), dict) else {}
771
+ message = c0.get("message") if isinstance(c0.get("message"), dict) else {}
772
+ for candidate in (delta.get("tool_calls"), message.get("tool_calls")):
773
+ if isinstance(candidate, list):
774
+ return candidate
775
+
776
+ return []
777
+
778
+
779
+ def _extract_upstream_delta(data: dict) -> tuple[str, str]:
780
+ """Best-effort extract (phase, delta_text) from upstream event payload."""
781
+ phase = str(data.get("phase", "") or "")
782
+
783
+ # OpenAI-like envelope
784
+ choices = data.get("choices")
785
+ if isinstance(choices, list) and choices:
786
+ c0 = choices[0] if isinstance(choices[0], dict) else {}
787
+ delta_obj = c0.get("delta") if isinstance(c0.get("delta"), dict) else {}
788
+ msg_obj = c0.get("message") if isinstance(c0.get("message"), dict) else {}
789
+ if not phase:
790
+ phase = str(c0.get("phase", "") or "")
791
+ for v in (
792
+ delta_obj.get("reasoning_content"),
793
+ delta_obj.get("content"),
794
+ msg_obj.get("reasoning_content"),
795
+ msg_obj.get("content"),
796
+ ):
797
+ if isinstance(v, str) and v:
798
+ return phase, v
799
+
800
+ candidates = [
801
+ data.get("delta_content"),
802
+ data.get("content"),
803
+ data.get("delta"),
804
+ (data.get("message") or {}).get("content") if isinstance(data.get("message"), dict) else None,
805
+ ]
806
+
807
+ for v in candidates:
808
+ if isinstance(v, str) and v:
809
+ return phase, v
810
+
811
+ return phase, ""
812
+
813
+
814
+ async def _iter_upstream_with_first_event_timeout(upstream: Any, timeout_s: float):
815
+ """Wrap upstream iterator and enforce a timeout for the first event only."""
816
+ iterator = upstream.__aiter__()
817
+ try:
818
+ first = await asyncio.wait_for(iterator.__anext__(), timeout=timeout_s)
819
+ except StopAsyncIteration:
820
+ return
821
+ yield first
822
+ async for data in iterator:
823
+ yield data
824
+
825
+
826
+ # ── Endpoints ──────────��─────────────────────────────────────────────
827
+
828
+
829
@app.get("/v1/models")
async def list_models():
    """OpenAI-compatible model listing backed by the upstream pool."""
    upstream = await pool.get_models()
    # Upstream may answer with {"data": [...]} or a bare list.
    if isinstance(upstream, dict) and "data" in upstream:
        raw_models = upstream["data"]
    elif isinstance(upstream, list):
        raw_models = upstream
    else:
        raw_models = []

    data = []
    for entry in raw_models:
        data.append(
            {
                "id": entry.get("id") or entry.get("name", "unknown"),
                "object": "model",
                "created": 0,
                "owned_by": "z.ai",
            }
        )
    return {"object": "list", "data": data}
851
+
852
+
853
+ @app.post("/v1/chat/completions")
854
+ async def chat_completions(request: Request):
855
+ body = await request.json()
856
+
857
+ model: str = body.get("model", "glm-5")
858
+ messages: list[dict] = body.get("messages", [])
859
+ stream: bool = body.get("stream", False)
860
+ tools: list[dict] | None = body.get("tools")
861
+ tool_choice = body.get("tool_choice")
862
+ enable_thinking = _to_optional_bool(body.get("enable_thinking"))
863
+
864
+ # signature prompt: last user message in original request
865
+ prompt = ""
866
+ for msg in reversed(messages):
867
+ if msg.get("role") == "user":
868
+ prompt = _extract_text_from_content(msg.get("content", ""))
869
+ break
870
+ if not prompt:
871
+ return JSONResponse(
872
+ status_code=400,
873
+ content={"error": {"message": "No user message found in messages", "type": "invalid_request_error"}},
874
+ )
875
+
876
+ processed_messages = _preprocess_messages(messages)
877
+
878
+ has_fc = bool(tools)
879
+ if has_fc:
880
+ fc_prompt = _generate_function_prompt(tools or [], GLOBAL_TRIGGER_SIGNAL)
881
+ fc_prompt += _safe_process_tool_choice(tool_choice, tools or [])
882
+ processed_messages.insert(0, {"role": "system", "content": fc_prompt})
883
+
884
+ flat_messages = _flatten_messages_for_zai(processed_messages)
885
+ usage_prompt_text = "\n".join(_extract_text_from_content(m.get("content", "")) for m in processed_messages)
886
+
887
+ req_id = f"req_{uuid.uuid4().hex[:10]}"
888
+ logger.info(
889
+ "[entry][%s] model=%s stream=%s tools=%d input_messages=%d flat_chars=%d est_prompt_tokens=%d first_event_timeout=%.1fs timeout_retry_max=%d",
890
+ req_id,
891
+ model,
892
+ stream,
893
+ len(tools or []),
894
+ len(messages),
895
+ len(flat_messages[0].get("content", "")),
896
+ _estimate_tokens(usage_prompt_text),
897
+ UPSTREAM_FIRST_EVENT_TIMEOUT,
898
+ UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX,
899
+ )
900
+
901
+ async def run_once(auth: dict[str, str], enable_thinking_override: bool | None):
902
+ client = ZaiClient()
903
+ try:
904
+ client.token = auth["token"]
905
+ client.user_id = auth["user_id"]
906
+ client.username = auth["username"]
907
+ create_chat_started = time.perf_counter()
908
+ chat = await client.create_chat(prompt, model, enable_thinking=enable_thinking_override)
909
+ create_chat_elapsed = time.perf_counter() - create_chat_started
910
+ chat_id = chat["id"]
911
+ upstream = client.chat_completions(
912
+ chat_id=chat_id,
913
+ messages=flat_messages,
914
+ prompt=prompt,
915
+ model=model,
916
+ tools=None,
917
+ enable_thinking=enable_thinking_override,
918
+ )
919
+ return upstream, client, chat_id, create_chat_elapsed
920
+ except Exception:
921
+ await client.close()
922
+ raise
923
+
924
+ if stream:
925
+
926
+ async def gen_sse():
927
+ completion_id = _make_id()
928
+ retried = False
929
+ first_event_timeout_retries = 0
930
+ empty_reply_retries = 0
931
+ current_uid: str | None = None
932
+ role_emitted = False
933
+
934
+ while True:
935
+ client: ZaiClient | None = None
936
+ chat_id: str | None = None
937
+ try:
938
+ phase_started = time.perf_counter()
939
+ await pool.ensure_auth()
940
+ ensure_auth_elapsed = time.perf_counter() - phase_started
941
+ auth = pool.get_auth_snapshot()
942
+ current_uid = auth["user_id"]
943
+ if not role_emitted:
944
+ yield f"data: {json.dumps({'id': completion_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': model, 'choices': [{'index': 0, 'delta': {'role': 'assistant'}, 'finish_reason': None}]}, ensure_ascii=False)}\n\n"
945
+ role_emitted = True
946
+ upstream, client, chat_id, create_chat_elapsed = await run_once(auth, enable_thinking)
947
+ first_upstream_started = time.perf_counter()
948
+ first_event_logged = False
949
+
950
+ reasoning_parts: list[str] = []
951
+ answer_text = ""
952
+ emitted_answer_chars = 0
953
+ native_tool_calls: list[dict] = []
954
+
955
+ async for data in _iter_upstream_with_first_event_timeout(upstream, UPSTREAM_FIRST_EVENT_TIMEOUT):
956
+ if not first_event_logged:
957
+ first_upstream_elapsed = time.perf_counter() - first_upstream_started
958
+ logger.info(
959
+ "[stream][%s] phase ensure_auth=%.3fs create_chat=%.3fs first_upstream_event=%.3fs",
960
+ completion_id,
961
+ ensure_auth_elapsed,
962
+ create_chat_elapsed,
963
+ first_upstream_elapsed,
964
+ )
965
+ first_event_logged = True
966
+ phase, delta = _extract_upstream_delta(data)
967
+
968
+ upstream_tcs = _extract_upstream_tool_calls(data)
969
+ if upstream_tcs:
970
+ for tc in upstream_tcs:
971
+ native_tool_calls.append(
972
+ {
973
+ "id": tc.get("id", f"call_{uuid.uuid4().hex[:24]}"),
974
+ "type": "function",
975
+ "function": {
976
+ "name": tc.get("function", {}).get("name", ""),
977
+ "arguments": tc.get("function", {}).get("arguments", ""),
978
+ },
979
+ }
980
+ )
981
+ continue
982
+
983
+ if phase == "thinking" and delta:
984
+ reasoning_parts.append(delta)
985
+ chunk = _openai_chunk(completion_id, model, reasoning_content=delta)
986
+ yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"
987
+ elif delta:
988
+ answer_text += delta
989
+ safe_delta, emitted_answer_chars, _ = _drain_safe_answer_delta(
990
+ answer_text,
991
+ emitted_answer_chars,
992
+ has_fc=has_fc,
993
+ trigger_signal=GLOBAL_TRIGGER_SIGNAL,
994
+ )
995
+ if safe_delta:
996
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=safe_delta), ensure_ascii=False)}\n\n"
997
+
998
+ if not first_event_logged:
999
+ logger.info(
1000
+ "[stream][%s] phase ensure_auth=%.3fs create_chat=%.3fs first_upstream_event=EOF",
1001
+ completion_id,
1002
+ ensure_auth_elapsed,
1003
+ create_chat_elapsed,
1004
+ )
1005
+
1006
+ if native_tool_calls:
1007
+ logger.info("[stream][%s] native_tool_calls=%d", completion_id, len(native_tool_calls))
1008
+ for i, tc in enumerate(native_tool_calls):
1009
+ tc_chunk = {
1010
+ "id": completion_id,
1011
+ "object": "chat.completion.chunk",
1012
+ "created": int(time.time()),
1013
+ "model": model,
1014
+ "choices": [{"index": 0, "delta": {"tool_calls": [{"index": i, **tc}]}, "finish_reason": None}],
1015
+ }
1016
+ yield f"data: {json.dumps(tc_chunk, ensure_ascii=False)}\n\n"
1017
+ finish = _openai_chunk(completion_id, model, finish_reason="tool_calls")
1018
+ yield f"data: {json.dumps(finish, ensure_ascii=False)}\n\n"
1019
+ yield "data: [DONE]\n\n"
1020
+ return
1021
+
1022
+ logger.info(
1023
+ "[stream][%s] collected answer_len=%d reasoning_len=%d",
1024
+ completion_id,
1025
+ len(answer_text),
1026
+ len("".join(reasoning_parts)),
1027
+ )
1028
+ if not answer_text and not reasoning_parts:
1029
+ if empty_reply_retries >= UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX:
1030
+ yield f"data: {json.dumps({'error': {'message': 'Upstream returned empty reply after retry', 'type': 'empty_response_error'}}, ensure_ascii=False)}\n\n"
1031
+ yield "data: [DONE]\n\n"
1032
+ return
1033
+ empty_reply_retries += 1
1034
+ logger.warning(
1035
+ "[stream][%s] empty upstream reply, retrying... (%d/%d)",
1036
+ completion_id,
1037
+ empty_reply_retries,
1038
+ UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX,
1039
+ )
1040
+ await pool.refresh_auth(current_uid)
1041
+ current_uid = None
1042
+ continue
1043
+ parsed = _parse_function_calls_xml(answer_text, GLOBAL_TRIGGER_SIGNAL) if has_fc else []
1044
+
1045
+ if parsed:
1046
+ logger.info("[stream][%s] parsed_tool_calls=%d", completion_id, len(parsed))
1047
+ prefix_pos = _find_last_trigger_signal_outside_think(answer_text, GLOBAL_TRIGGER_SIGNAL)
1048
+ if prefix_pos > emitted_answer_chars:
1049
+ prefix_delta = answer_text[emitted_answer_chars:prefix_pos]
1050
+ if prefix_delta:
1051
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=prefix_delta), ensure_ascii=False)}\n\n"
1052
+
1053
+ for i, tc in enumerate(parsed):
1054
+ tc_chunk = {
1055
+ "id": completion_id,
1056
+ "object": "chat.completion.chunk",
1057
+ "created": int(time.time()),
1058
+ "model": model,
1059
+ "choices": [{"index": 0, "delta": {"tool_calls": [{"index": i, **tc}]}, "finish_reason": None}],
1060
+ }
1061
+ yield f"data: {json.dumps(tc_chunk, ensure_ascii=False)}\n\n"
1062
+
1063
+ finish = _openai_chunk(completion_id, model, finish_reason="tool_calls")
1064
+ yield f"data: {json.dumps(finish, ensure_ascii=False)}\n\n"
1065
+ yield "data: [DONE]\n\n"
1066
+ return
1067
+
1068
+ if emitted_answer_chars < len(answer_text):
1069
+ tail_delta = answer_text[emitted_answer_chars:]
1070
+ if tail_delta:
1071
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=tail_delta), ensure_ascii=False)}\n\n"
1072
+ else:
1073
+ # Never return an empty stream response body to clients.
1074
+ if not answer_text:
1075
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=''), ensure_ascii=False)}\n\n"
1076
+
1077
+ finish = _openai_chunk(completion_id, model, finish_reason="stop")
1078
+ yield f"data: {json.dumps(finish, ensure_ascii=False)}\n\n"
1079
+ yield "data: [DONE]\n\n"
1080
+ return
1081
+
1082
+ except asyncio.TimeoutError:
1083
+ logger.error(
1084
+ "[stream][%s] first upstream event timeout: %.1fs",
1085
+ completion_id,
1086
+ UPSTREAM_FIRST_EVENT_TIMEOUT,
1087
+ )
1088
+ if client is not None:
1089
+ if chat_id:
1090
+ await client.delete_chat(chat_id)
1091
+ await client.close()
1092
+ client = None
1093
+ if first_event_timeout_retries >= UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX:
1094
+ yield f"data: {json.dumps({'error': {'message': 'Upstream first event timeout after retry', 'type': 'timeout_error'}}, ensure_ascii=False)}\n\n"
1095
+ yield "data: [DONE]\n\n"
1096
+ return
1097
+ first_event_timeout_retries += 1
1098
+ logger.info(
1099
+ "[stream][%s] retrying after first-event timeout... (%d/%d)",
1100
+ completion_id,
1101
+ first_event_timeout_retries,
1102
+ UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX,
1103
+ )
1104
+ await pool.refresh_auth(current_uid)
1105
+ current_uid = None
1106
+ continue
1107
+ except (httpcore.RemoteProtocolError, httpx.RemoteProtocolError) as e:
1108
+ logger.error("[stream][%s] server disconnected: %s", completion_id, e)
1109
+ if client is not None:
1110
+ if chat_id:
1111
+ await client.delete_chat(chat_id)
1112
+ await client.close()
1113
+ client = None
1114
+ if retried:
1115
+ error_msg = "上游服务断开连接,请稍后重试"
1116
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=f'[{error_msg}]'), ensure_ascii=False)}\n\n"
1117
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, finish_reason='error'), ensure_ascii=False)}\n\n"
1118
+ yield "data: [DONE]\n\n"
1119
+ return
1120
+ retried = True
1121
+ logger.info("[stream][%s] switching account and retrying...", completion_id)
1122
+ await pool.refresh_auth(current_uid)
1123
+ current_uid = None
1124
+ continue
1125
+ except (httpcore.ReadTimeout, httpx.ReadTimeout) as e:
1126
+ logger.error("[stream][%s] read timeout: %s", completion_id, e)
1127
+ if client is not None:
1128
+ if chat_id:
1129
+ await client.delete_chat(chat_id)
1130
+ await client.close()
1131
+ client = None
1132
+
1133
+ if retried:
1134
+ error_msg = "上游服务响应超时,请稍后重试或减少消息长度"
1135
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, content=f'[{error_msg}]'), ensure_ascii=False)}\n\n"
1136
+ yield f"data: {json.dumps(_openai_chunk(completion_id, model, finish_reason='error'), ensure_ascii=False)}\n\n"
1137
+ yield "data: [DONE]\n\n"
1138
+ return
1139
+
1140
+ retried = True
1141
+ logger.info("[stream][%s] retrying after timeout...", completion_id)
1142
+ await pool.refresh_auth(current_uid)
1143
+ current_uid = None
1144
+ continue
1145
+ except httpx.HTTPStatusError as e:
1146
+ # Handle upstream 400 with concurrency limit (code 429)
1147
+ is_concurrency = False
1148
+ try:
1149
+ err_body = e.response.json() if e.response else {}
1150
+ is_concurrency = err_body.get("code") == 429
1151
+ except Exception:
1152
+ pass
1153
+
1154
+ logger.error("[stream][%s] HTTP %s (concurrency=%s): %s", completion_id, e.response.status_code if e.response else '?', is_concurrency, e)
1155
+ if client is not None:
1156
+ if chat_id:
1157
+ await client.delete_chat(chat_id)
1158
+ await client.close()
1159
+ client = None
1160
+
1161
+ if retried:
1162
+ yield f"data: {json.dumps({'error': {'message': 'Upstream concurrency limit' if is_concurrency else 'Upstream error after retry', 'type': 'server_error'}}, ensure_ascii=False)}\n\n"
1163
+ yield "data: [DONE]\n\n"
1164
+ return
1165
+
1166
+ retried = True
1167
+ if is_concurrency:
1168
+ logger.info("[stream][%s] concurrency limit hit, cleaning up chats...", completion_id)
1169
+ await pool.cleanup_chats()
1170
+ await asyncio.sleep(1)
1171
+ await pool.refresh_auth(current_uid)
1172
+ current_uid = None
1173
+ continue
1174
+ except Exception as e:
1175
+ logger.exception("[stream][%s] exception: %s", completion_id, e)
1176
+ if client is not None:
1177
+ if chat_id:
1178
+ await client.delete_chat(chat_id)
1179
+ await client.close()
1180
+ client = None
1181
+
1182
+ if retried:
1183
+ yield f"data: {json.dumps({'error': {'message': 'Upstream Zai error after retry', 'type': 'server_error'}}, ensure_ascii=False)}\n\n"
1184
+ yield "data: [DONE]\n\n"
1185
+ return
1186
+
1187
+ retried = True
1188
+ logger.info("[stream][%s] refreshing auth and retrying...", completion_id)
1189
+ await pool.refresh_auth(current_uid)
1190
+ current_uid = None
1191
+ continue
1192
+ finally:
1193
+ if client is not None:
1194
+ if chat_id:
1195
+ await client.delete_chat(chat_id)
1196
+ await client.close()
1197
+ if current_uid:
1198
+ pool._release_by_user_id(current_uid)
1199
+ current_uid = None
1200
+
1201
+ return StreamingResponse(
1202
+ gen_sse(),
1203
+ media_type="text/event-stream",
1204
+ headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
1205
+ )
1206
+
1207
+ completion_id = _make_id()
1208
+ client: ZaiClient | None = None
1209
+ chat_id: str | None = None
1210
+ current_uid: str | None = None
1211
+
1212
+ max_sync_attempts = max(2, UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX + 1)
1213
+ for attempt in range(max_sync_attempts):
1214
+ try:
1215
+ phase_started = time.perf_counter()
1216
+ await pool.ensure_auth()
1217
+ ensure_auth_elapsed = time.perf_counter() - phase_started
1218
+ auth = pool.get_auth_snapshot()
1219
+ current_uid = auth["user_id"]
1220
+ upstream, client, chat_id, create_chat_elapsed = await run_once(auth, enable_thinking)
1221
+ first_upstream_started = time.perf_counter()
1222
+ first_event_logged = False
1223
+ reasoning_parts: list[str] = []
1224
+ answer_parts: list[str] = []
1225
+ native_tool_calls: list[dict] = []
1226
+
1227
+ async for data in _iter_upstream_with_first_event_timeout(upstream, UPSTREAM_FIRST_EVENT_TIMEOUT):
1228
+ if not first_event_logged:
1229
+ first_upstream_elapsed = time.perf_counter() - first_upstream_started
1230
+ logger.info(
1231
+ "[sync][%s] phase ensure_auth=%.3fs create_chat=%.3fs first_upstream_event=%.3fs",
1232
+ completion_id,
1233
+ ensure_auth_elapsed,
1234
+ create_chat_elapsed,
1235
+ first_upstream_elapsed,
1236
+ )
1237
+ first_event_logged = True
1238
+ phase, delta = _extract_upstream_delta(data)
1239
+
1240
+ upstream_tcs = _extract_upstream_tool_calls(data)
1241
+ if upstream_tcs:
1242
+ for tc in upstream_tcs:
1243
+ native_tool_calls.append(
1244
+ {
1245
+ "id": tc.get("id", f"call_{uuid.uuid4().hex[:24]}"),
1246
+ "type": "function",
1247
+ "function": {
1248
+ "name": tc.get("function", {}).get("name", ""),
1249
+ "arguments": tc.get("function", {}).get("arguments", ""),
1250
+ },
1251
+ }
1252
+ )
1253
+ elif phase == "thinking" and delta:
1254
+ reasoning_parts.append(delta)
1255
+ elif delta:
1256
+ answer_parts.append(delta)
1257
+
1258
+ if not first_event_logged:
1259
+ logger.info(
1260
+ "[sync][%s] phase ensure_auth=%.3fs create_chat=%.3fs first_upstream_event=EOF",
1261
+ completion_id,
1262
+ ensure_auth_elapsed,
1263
+ create_chat_elapsed,
1264
+ )
1265
+
1266
+ if native_tool_calls:
1267
+ message: dict = {"role": "assistant", "content": None, "tool_calls": native_tool_calls}
1268
+ if reasoning_parts:
1269
+ message["reasoning_content"] = "".join(reasoning_parts)
1270
+ usage = _build_usage(usage_prompt_text, "".join(reasoning_parts))
1271
+ return {
1272
+ "id": completion_id,
1273
+ "object": "chat.completion",
1274
+ "created": int(time.time()),
1275
+ "model": model,
1276
+ "choices": [{"index": 0, "message": message, "finish_reason": "tool_calls"}],
1277
+ "usage": usage,
1278
+ }
1279
+
1280
+ answer_text = "".join(answer_parts)
1281
+ if not answer_text and not reasoning_parts:
1282
+ if attempt < max_sync_attempts - 1:
1283
+ logger.warning(
1284
+ "[sync][%s] empty upstream reply, retrying... (%d/%d)",
1285
+ completion_id,
1286
+ attempt + 1,
1287
+ max_sync_attempts - 1,
1288
+ )
1289
+ await pool.refresh_auth(current_uid)
1290
+ current_uid = None
1291
+ continue
1292
+ return JSONResponse(
1293
+ status_code=502,
1294
+ content={"error": {"message": "Upstream returned empty reply after retry", "type": "empty_response_error"}},
1295
+ )
1296
+ parsed = _parse_function_calls_xml(answer_text, GLOBAL_TRIGGER_SIGNAL) if has_fc else []
1297
+ if parsed:
1298
+ prefix_pos = _find_last_trigger_signal_outside_think(answer_text, GLOBAL_TRIGGER_SIGNAL)
1299
+ prefix_text = answer_text[:prefix_pos].rstrip() if prefix_pos > 0 else None
1300
+ message = {"role": "assistant", "content": prefix_text or None, "tool_calls": parsed}
1301
+ if reasoning_parts:
1302
+ message["reasoning_content"] = "".join(reasoning_parts)
1303
+ usage = _build_usage(usage_prompt_text, (prefix_text or "") + "".join(reasoning_parts))
1304
+ return {
1305
+ "id": completion_id,
1306
+ "object": "chat.completion",
1307
+ "created": int(time.time()),
1308
+ "model": model,
1309
+ "choices": [{"index": 0, "message": message, "finish_reason": "tool_calls"}],
1310
+ "usage": usage,
1311
+ }
1312
+
1313
+ usage = _build_usage(usage_prompt_text, answer_text + "".join(reasoning_parts))
1314
+ msg: dict = {"role": "assistant", "content": answer_text}
1315
+ if reasoning_parts:
1316
+ msg["reasoning_content"] = "".join(reasoning_parts)
1317
+ return {
1318
+ "id": completion_id,
1319
+ "object": "chat.completion",
1320
+ "created": int(time.time()),
1321
+ "model": model,
1322
+ "choices": [{"index": 0, "message": msg, "finish_reason": "stop"}],
1323
+ "usage": usage,
1324
+ }
1325
+
1326
+ except asyncio.TimeoutError:
1327
+ logger.error(
1328
+ "[sync][%s] first upstream event timeout: %.1fs",
1329
+ completion_id,
1330
+ UPSTREAM_FIRST_EVENT_TIMEOUT,
1331
+ )
1332
+ if client is not None:
1333
+ if chat_id:
1334
+ await client.delete_chat(chat_id)
1335
+ await client.close()
1336
+ client = None
1337
+ chat_id = None
1338
+ if attempt < UPSTREAM_FIRST_EVENT_TIMEOUT_RETRY_MAX:
1339
+ await pool.refresh_auth(current_uid)
1340
+ current_uid = None
1341
+ continue
1342
+ return JSONResponse(
1343
+ status_code=504,
1344
+ content={"error": {"message": "Upstream first event timeout after retry", "type": "timeout_error"}},
1345
+ )
1346
+ except httpx.HTTPStatusError as e:
1347
+ is_concurrency = False
1348
+ try:
1349
+ err_body = e.response.json() if e.response else {}
1350
+ is_concurrency = err_body.get("code") == 429
1351
+ except Exception:
1352
+ pass
1353
+ logger.error("[sync][%s] HTTP %s (concurrency=%s): %s", completion_id, e.response.status_code if e.response else '?', is_concurrency, e)
1354
+ if client is not None:
1355
+ if chat_id:
1356
+ await client.delete_chat(chat_id)
1357
+ await client.close()
1358
+ client = None
1359
+ chat_id = None
1360
+ if attempt == 0:
1361
+ if is_concurrency:
1362
+ await pool.cleanup_chats()
1363
+ await asyncio.sleep(1)
1364
+ await pool.refresh_auth(current_uid)
1365
+ current_uid = None
1366
+ continue
1367
+ return JSONResponse(
1368
+ status_code=502,
1369
+ content={"error": {"message": "Upstream concurrency limit" if is_concurrency else "Upstream error after retry", "type": "server_error"}},
1370
+ )
1371
+ except Exception as e:
1372
+ logger.exception("[sync][%s] exception: %s", completion_id, e)
1373
+ if client is not None:
1374
+ if chat_id:
1375
+ await client.delete_chat(chat_id)
1376
+ await client.close()
1377
+ client = None
1378
+ chat_id = None
1379
+
1380
+ if attempt == 0:
1381
+ await pool.refresh_auth(current_uid)
1382
+ current_uid = None
1383
+ continue
1384
+ return JSONResponse(
1385
+ status_code=502,
1386
+ content={"error": {"message": "Upstream Zai error after retry", "type": "server_error"}},
1387
+ )
1388
+ finally:
1389
+ if client is not None:
1390
+ if chat_id:
1391
+ await client.delete_chat(chat_id)
1392
+ await client.close()
1393
+ if current_uid:
1394
+ pool._release_by_user_id(current_uid)
1395
+ current_uid = None
1396
+
1397
+ return JSONResponse(status_code=502, content={"error": {"message": "Unexpected error", "type": "server_error"}})
1398
+
1399
+
1400
+ # ── Anthropic Claude Messages Endpoint ───────────────────────────────
1401
+
1402
+
1403
@app.post("/v1/messages")
async def claude_messages(request: Request):
    """Anthropic Claude Messages API compatible endpoint for new-api.

    Translates a Claude-native request body (system, messages, tools,
    tool_choice, thinking) into the internal OpenAI-style format, then
    serves the reply either as Anthropic SSE (stream=true) or as a single
    Messages-API JSON object.
    """
    body = await request.json()
    model: str = body.get("model", "glm-5")
    claude_msgs: list[dict] = body.get("messages", [])
    system = body.get("system")
    stream: bool = body.get("stream", False)
    tools_claude: list[dict] | None = body.get("tools")
    tool_choice = body.get("tool_choice")
    enable_thinking = _to_optional_bool(body.get("enable_thinking"))
    # Fix: also honor the official Anthropic `thinking` request parameter
    # ({"type": "enabled", "budget_tokens": ...} or {"type": "disabled"})
    # when the non-standard `enable_thinking` flag is not supplied.
    if enable_thinking is None:
        thinking_cfg = body.get("thinking")
        if isinstance(thinking_cfg, dict):
            if thinking_cfg.get("type") == "enabled":
                enable_thinking = True
            elif thinking_cfg.get("type") == "disabled":
                enable_thinking = False

    openai_messages = claude_messages_to_openai(system, claude_msgs)
    openai_tools = claude_tools_to_openai(tools_claude)

    # The upstream chat-creation call needs a seed prompt; use the most
    # recent user message.
    prompt = ""
    for msg in reversed(openai_messages):
        if msg.get("role") == "user":
            prompt = _extract_text_from_content(msg.get("content", ""))
            break
    if not prompt:
        return JSONResponse(
            status_code=400,
            content={"type": "error", "error": {"type": "invalid_request_error", "message": "No user message"}},
        )

    processed_messages = _preprocess_messages(openai_messages)
    has_fc = bool(openai_tools)
    if has_fc:
        # Tool calling is emulated via a system prompt describing the tools
        # plus an XML trigger-signal protocol parsed out of the reply.
        fc_prompt = _generate_function_prompt(openai_tools, GLOBAL_TRIGGER_SIGNAL)
        fc_prompt += claude_tool_choice_prompt(tool_choice)
        processed_messages.insert(0, {"role": "system", "content": fc_prompt})

    flat_messages = _flatten_messages_for_zai(processed_messages)
    # Concatenated prompt text, used only for input-token estimation.
    usage_prompt = "\n".join(_extract_text_from_content(m.get("content", "")) for m in processed_messages)

    msg_id = make_claude_id()
    req_id = f"req_{uuid.uuid4().hex[:10]}"
    logger.info("[claude][%s] model=%s stream=%s tools=%d", req_id, model, stream, len(openai_tools or []))

    async def _run(auth):
        # Create an upstream chat under `auth` and open its completion
        # stream; returns (upstream iterator, client, chat_id). The client
        # is closed here only on failure — callers own it on success.
        c = ZaiClient()
        try:
            c.token, c.user_id, c.username = auth["token"], auth["user_id"], auth["username"]
            chat = await c.create_chat(prompt, model, enable_thinking=enable_thinking)
            chat_id = chat["id"]
            up = c.chat_completions(
                chat_id=chat_id,
                messages=flat_messages,
                prompt=prompt,
                model=model,
                enable_thinking=enable_thinking,
            )
            return up, c, chat_id
        except Exception:
            await c.close()
            raise

    if stream:
        return StreamingResponse(
            _claude_stream(msg_id, model, _run, has_fc, usage_prompt),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
        )

    return await _claude_sync(msg_id, model, _run, has_fc, usage_prompt)
1469
+
1470
+
1471
async def _claude_stream(msg_id: str, model: str, run_once, has_fc: bool, usage_prompt: str):
    """Generator for Claude SSE streaming.

    Pulls the upstream reply via `run_once` and re-emits it as Anthropic
    Messages-API SSE events (message_start, content_block_* for thinking /
    text / tool_use, message_delta, message_stop). On upstream failures it
    refreshes the account auth and retries once; `message_start` is only
    sent on the first attempt (guarded by `started`).
    """
    retried = False
    current_uid: str | None = None
    started = False
    while True:
        client = None
        chat_id = None
        try:
            await pool.ensure_auth()
            auth = pool.get_auth_snapshot()
            current_uid = auth["user_id"]
            input_tk = _estimate_tokens(usage_prompt)
            # Emit message_start/ping only once, even across retries — the
            # client has already received them if a retry happens mid-stream.
            if not started:
                yield sse_message_start(msg_id, model, input_tk)
                yield sse_ping()
                started = True
            upstream, client, chat_id = await run_once(auth)

            # Per-attempt accumulation state.
            r_parts: list[str] = []          # thinking deltas
            answer_text = ""                 # full visible answer so far
            emitted_answer_chars = 0         # chars of answer_text already sent
            bidx = 0                         # Anthropic content-block index
            thinking_on = False              # a "thinking" block is open
            text_on = False                  # a "text" block is open
            native_tcs: list[dict] = []      # tool calls reported by upstream

            async for data in upstream:
                phase, delta = _extract_upstream_delta(data)
                up_tcs = _extract_upstream_tool_calls(data)
                if up_tcs:
                    native_tcs.extend(up_tcs)
                    continue
                if phase == "thinking" and delta:
                    # Open a thinking block only before any text has started.
                    if not thinking_on and not text_on:
                        yield sse_content_block_start(bidx, {"type": "thinking", "thinking": ""})
                        thinking_on = True
                    r_parts.append(delta)
                    if thinking_on:
                        yield sse_content_block_delta(bidx, {"type": "thinking_delta", "thinking": delta})
                elif delta:
                    answer_text += delta
                    # Hold back text that might be the start of a tool-call
                    # trigger signal; only emit the "safe" prefix.
                    safe_delta, emitted_answer_chars, _ = _drain_safe_answer_delta(
                        answer_text,
                        emitted_answer_chars,
                        has_fc=has_fc,
                        trigger_signal=GLOBAL_TRIGGER_SIGNAL,
                    )
                    if safe_delta:
                        # Transition thinking -> text closes the thinking block.
                        if thinking_on:
                            yield sse_content_block_stop(bidx)
                            bidx += 1
                            thinking_on = False
                        if not text_on:
                            yield sse_content_block_start(bidx, {"type": "text", "text": ""})
                            text_on = True
                        yield sse_content_block_delta(bidx, {"type": "text_delta", "text": safe_delta})

            # close thinking block
            if thinking_on:
                yield sse_content_block_stop(bidx)
                bidx += 1

            # Prefer native upstream tool calls; otherwise try to parse the
            # XML tool-call protocol out of the collected answer text.
            all_tcs = native_tcs
            parsed_tcs: list[dict] = []
            if not all_tcs and has_fc:
                parsed_tcs = _parse_function_calls_xml(answer_text, GLOBAL_TRIGGER_SIGNAL)
                all_tcs = parsed_tcs

            if all_tcs:
                answer_visible = answer_text
                if parsed_tcs:
                    # Text before the trigger signal is still user-visible;
                    # flush any of it not yet emitted.
                    prefix_pos = _find_last_trigger_signal_outside_think(answer_text, GLOBAL_TRIGGER_SIGNAL)
                    if prefix_pos < 0:
                        prefix_pos = 0
                    if prefix_pos > emitted_answer_chars:
                        prefix_delta = answer_text[emitted_answer_chars:prefix_pos]
                        if prefix_delta:
                            if not text_on:
                                yield sse_content_block_start(bidx, {"type": "text", "text": ""})
                                text_on = True
                            yield sse_content_block_delta(bidx, {"type": "text_delta", "text": prefix_delta})
                    answer_visible = answer_text[:prefix_pos]
                elif emitted_answer_chars < len(answer_text):
                    # Native tool calls: flush the whole remaining answer tail.
                    tail_delta = answer_text[emitted_answer_chars:]
                    if tail_delta:
                        if not text_on:
                            yield sse_content_block_start(bidx, {"type": "text", "text": ""})
                            text_on = True
                        yield sse_content_block_delta(bidx, {"type": "text_delta", "text": tail_delta})

                if text_on:
                    yield sse_content_block_stop(bidx)
                    bidx += 1
                    text_on = False
                # Emit each tool call as its own tool_use content block.
                for tc in all_tcs:
                    fn = tc.get("function", {}) if isinstance(tc.get("function"), dict) else tc
                    nm = fn.get("name", tc.get("name", ""))
                    args_s = fn.get("arguments", "{}")
                    # Normalize OpenAI-style "call_" ids to Anthropic "toolu_".
                    tid = tc.get("id", f"toolu_{uuid.uuid4().hex[:20]}").replace("call_", "toolu_")
                    yield sse_content_block_start(bidx, {"type": "tool_use", "id": tid, "name": nm, "input": {}})
                    yield sse_content_block_delta(bidx, {"type": "input_json_delta", "partial_json": args_s})
                    yield sse_content_block_stop(bidx)
                    bidx += 1
                out_tk = _estimate_tokens("".join(r_parts) + answer_visible)
                yield sse_message_delta("tool_use", out_tk)
                yield sse_message_stop()
                return

            # No tool calls: flush any held-back answer tail.
            if emitted_answer_chars < len(answer_text):
                tail_delta = answer_text[emitted_answer_chars:]
                if tail_delta:
                    if not text_on:
                        yield sse_content_block_start(bidx, {"type": "text", "text": ""})
                        text_on = True
                    yield sse_content_block_delta(bidx, {"type": "text_delta", "text": tail_delta})
            # Always emit at least one (possibly empty) text block so the
            # client never sees a message with zero content blocks.
            if not text_on:
                yield sse_content_block_start(bidx, {"type": "text", "text": ""})
            yield sse_content_block_stop(bidx)
            out_tk = _estimate_tokens("".join(r_parts) + answer_text)
            yield sse_message_delta("end_turn", out_tk)
            yield sse_message_stop()
            return

        except (httpcore.ReadTimeout, httpx.ReadTimeout) as e:
            logger.error("[claude-stream][%s] timeout: %s", msg_id, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            if retried:
                yield sse_error("overloaded_error", "Upstream timeout")
                return
            retried = True
            await pool.refresh_auth(current_uid)
            current_uid = None
            continue
        except (httpcore.RemoteProtocolError, httpx.RemoteProtocolError) as e:
            logger.error("[claude-stream][%s] server disconnected: %s", msg_id, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            if retried:
                yield sse_error("api_error", "Server disconnected, please retry")
                return
            retried = True
            await pool.refresh_auth(current_uid)
            current_uid = None
            continue
        except httpx.HTTPStatusError as e:
            # Upstream signals its concurrency limit with body code 429.
            is_concurrency = False
            try:
                err_body = e.response.json() if e.response else {}
                is_concurrency = err_body.get("code") == 429
            except Exception:
                pass
            logger.error("[claude-stream][%s] HTTP %s (concurrency=%s): %s", msg_id, e.response.status_code if e.response else '?', is_concurrency, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            if retried:
                yield sse_error("overloaded_error" if is_concurrency else "api_error", "Upstream concurrency limit" if is_concurrency else "Upstream error after retry")
                return
            retried = True
            if is_concurrency:
                logger.info("[claude-stream][%s] concurrency limit hit, cleaning up chats...", msg_id)
                await pool.cleanup_chats()
                await asyncio.sleep(1)
            await pool.refresh_auth(current_uid)
            current_uid = None
            continue
        except Exception as e:
            logger.exception("[claude-stream][%s] error: %s", msg_id, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            if retried:
                yield sse_error("api_error", "Upstream error after retry")
                return
            retried = True
            await pool.refresh_auth(current_uid)
            current_uid = None
            continue
        finally:
            # Runs on every loop exit (return / continue): best-effort chat
            # deletion, client close, and account-slot release.
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            if current_uid:
                pool._release_by_user_id(current_uid)
                current_uid = None
1669
+
1670
+
1671
async def _claude_sync(msg_id: str, model: str, run_once, has_fc: bool, usage_prompt: str):
    """Non-streaming Claude response.

    Drains the whole upstream stream, then builds a single Anthropic
    Messages-API JSON object (thinking + text + optional tool_use blocks).
    Makes at most two attempts, refreshing the account auth between them.
    """
    client = None
    chat_id = None
    current_uid: str | None = None
    for attempt in range(2):
        try:
            await pool.ensure_auth()
            auth = pool.get_auth_snapshot()
            current_uid = auth["user_id"]
            upstream, client, chat_id = await run_once(auth)
            # r_parts: thinking deltas; a_parts: visible answer deltas.
            r_parts, a_parts = [], []
            native_tcs: list[dict] = []

            async for data in upstream:
                phase, delta = _extract_upstream_delta(data)
                up_tcs = _extract_upstream_tool_calls(data)
                if up_tcs:
                    native_tcs.extend(up_tcs)
                elif phase == "thinking" and delta:
                    r_parts.append(delta)
                elif delta:
                    a_parts.append(delta)

            answer = "".join(a_parts)
            # Prefer native upstream tool calls; otherwise parse the XML
            # tool-call protocol out of the answer text.
            all_tcs = native_tcs
            if not all_tcs and has_fc:
                all_tcs = _parse_function_calls_xml(answer, GLOBAL_TRIGGER_SIGNAL)
            if all_tcs:
                # Keep only the visible text before the trigger signal.
                pp = _find_last_trigger_signal_outside_think(answer, GLOBAL_TRIGGER_SIGNAL)
                answer = answer[:pp].rstrip() if pp > 0 else ""

            in_tk = _estimate_tokens(usage_prompt)
            # NOTE(review): output tokens are estimated from the raw answer
            # parts, i.e. including any stripped tool-call XML.
            out_tk = _estimate_tokens("".join(r_parts) + "".join(a_parts))
            return build_non_stream_response(msg_id, model, r_parts, answer, all_tcs or None, in_tk, out_tk)

        except httpx.HTTPStatusError as e:
            # Upstream signals its concurrency limit with body code 429.
            is_concurrency = False
            try:
                err_body = e.response.json() if e.response else {}
                is_concurrency = err_body.get("code") == 429
            except Exception:
                pass
            logger.error("[claude-sync][%s] HTTP %s (concurrency=%s): %s", msg_id, e.response.status_code if e.response else '?', is_concurrency, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            chat_id = None
            if attempt == 0:
                if is_concurrency:
                    await pool.cleanup_chats()
                    await asyncio.sleep(1)
                await pool.refresh_auth(current_uid)
                current_uid = None
                continue
            return JSONResponse(
                status_code=500,
                content={"type": "error", "error": {"type": "overloaded_error" if is_concurrency else "api_error", "message": "Upstream concurrency limit" if is_concurrency else "Upstream error"}},
            )
        except Exception as e:
            logger.exception("[claude-sync][%s] error: %s", msg_id, e)
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            client = None
            chat_id = None
            if attempt == 0:
                await pool.refresh_auth(current_uid)
                current_uid = None
                continue
            return JSONResponse(
                status_code=500,
                content={"type": "error", "error": {"type": "api_error", "message": "Upstream error"}},
            )
        finally:
            # Runs on every attempt exit (return / continue): best-effort
            # chat deletion, client close, and account-slot release.
            if client:
                if chat_id:
                    await client.delete_chat(chat_id)
                await client.close()
            if current_uid:
                pool._release_by_user_id(current_uid)
                current_uid = None

    return JSONResponse(status_code=500, content={"type": "error", "error": {"type": "api_error", "message": "Unexpected"}})
1758
+
1759
+
1760
if __name__ == "__main__":
    # Bind address and port are configurable via environment variables;
    # the defaults preserve the original behavior (0.0.0.0:30016, matching
    # the Dockerfile's EXPOSE).
    import os

    uvicorn.run(
        app,
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "30016")),
    )