Spaces:

overwrite69
/

haiku-api

Sleeping

App Files Files Community

overwrite69 committed 8 days ago

Commit

3d49d68

verified ·

1 Parent(s): 7cd613b

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

README.md +47 -1
app.py +501 -106

README.md CHANGED Viewed

@@ -11,6 +11,24 @@ app_port: 7860
 OpenAI-compatible API proxy for Claude Haiku 4.5 via chatgpt.org.
 ## Usage
 ### Chat Completions (non-streaming)
@@ -36,6 +54,32 @@ curl https://YOUR_SPACE.hf.space/v1/chat/completions \
   }'
 ```
 ### With OpenAI Python SDK
 ```python
@@ -64,7 +108,9 @@ curl https://YOUR_SPACE.hf.space/v1/models
 | Endpoint | Description |
 |---|---|
-| `POST /v1/chat/completions` | OpenAI-compatible chat completions |
 | `GET /v1/models` | List available models |
 | `GET /health` | Health check |
 | `GET /debug/session` | Session debug info |

 OpenAI-compatible API proxy for Claude Haiku 4.5 via chatgpt.org.
+Supports **tool/function calling**, auto-continue for the 1K token limit, rotating proxy, and SSE keep-alive.
+## Features
+- **Tool/Function Calling**: Full OpenAI-compatible tool calling support. Converts `tools` definitions into a system prompt, parses Claude's output for `<tool_call name="...">…</tool_call_>` blocks, and returns properly formatted `tool_calls` responses.
+- **Auto-Continue**: When the upstream 1K token limit is hit, automatically continues the response with "Continue" messages.
+- **SSE Keep-Alive**: Sends keep-alive comments during continuation gaps to prevent socket timeouts.
+- **Rotating Proxy**: Supports unstable rotating proxies with automatic retries on connection failures.
+- **Message Normalization**: Handles Orchids.app's content array format and converts it to plain text.
+## Environment Variables
+| Variable | Description | Default |
+|---|---|---|
+| `PROXY_URL` | Rotating proxy URL (e.g. `http://user:pass@proxy.op.wtf:32424`) | `""` (direct) |
+Set these in HF Spaces > Settings > Variables and Secrets.
 ## Usage
 ### Chat Completions (non-streaming)
   }'
 ```
+### With Tool Calling
+```bash
+curl https://YOUR_SPACE.hf.space/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "anthropic/claude-haiku-4-5",
+    "messages": [{"role": "user", "content": "Create a file called hello.txt with hello world"}],
+    "tools": [{
+      "type": "function",
+      "function": {
+        "name": "Write",
+        "description": "Write content to a file",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "file_path": {"type": "string", "description": "Path to the file"},
+            "content": {"type": "string", "description": "Content to write"}
+          },
+          "required": ["file_path", "content"]
+        }
+      }
+    }]
+  }'
+```
 ### With OpenAI Python SDK
 ```python
 | Endpoint | Description |
 |---|---|
+| `POST /v1/chat/completions` | OpenAI-compatible chat completions (with tool calling) |
+| `POST /chat/completions` | Same, without /v1 prefix |
 | `GET /v1/models` | List available models |
 | `GET /health` | Health check |
 | `GET /debug/session` | Session debug info |
+| `GET /debug/refresh` | Force session refresh |

app.py CHANGED Viewed

@@ -2,9 +2,12 @@
 Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
 Deploy to Hugging Face Spaces (Docker SDK)
-Auto-continues when upstream hits the ~1K token output limit.
-Uses rotating proxy with aggressive retries for unstable IPs.
-Sends SSE keep-alive comments during continuation gaps.
 """
 import asyncio
@@ -21,7 +24,7 @@ from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
-app = FastAPI(title="Haiku API", version="4.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
@@ -70,10 +73,8 @@ class SessionState:
             if self.cookies and (now - self.last_refresh) < self.refresh_interval:
                 return
-            # Try multiple times with proxy rotation (new IP each request)
             for attempt in range(PROXY_MAX_RETRIES):
                 try:
-                    # Create fresh client for each attempt (gets new proxy IP)
                     if PROXY_URL and attempt > 0:
                         try:
                             await client.aclose()
@@ -119,7 +120,7 @@ class SessionState:
                     self.csrf_token = csrf
                     self.last_refresh = now
                     print(f"[Session] OK — CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())} (attempt {attempt+1})")
-                    return  # Success!
                 except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
                     print(f"[Session] Proxy error attempt #{attempt+1}: {type(e).__name__}: {e}")
@@ -150,37 +151,263 @@ async def shutdown():
         await http_client.aclose()
 # ── Message normalization ────────────────────────────────────────
-def normalize_messages(messages: list[dict]) -> list[dict]:
-    """Normalize messages: content arrays → plain text, strip extra fields."""
     result = []
-    for msg in messages:
-        role = msg.get("role", "user")
-        content = msg.get("content", "")
-        if isinstance(content, list):
-            text_parts = []
-            for part in content:
-                if isinstance(part, str):
-                    text_parts.append(part)
-                elif isinstance(part, dict):
-                    if part.get("type") == "text":
-                        text_parts.append(part.get("text", ""))
-            content = "\n".join(text_parts)
-        if content is None:
-            content = ""
-        content = str(content)
-        if role == "system" and not content.strip():
             continue
-        result.append({"role": role, "content": content})
     return result
 def _headers() -> dict:
     h = {
         "Accept": "*/*",
@@ -197,6 +424,7 @@ def _headers() -> dict:
 # ── Proxy-aware request with retry ──────────────────────────────
 async def _proxy_post(url: str, **kwargs) -> httpx.Response:
     """POST with proxy retry logic. Creates new client on each retry to get fresh IP."""
     global http_client
@@ -204,54 +432,26 @@ async def _proxy_post(url: str, **kwargs) -> httpx.Response:
     for attempt in range(PROXY_MAX_RETRIES):
         try:
             resp = await http_client.post(url, **kwargs)
-            # Proxy returned a non-connection error — return it
             return resp
         except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
             print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")
-            # Recreate client with new proxy IP
             if PROXY_URL:
                 try:
                     await http_client.aclose()
                 except:
                     pass
                 http_client = _make_client()
-                # Re-apply session cookies
                 await asyncio.sleep(PROXY_RETRY_DELAY)
             else:
                 await asyncio.sleep(2)
             continue
-    # All retries exhausted — return last attempt anyway
     return await http_client.post(url, **kwargs)
-async def _proxy_get(url: str, **kwargs) -> httpx.Response:
-    """GET with proxy retry logic."""
-    global http_client
-    for attempt in range(PROXY_MAX_RETRIES):
-        try:
-            resp = await http_client.get(url, **kwargs)
-            return resp
-        except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
-            print(f"[Proxy] GET error #{attempt+1}: {type(e).__name__}")
-            if PROXY_URL:
-                try:
-                    await http_client.aclose()
-                except:
-                    pass
-                http_client = _make_client()
-                await asyncio.sleep(PROXY_RETRY_DELAY)
-            else:
-                await asyncio.sleep(2)
-            continue
-    return await http_client.get(url, **kwargs)
 # ── Raw call with retries ───────────────────────────────────────
 async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
     """Make a single POST to chatgpt.org/api/chat with full retry logic."""
     await session.refresh(http_client)
@@ -268,7 +468,7 @@ async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
             )
             if resp.status_code == 419 and attempt == 0:
-                print("[Chat] 419 → refreshing session...")
                 session.last_refresh = 0
                 await session.refresh(http_client)
                 break
@@ -293,7 +493,8 @@ async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
 async def _stream_one_response(resp):
-    """Stream a single upstream SSE response in real-time."""
     finish_reason = None
     async for raw_line in resp.aiter_lines():
@@ -331,14 +532,15 @@ async def _stream_one_response(resp):
 # ── Streaming with auto-continue ────────────────────────────────
 MAX_CONTINUATIONS = 20
 async def _raw_call_streaming(messages: list[dict], model: str):
-    """Like _raw_call but yields SSE keep-alive comments during retries."""
     await session.refresh(http_client)
     payload = {"model": model, "messages": messages}
     for attempt in range(2):  # CSRF retry
         for rate_attempt in range(3):  # 429 retry
-            # Keep-alive before request
             yield ": thinking...\n\n"
             resp = await _proxy_post(
@@ -349,7 +551,7 @@ async def _raw_call_streaming(messages: list[dict], model: str):
             )
             if resp.status_code == 419 and attempt == 0:
-                print("[Chat] 419 → refreshing session...")
                 session.last_refresh = 0
                 await session.refresh(http_client)
                 break
@@ -376,8 +578,13 @@ async def _raw_call_streaming(messages: list[dict], model: str):
     raise HTTPException(500, "Failed after retry")
-async def _stream_with_auto_continue(messages: list[dict], model: str):
-    """Stream with real-time output, auto-continue, and keep-alive pings."""
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
     created = int(time.time())
     conversation = list(messages)
@@ -407,6 +614,132 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
             if text:
                 chunk_content += text
                 total_content += text
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
@@ -414,38 +747,46 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
                     "model": model,
                     "choices": [{
                         "index": 0,
-                        "delta": {"content": text},
-                        "finish_reason": None,
                     }],
                 })
                 yield f"data: {sse_data}\n\n"
-        print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
-        if finish_reason == "stop":
-            sse_data = json.dumps({
-                "id": chunk_id,
-                "object": "chat.completion.chunk",
-                "created": created,
-                "model": model,
-                "choices": [{
-                    "index": 0,
-                    "delta": {},
-                    "finish_reason": "stop",
-                }],
-            })
-            yield f"data: {sse_data}\n\n"
-            yield "data: [DONE]\n\n"
-            return
         yield ": continuing...\n\n"
-        conversation.append({"role": "assistant", "content": chunk_content})
-        conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
         print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
-    # Safety
     sse_data = json.dumps({
         "id": chunk_id,
         "object": "chat.completion.chunk",
@@ -462,8 +803,10 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
 # ── Non-streaming with auto-continue ────────────────────────────
-async def _collect_with_auto_continue(messages: list[dict], model: str) -> str:
-    """Collect the full response, auto-continuing if cut off."""
     conversation = list(messages)
     full_content = ""
@@ -482,16 +825,40 @@ async def _collect_with_auto_continue(messages: list[dict], model: str) -> str:
         full_content += content
         print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
         if finish_reason == "stop":
-            return full_content
-        conversation.append({"role": "assistant", "content": content})
-        conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
-    return full_content
 # ── OpenAI-compatible endpoint ──────────────────────────────────
 @app.post("/v1/chat/completions")
 @app.post("/chat/completions")
 async def chat_completions(request: Request):
@@ -506,18 +873,22 @@ async def chat_completions(request: Request):
     model = body.get("model", "anthropic/claude-haiku-4-5")
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
     if not messages_raw or not isinstance(messages_raw, list):
         raise HTTPException(400, "messages must be a non-empty array")
-    messages = normalize_messages(messages_raw)
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
     if stream:
         return StreamingResponse(
-            _stream_with_auto_continue(messages, model),
             media_type="text/event-stream",
             headers={
                 "Cache-Control": "no-cache",
@@ -526,22 +897,45 @@ async def chat_completions(request: Request):
             },
         )
     else:
-        full_text = await _collect_with_auto_continue(messages, model)
-        return JSONResponse({
-            "id": f"chatcmpl-{int(time.time())}",
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "model": model,
-            "choices": [{
-                "index": 0,
-                "message": {"role": "assistant", "content": full_text},
-                "finish_reason": "stop",
-            }],
-            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
-        })
 # ── Models / Health ─────────────────────────────────────────────
 @app.get("/v1/models")
 @app.get("/models")
 async def list_models():
@@ -557,8 +951,9 @@ async def list_models():
 async def root():
     return {
         "status": "ok",
-        "version": "4.0.0",
         "proxy": bool(PROXY_URL),
         "endpoints": ["/v1/chat/completions", "/v1/models"],
     }

 Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
 Deploy to Hugging Face Spaces (Docker SDK)
+Features:
+- Tool/function calling support (converts OpenAI tools → system prompt, parses output)
+- Auto-continues when upstream hits the ~1K token output limit
+- Rotating proxy with aggressive retries for unstable IPs
+- SSE keep-alive comments during continuation gaps
+- Message normalization for Orchids.app compatibility
 """
 import asyncio
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
+app = FastAPI(title="Haiku API", version="5.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
             if self.cookies and (now - self.last_refresh) < self.refresh_interval:
                 return
             for attempt in range(PROXY_MAX_RETRIES):
                 try:
                     if PROXY_URL and attempt > 0:
                         try:
                             await client.aclose()
                     self.csrf_token = csrf
                     self.last_refresh = now
                     print(f"[Session] OK — CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())} (attempt {attempt+1})")
+                    return
                 except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
                     print(f"[Session] Proxy error attempt #{attempt+1}: {type(e).__name__}: {e}")
         await http_client.aclose()
+# ── Tool Calling Support ─────────────────────────────────────────
def _dict_or_empty(value) -> dict:
    """Return value when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
    """Render OpenAI-style tool definitions as a system prompt.

    The prompt lists every tool with its parameters and tells the model to
    emit calls as <tool_call name="...">JSON</tool_call_> blocks.

    Args:
        tools: OpenAI ``tools`` array; each entry is expected to carry a
            ``function`` object with ``name``, ``description`` and a
            JSON-schema ``parameters`` object. Missing or null pieces are
            tolerated and rendered with placeholder text.
        tool_choice: OpenAI ``tool_choice`` value: ``"required"``,
            ``"none"``, a ``{"type": "function", "function": {"name": ...}}``
            dict, or anything else for no extra instruction.

    Returns:
        The system prompt text.
    """
    tool_sections = []
    for tool in tools or []:
        # Clients may send explicit nulls for optional schema fields, so
        # every nested lookup falls back to an empty container.
        func = _dict_or_empty(_dict_or_empty(tool).get("function"))
        name = func.get("name", "unknown")
        desc = func.get("description")
        if desc is None:
            desc = "No description"
        params = _dict_or_empty(func.get("parameters"))
        props = _dict_or_empty(params.get("properties"))
        required = params.get("required")
        if not isinstance(required, (list, tuple, set)):
            required = ()

        param_lines = []
        for pname, pdef in props.items():
            pdef = _dict_or_empty(pdef)
            ptype = pdef.get("type", "any")
            pdesc = pdef.get("description", "")
            req_flag = " (required)" if pname in required else " (optional)"
            param_lines.append(f"  - {pname}: {ptype}{req_flag} — {pdesc}")
        params_text = "\n".join(param_lines) if param_lines else "  (no parameters)"
        tool_sections.append(f"### {name}\n{desc}\nParameters:\n{params_text}")
    tools_text = "\n\n".join(tool_sections)

    # Translate tool_choice into an extra instruction appended to the prompt.
    choice_instruction = ""
    if tool_choice == "required":
        choice_instruction = "\nIMPORTANT: You MUST call at least one tool. Do not respond with just text."
    elif tool_choice == "none":
        # Shouldn't reach here since we skip tool injection for "none"
        choice_instruction = "\nDo NOT call any tools. Respond with text only."
    elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
        fname = _dict_or_empty(tool_choice.get("function")).get("name", "")
        choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."

    # Doubled braces are literal braces inside the f-string.
    return f"""# Available Tools
You have access to the following tools that you can call:
{tools_text}
## Tool Call Format
When you want to call a tool, you MUST use EXACTLY this XML format — one block per tool call:
<tool_call name="FUNCTION_NAME">
{{"param1": "value1", "param2": "value2"}}
</tool_call_>
Example — calling the Write tool:
<tool_call name="Write">
{{"file_path": "hello.txt", "content": "hello world"}}
</tool_call_>
## Rules
- You may call multiple tools by using multiple <tool_call_> blocks in sequence
- The arguments inside the block MUST be valid JSON matching the tool's parameter schema
- If you need to call a tool, output ONLY <tool_call_> blocks — no explanatory text before or after
- If you don't need to call any tools, just respond normally with text (no <tool_call_> blocks)
- Do NOT wrap <tool_call_> blocks in markdown code blocks or any other formatting
{choice_instruction}"""
# Matches complete <tool_call name="...">...</tool_call_> blocks. The closing
# tag also accepts a plain </tool_call>, because a model may mirror the
# opening tag instead of reproducing the trailing underscore.
_TOOL_CALL_RE = re.compile(
    r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_?>',
    re.DOTALL,
)
# Matches a tool call that was opened but runs to the end of the text
# without a closing tag (for auto-continue detection).
_INCOMPLETE_TOOL_CALL_RE = re.compile(
    r'<tool_call\s+name="([^"]+)">\s*(.*?)$',
    re.DOTALL,
)


def _coerce_tool_arguments(args_str: str) -> str:
    """Return args_str as a JSON-encoded object string.

    Tries the text as-is, then with surrounding markdown code fences and a
    leading ``json`` language tag removed. Anything that still does not
    decode to a JSON object is wrapped as ``{"raw_input": <original text>}``
    so that callers always receive an object.
    """
    unfenced = args_str.strip('`').strip()
    if unfenced.startswith('json'):
        unfenced = unfenced[4:].strip()
    for candidate in (args_str, unfenced):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return json.dumps(parsed)
    # Last resort: hand the raw text over as a single argument.
    return json.dumps({"raw_input": args_str})


def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
    """Parse tool calls from model text output.

    Args:
        text: Raw model output that may contain tool-call blocks.

    Returns:
        ``(tool_calls, remaining_text)``. ``tool_calls`` is a list of
        OpenAI-format entries (``id``, ``type``, ``function.name``,
        ``function.arguments`` as a JSON string). ``remaining_text`` is the
        text outside the blocks, each fragment stripped and joined with
        newlines. When no block is found the result is ``([], text)``.
    """
    matches = list(_TOOL_CALL_RE.finditer(text))
    if not matches:
        return [], text

    tool_calls = []
    remaining_parts = []
    last_end = 0
    for match in matches:
        # Keep any prose that sits between the previous block and this one.
        before = text[last_end:match.start()].strip()
        if before:
            remaining_parts.append(before)
        last_end = match.end()
        tool_calls.append({
            "id": f"call_{uuid.uuid4().hex[:24]}",
            "type": "function",
            "function": {
                "name": match.group(1),
                "arguments": _coerce_tool_arguments(match.group(2).strip()),
            },
        })

    # Text after the last tool call.
    after = text[last_end:].strip()
    if after:
        remaining_parts.append(after)
    return tool_calls, "\n".join(remaining_parts)
def _has_incomplete_tool_call(text: str) -> bool:
    """Return True when text opens more tool-call blocks than it closes.

    Counts <tool_call name="..."> opening tags against closing tags. Both
    </tool_call_> and a plain </tool_call> count as a close, so a block the
    model closed by mirroring the opening tag is not reported as unfinished.
    """
    opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
    closes = len(re.findall(r'</tool_call_?>', text))
    return opens > closes
 # ── Message normalization ────────────────────────────────────────
def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choice=None) -> list[dict]:
    """Normalize a chat history and fold tool definitions into the system prompt.

    Every message goes through _normalize_one_message, except the first
    system message when a tool prompt is active: that one is flattened to
    text and has the tool prompt appended after a blank line (or is replaced
    by the tool prompt when it has no text). If no system message exists,
    the tool prompt is inserted as the first message. System messages left
    with blank content are dropped from the result.

    The tool prompt is built only when tools is non-empty and tool_choice is
    not "none".
    """
    tool_prompt = None
    if tools and tool_choice != "none":
        tool_prompt = _build_tool_system_prompt(tools, tool_choice)

    normalized = []
    prompt_merged = False
    for message in messages:
        is_first_system = message.get("role", "user") == "system" and not prompt_merged
        if not (is_first_system and tool_prompt):
            normalized.append(_normalize_one_message(message))
            continue

        # First system message: merge the caller's text with the tool prompt.
        system_text = message.get("content", "")
        if isinstance(system_text, list):
            system_text = _flatten_content_array(system_text)
        system_text = str(system_text) if system_text else ""
        if system_text.strip():
            merged = system_text + "\n\n" + tool_prompt
        else:
            merged = tool_prompt
        normalized.append({"role": "system", "content": merged})
        prompt_merged = True

    # No system message to merge into: the tool prompt becomes the first one.
    if tool_prompt and not prompt_merged:
        normalized.insert(0, {"role": "system", "content": tool_prompt})

    # Drop system messages whose content is empty or whitespace only.
    return [
        entry for entry in normalized
        if entry.get("role") != "system" or entry.get("content", "").strip()
    ]
def _flatten_content_array(content: list) -> str:
    """Convert a content array to plain text.

    Plain strings and ``{"type": "text", "text": ...}`` parts are kept, one
    per line; every other part (other types, non-dict values) is ignored.
    A text part whose ``text`` is missing or null contributes an empty line
    instead of breaking the join, and non-string text is stringified.
    """
    text_parts = []
    for part in content:
        if isinstance(part, str):
            text_parts.append(part)
        elif isinstance(part, dict) and part.get("type") == "text":
            text = part.get("text")
            text_parts.append("" if text is None else str(text))
    return "\n".join(text_parts)
def _normalize_one_message(msg: dict) -> dict:
    """Normalize a single message to a plain ``{"role", "content"}`` dict.

    - Content arrays are flattened to text; null content becomes "".
    - ``tool`` messages become ``user`` messages carrying the tool result,
      labelled with the tool name and call id.
    - ``assistant`` messages with ``tool_calls`` become text in which each
      call is rendered as a <tool_call name="...">...</tool_call_> block,
      after any regular content.
    - System messages with blank content are returned with "" as content.

    Args:
        msg: An OpenAI-style chat message.

    Returns:
        A dict with exactly the keys ``role`` and ``content``.
    """
    role = msg.get("role", "user")
    content = msg.get("content", "")
    if isinstance(content, list):
        content = _flatten_content_array(content)
    if content is None:
        content = ""
    content = str(content)

    # Tool results are replayed as user turns.
    if role == "tool":
        tool_name = msg.get("name", "unknown_tool")
        tool_call_id = msg.get("tool_call_id", "")
        return {
            "role": "user",
            "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
        }

    # Earlier tool calls are replayed as tool-call blocks in the text.
    if role == "assistant" and msg.get("tool_calls"):
        parts = [content] if content.strip() else []
        for tc in msg["tool_calls"]:
            func = tc.get("function") or {}
            name = func.get("name", "unknown")
            args = func.get("arguments", "{}")
            if isinstance(args, dict):
                # Some clients send already-decoded arguments; keep them
                # rather than discarding the call's input.
                args = json.dumps(args)
            else:
                try:
                    json.loads(args)
                except (json.JSONDecodeError, TypeError):
                    args = "{}"
            parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
        return {"role": "assistant", "content": "\n\n".join(parts)}

    # Blank system messages are returned with empty content.
    if role == "system" and not content.strip():
        return {"role": "system", "content": ""}
    return {"role": role, "content": content}
+# ── Headers ──────────────────────────────────────────────────────
 def _headers() -> dict:
     h = {
         "Accept": "*/*",
 # ── Proxy-aware request with retry ──────────────────────────────
 async def _proxy_post(url: str, **kwargs) -> httpx.Response:
     """POST through the shared client, retrying on connection-level failures.

     Makes up to PROXY_MAX_RETRIES attempts. A connect, proxy or timeout
     error triggers a retry; when PROXY_URL is set the shared client is
     closed and rebuilt with _make_client() before waiting
     PROXY_RETRY_DELAY seconds, otherwise the wait is 2 seconds. Any HTTP
     response, whatever its status, is returned to the caller untouched.

     Args:
         url: Target URL.
         **kwargs: Passed straight through to ``http_client.post``.

     Returns:
         The upstream response.

     Raises:
         httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException: if
             the final attempt made after the retry loop fails as well.
     """
     global http_client
     for attempt in range(PROXY_MAX_RETRIES):
         try:
             return await http_client.post(url, **kwargs)
         except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
             print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")
             if PROXY_URL:
                 try:
                     await http_client.aclose()
                 except Exception:
                     # Closing a broken client is best effort. Catching only
                     # Exception lets task cancellation and interpreter
                     # shutdown propagate instead of being swallowed.
                     pass
                 http_client = _make_client()
                 await asyncio.sleep(PROXY_RETRY_DELAY)
             else:
                 await asyncio.sleep(2)
     # Retries exhausted: one last attempt whose error, if any, reaches the caller.
     return await http_client.post(url, **kwargs)
 # ── Raw call with retries ───────────────────────────────────────
 async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
     """Make a single POST to chatgpt.org/api/chat with full retry logic."""
     await session.refresh(http_client)
             )
             if resp.status_code == 419 and attempt == 0:
+                print("[Chat] 419 -> refreshing session...")
                 session.last_refresh = 0
                 await session.refresh(http_client)
                 break
 async def _stream_one_response(resp):
+    """Stream a single upstream SSE response in real-time.
+    Yields (text, finish_reason) tuples. finish_reason is None for text chunks."""
     finish_reason = None
     async for raw_line in resp.aiter_lines():
 # ── Streaming with auto-continue ────────────────────────────────
 MAX_CONTINUATIONS = 20
 async def _raw_call_streaming(messages: list[dict], model: str):
+    """Like _raw_call but yields SSE keep-alive comments during retries,
+    then yields the httpx.Response object."""
     await session.refresh(http_client)
     payload = {"model": model, "messages": messages}
     for attempt in range(2):  # CSRF retry
         for rate_attempt in range(3):  # 429 retry
             yield ": thinking...\n\n"
             resp = await _proxy_post(
             )
             if resp.status_code == 419 and attempt == 0:
+                print("[Chat] 419 -> refreshing session...")
                 session.last_refresh = 0
                 await session.refresh(http_client)
                 break
     raise HTTPException(500, "Failed after retry")
+async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False):
+    """Stream with real-time output, auto-continue, and keep-alive pings.
+    When has_tools is True, we buffer the full response to properly detect
+    and format tool calls, sending keep-alive pings while buffering.
+    When has_tools is False, we stream text in real-time.
+    """
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
     created = int(time.time())
     conversation = list(messages)
             if text:
                 chunk_content += text
                 total_content += text
+                # If no tools, stream text in real-time
+                if not has_tools:
+                    sse_data = json.dumps({
+                        "id": chunk_id,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "delta": {"content": text},
+                            "finish_reason": None,
+                        }],
+                    })
+                    yield f"data: {sse_data}\n\n"
+        print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
+        # Check for tool calls
+        if has_tools:
+            tool_calls, remaining_text = _parse_tool_calls(total_content)
+            if tool_calls:
+                # Emit tool calls as OpenAI streaming chunks
+                for i, tc in enumerate(tool_calls):
+                    # First chunk: role + tool_call with id, name, and start of arguments
+                    sse_start = json.dumps({
+                        "id": chunk_id,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "delta": {
+                                "role": "assistant",
+                                "tool_calls": [{
+                                    "index": i,
+                                    "id": tc["id"],
+                                    "type": "function",
+                                    "function": {
+                                        "name": tc["function"]["name"],
+                                        "arguments": "",
+                                    }
+                                }]
+                            },
+                            "finish_reason": None,
+                        }],
+                    })
+                    yield f"data: {sse_start}\n\n"
+                    # Argument chunks — split into small pieces for streaming feel
+                    args = tc["function"]["arguments"]
+                    chunk_size = max(1, len(args) // 3)
+                    for offset in range(0, len(args), chunk_size):
+                        arg_piece = args[offset:offset + chunk_size]
+                        sse_arg = json.dumps({
+                            "id": chunk_id,
+                            "object": "chat.completion.chunk",
+                            "created": created,
+                            "model": model,
+                            "choices": [{
+                                "index": 0,
+                                "delta": {
+                                    "tool_calls": [{
+                                        "index": i,
+                                        "function": {
+                                            "arguments": arg_piece,
+                                        }
+                                    }]
+                                },
+                                "finish_reason": None,
+                            }],
+                        })
+                        yield f"data: {sse_arg}\n\n"
+                # If there's remaining text alongside tool calls, emit it too
+                if remaining_text.strip():
+                    sse_text = json.dumps({
+                        "id": chunk_id,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "delta": {"content": remaining_text},
+                            "finish_reason": None,
+                        }],
+                    })
+                    yield f"data: {sse_text}\n\n"
+                # Final chunk with finish_reason
+                sse_done = json.dumps({
+                    "id": chunk_id,
+                    "object": "chat.completion.chunk",
+                    "created": created,
+                    "model": model,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "tool_calls",
+                    }],
+                })
+                yield f"data: {sse_done}\n\n"
+                yield "data: [DONE]\n\n"
+                return
+            # No tool calls found — if text is complete, stream it as content
+            if finish_reason == "stop":
+                # Stream the buffered text content as chunks
+                text_to_stream = total_content
+                chunk_sz = 50  # characters per streaming chunk
+                for offset in range(0, len(text_to_stream), chunk_sz):
+                    piece = text_to_stream[offset:offset + chunk_sz]
+                    sse_data = json.dumps({
+                        "id": chunk_id,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "delta": {"content": piece},
+                            "finish_reason": None,
+                        }],
+                    })
+                    yield f"data: {sse_data}\n\n"
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
                     "model": model,
                     "choices": [{
                         "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
                     }],
                 })
                 yield f"data: {sse_data}\n\n"
+                yield "data: [DONE]\n\n"
+                return
+        else:
+            # No tools — original behavior
+            if finish_reason == "stop":
+                sse_data = json.dumps({
+                    "id": chunk_id,
+                    "object": "chat.completion.chunk",
+                    "created": created,
+                    "model": model,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }],
+                })
+                yield f"data: {sse_data}\n\n"
+                yield "data: [DONE]\n\n"
+                return
+        # Auto-continue for length-limited responses
         yield ": continuing...\n\n"
+        # Check if we're in the middle of a tool call
+        if _has_incomplete_tool_call(chunk_content):
+            conversation.append({"role": "assistant", "content": chunk_content})
+            conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
+        else:
+            conversation.append({"role": "assistant", "content": chunk_content})
+            conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
         print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
+    # Safety: max continuations reached
     sse_data = json.dumps({
         "id": chunk_id,
         "object": "chat.completion.chunk",
 # ── Non-streaming with auto-continue ────────────────────────────
+async def _collect_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False) -> dict:
+    """Collect the full response, auto-continuing if cut off.
+    Returns a dict with either 'content' or 'tool_calls' key."""
     # Every return path visible below builds a dict carrying BOTH keys,
     # 'content' and 'tool_calls'; the one that does not apply may be None.
     # Shallow copy: continuation turns are appended to `conversation`, not to
     # the caller's `messages` list.
     conversation = list(messages)
     full_content = ""
     # NOTE(review): this diff view elides the lines that belong here. The names
     # `content`, `finish_reason` and `cont_num` used below are bound in code
     # that is not shown (presumably a bounded retry loop -- confirm in app.py).
         full_content += content
         print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
+        # Check for tool calls if tools were provided
+        if has_tools:
         # Parse the text accumulated across all continuations so far, not just
         # the latest piece.
+            tool_calls, remaining_text = _parse_tool_calls(full_content)
+            if tool_calls:
         # Whitespace-only leftover text is reported as None rather than "".
+                result = {
+                    "tool_calls": tool_calls,
+                    "content": remaining_text if remaining_text.strip() else None,
+                }
+                # If there are incomplete tool calls, continue
         # Keep going only when an unfinished tool-call block remains AND the
         # piece ended with finish_reason "length"; otherwise return what was
         # parsed. On fall-through `result` is simply discarded.
+                if _has_incomplete_tool_call(full_content) and finish_reason == "length":
+                    pass  # fall through to auto-continue
+                else:
+                    return result
         if finish_reason == "stop":
         # NOTE(review): both branches below return the same dict, so the
         # has_tools test has no effect on the result here.
+            if has_tools:
+                # No tool calls found, return as text
+                return {"content": full_content, "tool_calls": None}
+            return {"content": full_content, "tool_calls": None}
+        # Auto-continue
         # The assistant's partial output is echoed back, followed by a user turn
         # asking it to resume; only the wording of that user turn differs.
         # NOTE(review): this check inspects only the latest `content`, whereas
         # the check above inspects `full_content` -- confirm that is intended.
+        if _has_incomplete_tool_call(content):
+            conversation.append({"role": "assistant", "content": content})
+            conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
+        else:
+            conversation.append({"role": "assistant", "content": content})
+            conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
     # Fallback when no return was hit above: hand back whatever text was
     # gathered, as plain content with no tool calls.
+    return {"content": full_content, "tool_calls": None}
 # ── OpenAI-compatible endpoint ──────────────────────────────────
 @app.post("/v1/chat/completions")
 @app.post("/chat/completions")
 async def chat_completions(request: Request):
     # Chat-completions handler registered under both route paths above.
     # NOTE(review): `body` is bound in a line that this diff view elides
     # (presumably the parsed JSON request body -- confirm in app.py).
     model = body.get("model", "anthropic/claude-haiku-4-5")
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
     # `or None` collapses any falsy value (missing key, empty list) to None.
+    tools = body.get("tools") or None
+    tool_choice = body.get("tool_choice", "auto")
     if not messages_raw or not isinstance(messages_raw, list):
         raise HTTPException(400, "messages must be a non-empty array")
     # Tool handling is active only when tools were supplied and the caller did
     # not pass the string "none" as tool_choice.
+    has_tools = bool(tools) and tool_choice != "none"
     # normalize_messages is defined outside this view; it is given the raw
     # messages together with the tool definitions and tool_choice.
+    messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice)
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
     if stream:
         # Streaming: hand the async generator to the response as an SSE body.
         return StreamingResponse(
+            _stream_with_auto_continue(messages, model, has_tools=has_tools),
             media_type="text/event-stream",
             headers={
                 "Cache-Control": "no-cache",
             },
         )
     else:
         # Non-streaming: gather the whole reply first, then build one JSON body.
+        result = await _collect_with_auto_continue(messages, model, has_tools=has_tools)
+        tool_calls = result.get("tool_calls")
+        content = result.get("content")
         # The response id is built from a whole-second timestamp, so responses
         # produced within the same second share an id.
         # `usage` is always reported as zeros; no token counts are computed here.
+        if tool_calls:
             # Tool-call reply: `content` may be None alongside the tool_calls list.
+            return JSONResponse({
+                "id": f"chatcmpl-{int(time.time())}",
+                "object": "chat.completion",
+                "created": int(time.time()),
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": content,
+                        "tool_calls": tool_calls,
+                    },
+                    "finish_reason": "tool_calls",
+                }],
+                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+            })
+        else:
             # Plain-text reply: a None/empty content is sent as "" and the
             # finish_reason is always reported as "stop".
+            return JSONResponse({
+                "id": f"chatcmpl-{int(time.time())}",
+                "object": "chat.completion",
+                "created": int(time.time()),
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "message": {"role": "assistant", "content": content or ""},
+                    "finish_reason": "stop",
+                }],
+                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+            })
 # ── Models / Health ─────────────────────────────────────────────
 @app.get("/v1/models")
 @app.get("/models")
 async def list_models():
 async def root():
     # Returns a small static status document describing this service.
     # NOTE(review): the route decorator for this handler is not visible in this
     # diff view; confirm the path it is mounted on in app.py.
     return {
         "status": "ok",
+        "version": "5.0.0",
         # True when PROXY_URL is non-empty; the URL itself is not exposed.
         "proxy": bool(PROXY_URL),
+        "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],
     }