Spaces:

overwrite69
/

haiku-api

Sleeping

App Files Files Community

overwrite69 committed 8 days ago

Commit

f5092e2

verified ·

1 Parent(s): 3d49d68

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +205 -325

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
 Deploy to Hugging Face Spaces (Docker SDK)
 Features:
-- Tool/function calling support (converts OpenAI tools → system prompt, parses output)
 - Auto-continues when upstream hits the ~1K token output limit
 - Rotating proxy with aggressive retries for unstable IPs
 - SSE keep-alive comments during continuation gaps
@@ -24,7 +24,7 @@ from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
-app = FastAPI(title="Haiku API", version="5.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
@@ -152,81 +152,16 @@ async def shutdown():
 # ── Tool Calling Support ─────────────────────────────────────────
-def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
-    """Convert OpenAI tools format to a system prompt that instructs Claude
-    to output tool calls in a parseable format."""
-    tools_desc = []
-    for tool in tools:
-        func = tool.get("function", {})
-        name = func.get("name", "unknown")
-        desc = func.get("description", "No description")
-        params = func.get("parameters", {})
-        # Format parameters nicely
-        props = params.get("properties", {})
-        required = params.get("required", [])
-        param_lines = []
-        for pname, pdef in props.items():
-            ptype = pdef.get("type", "any")
-            pdesc = pdef.get("description", "")
-            req_flag = " (required)" if pname in required else " (optional)"
-            param_lines.append(f"  - {pname}: {ptype}{req_flag} — {pdesc}")
-        params_text = "\n".join(param_lines) if param_lines else "  (no parameters)"
-        tools_desc.append(f"### {name}\n{desc}\nParameters:\n{params_text}")
-    tools_text = "\n\n".join(tools_desc)
-    # Handle tool_choice
-    choice_instruction = ""
-    if tool_choice == "required":
-        choice_instruction = "\nIMPORTANT: You MUST call at least one tool. Do not respond with just text."
-    elif tool_choice == "none":
-        # Shouldn't reach here since we skip tool injection for "none"
-        choice_instruction = "\nDo NOT call any tools. Respond with text only."
-    elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
-        fname = tool_choice.get("function", {}).get("name", "")
-        choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
-    return f"""# Available Tools
-You have access to the following tools that you can call:
-{tools_text}
-## Tool Call Format
-When you want to call a tool, you MUST use EXACTLY this XML format — one block per tool call:
-<tool_call name="FUNCTION_NAME">
-{"{"}"param1": "value1", "param2": "value2"{"}"}
-</tool_call_>
-Example — calling the Write tool:
-<tool_call name="Write">
-{"{"}"file_path": "hello.txt", "content": "hello world"{"}"}
-</tool_call_>
-## Rules
-- You may call multiple tools by using multiple <tool_call_> blocks in sequence
-- The arguments inside the block MUST be valid JSON matching the tool's parameter schema
-- If you need to call a tool, output ONLY <tool_call_> blocks — no explanatory text before or after
-- If you don't need to call any tools, just respond normally with text (no <tool_call_> blocks)
-- Do NOT wrap <tool_call_> blocks in markdown code blocks or any other formatting
-{choice_instruction}"""
 # Regex to parse <tool_call name="...">...</tool_call_> blocks
 _TOOL_CALL_RE = re.compile(
     r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
     re.DOTALL
 )
-# Also try matching incomplete tool calls (for auto-continue detection)
 _INCOMPLETE_TOOL_CALL_RE = re.compile(
-    r'<tool_call\s+name="([^"]+)">\s*(.*?)$',
     re.DOTALL
 )
@@ -243,7 +178,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
         return [], text
     tool_calls = []
-    # Collect text outside of tool call blocks
     remaining_parts = []
     last_end = 0
@@ -264,7 +198,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
             args_final = json.dumps(args_json)
         except json.JSONDecodeError:
             # Try to fix common issues
-            # Sometimes Claude wraps args in markdown code block
             args_cleaned = args_str.strip('`').strip()
             if args_cleaned.startswith('json'):
                 args_cleaned = args_cleaned[4:].strip()
@@ -295,54 +228,18 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
 def _has_incomplete_tool_call(text: str) -> bool:
-    """Check if text has an opening <tool_call_>> tag without a matching close."""
     opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
     closes = len(re.findall(r'</tool_call_>', text))
     return opens > closes
-# ── Message normalization ────────────────────────────────────────
-def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choice=None) -> list[dict]:
-    """Normalize messages: handle content arrays, tool roles, tool_calls,
-    and inject tool definitions into system prompt if tools are provided."""
-    result = []
-    # If tools provided and tool_choice != "none", inject tool system prompt
-    inject_tools = tools and tool_choice != "none"
-    if inject_tools:
-        tool_system = _build_tool_system_prompt(tools, tool_choice)
-    else:
-        tool_system = None
-    system_injected = False
-    for msg in messages:
-        role = msg.get("role", "user")
-        # Inject tool system prompt before or as the first system message
-        if role == "system" and not system_injected and tool_system:
-            content = msg.get("content", "")
-            if isinstance(content, list):
-                content = _flatten_content_array(content)
-            content = str(content) if content else ""
-            combined = content + "\n\n" + tool_system if content.strip() else tool_system
-            result.append({"role": "system", "content": combined})
-            system_injected = True
-            continue
-        result.append(_normalize_one_message(msg))
-    # If no system message existed, add tool system prompt as first message
-    if tool_system and not system_injected:
-        result.insert(0, {"role": "system", "content": tool_system})
-    # Filter out empty system messages
-    result = [m for m in result if not (m.get("role") == "system" and not m.get("content", "").strip())]
-    return result
 def _flatten_content_array(content: list) -> str:
     """Convert a content array to plain text."""
@@ -356,54 +253,60 @@ def _flatten_content_array(content: list) -> str:
     return "\n".join(text_parts)
-def _normalize_one_message(msg: dict) -> dict:
-    """Normalize a single message for chatgpt.org API."""
-    role = msg.get("role", "user")
-    content = msg.get("content", "")
-    # Handle content arrays → plain text
-    if isinstance(content, list):
-        content = _flatten_content_array(content)
-    if content is None:
-        content = ""
-    content = str(content)
-    # Handle tool role messages → convert to user message with tool result
-    if role == "tool":
-        tool_name = msg.get("name", "unknown_tool")
-        tool_call_id = msg.get("tool_call_id", "")
-        return {
-            "role": "user",
-            "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
-        }
-    # Handle assistant messages with tool_calls → text with <tool_call_> blocks
-    if role == "assistant" and msg.get("tool_calls"):
-        parts = []
-        regular_content = content if content and content.strip() else ""
-        if regular_content:
-            parts.append(regular_content)
-        for tc in msg["tool_calls"]:
-            func = tc.get("function", {})
-            name = func.get("name", "unknown")
-            args = func.get("arguments", "{}")
-            # Validate args is valid JSON
-            try:
-                json.loads(args)
-            except (json.JSONDecodeError, TypeError):
-                args = "{}"
-            parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
-        return {"role": "assistant", "content": "\n\n".join(parts)}
-    # System messages with empty content get filtered out later
-    if role == "system" and not content.strip():
-        return {"role": "system", "content": ""}
-    return {"role": role, "content": content}
 # ── Headers ──────────────────────────────────────────────────────
@@ -578,12 +481,100 @@ async def _raw_call_streaming(messages: list[dict], model: str):
     raise HTTPException(500, "Failed after retry")
-async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False):
     """Stream with real-time output, auto-continue, and keep-alive pings.
-    When has_tools is True, we buffer the full response to properly detect
-    and format tool calls, sending keep-alive pings while buffering.
-    When has_tools is False, we stream text in real-time.
     """
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
     created = int(time.time())
@@ -591,6 +582,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
         yield ": thinking...\n\n"
         resp = None
@@ -606,6 +598,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
         finish_reason = "stop"
         chunk_content = ""
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
                 finish_reason = fr
@@ -615,131 +608,27 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
                 chunk_content += text
                 total_content += text
-                # If no tools, stream text in real-time
-                if not has_tools:
-                    sse_data = json.dumps({
-                        "id": chunk_id,
-                        "object": "chat.completion.chunk",
-                        "created": created,
-                        "model": model,
-                        "choices": [{
-                            "index": 0,
-                            "delta": {"content": text},
-                            "finish_reason": None,
-                        }],
-                    })
-                    yield f"data: {sse_data}\n\n"
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
-        # Check for tool calls
-        if has_tools:
-            tool_calls, remaining_text = _parse_tool_calls(total_content)
-            if tool_calls:
-                # Emit tool calls as OpenAI streaming chunks
-                for i, tc in enumerate(tool_calls):
-                    # First chunk: role + tool_call with id, name, and start of arguments
-                    sse_start = json.dumps({
-                        "id": chunk_id,
-                        "object": "chat.completion.chunk",
-                        "created": created,
-                        "model": model,
-                        "choices": [{
-                            "index": 0,
-                            "delta": {
-                                "role": "assistant",
-                                "tool_calls": [{
-                                    "index": i,
-                                    "id": tc["id"],
-                                    "type": "function",
-                                    "function": {
-                                        "name": tc["function"]["name"],
-                                        "arguments": "",
-                                    }
-                                }]
-                            },
-                            "finish_reason": None,
-                        }],
-                    })
-                    yield f"data: {sse_start}\n\n"
-                    # Argument chunks — split into small pieces for streaming feel
-                    args = tc["function"]["arguments"]
-                    chunk_size = max(1, len(args) // 3)
-                    for offset in range(0, len(args), chunk_size):
-                        arg_piece = args[offset:offset + chunk_size]
-                        sse_arg = json.dumps({
-                            "id": chunk_id,
-                            "object": "chat.completion.chunk",
-                            "created": created,
-                            "model": model,
-                            "choices": [{
-                                "index": 0,
-                                "delta": {
-                                    "tool_calls": [{
-                                        "index": i,
-                                        "function": {
-                                            "arguments": arg_piece,
-                                        }
-                                    }]
-                                },
-                                "finish_reason": None,
-                            }],
-                        })
-                        yield f"data: {sse_arg}\n\n"
-                # If there's remaining text alongside tool calls, emit it too
-                if remaining_text.strip():
-                    sse_text = json.dumps({
-                        "id": chunk_id,
-                        "object": "chat.completion.chunk",
-                        "created": created,
-                        "model": model,
-                        "choices": [{
-                            "index": 0,
-                            "delta": {"content": remaining_text},
-                            "finish_reason": None,
-                        }],
-                    })
-                    yield f"data: {sse_text}\n\n"
-                # Final chunk with finish_reason
-                sse_done = json.dumps({
-                    "id": chunk_id,
-                    "object": "chat.completion.chunk",
-                    "created": created,
-                    "model": model,
-                    "choices": [{
-                        "index": 0,
-                        "delta": {},
-                        "finish_reason": "tool_calls",
-                    }],
-                })
-                yield f"data: {sse_done}\n\n"
-                yield "data: [DONE]\n\n"
-                return
-            # No tool calls found — if text is complete, stream it as content
-            if finish_reason == "stop":
-                # Stream the buffered text content as chunks
-                text_to_stream = total_content
-                chunk_sz = 50  # characters per streaming chunk
-                for offset in range(0, len(text_to_stream), chunk_sz):
-                    piece = text_to_stream[offset:offset + chunk_sz]
-                    sse_data = json.dumps({
-                        "id": chunk_id,
-                        "object": "chat.completion.chunk",
-                        "created": created,
-                        "model": model,
-                        "choices": [{
-                            "index": 0,
-                            "delta": {"content": piece},
-                            "finish_reason": None,
-                        }],
-                    })
-                    yield f"data: {sse_data}\n\n"
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
@@ -747,36 +636,31 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
                     "model": model,
                     "choices": [{
                         "index": 0,
-                        "delta": {},
-                        "finish_reason": "stop",
                     }],
                 })
                 yield f"data: {sse_data}\n\n"
-                yield "data: [DONE]\n\n"
-                return
-        else:
-            # No tools — original behavior
-            if finish_reason == "stop":
-                sse_data = json.dumps({
-                    "id": chunk_id,
-                    "object": "chat.completion.chunk",
-                    "created": created,
-                    "model": model,
-                    "choices": [{
-                        "index": 0,
-                        "delta": {},
-                        "finish_reason": "stop",
-                    }],
-                })
-                yield f"data: {sse_data}\n\n"
-                yield "data: [DONE]\n\n"
-                return
         # Auto-continue for length-limited responses
         yield ": continuing...\n\n"
-        # Check if we're in the middle of a tool call
         if _has_incomplete_tool_call(chunk_content):
             conversation.append({"role": "assistant", "content": chunk_content})
             conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
@@ -804,9 +688,9 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
 # ── Non-streaming with auto-continue ────────────────────────────
-async def _collect_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False) -> dict:
     """Collect the full response, auto-continuing if cut off.
-    Returns a dict with either 'content' or 'tool_calls' key."""
     conversation = list(messages)
     full_content = ""
@@ -825,25 +709,20 @@ async def _collect_with_auto_continue(messages: list[dict], model: str, has_tool
         full_content += content
         print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
-        # Check for tool calls if tools were provided
-        if has_tools:
-            tool_calls, remaining_text = _parse_tool_calls(full_content)
-            if tool_calls:
-                result = {
                     "tool_calls": tool_calls,
                     "content": remaining_text if remaining_text.strip() else None,
                 }
-                # If there are incomplete tool calls, continue
-                if _has_incomplete_tool_call(full_content) and finish_reason == "length":
-                    pass  # fall through to auto-continue
-                else:
-                    return result
         if finish_reason == "stop":
-            if has_tools:
-                # No tool calls found, return as text
-                return {"content": full_content, "tool_calls": None}
             return {"content": full_content, "tool_calls": None}
         # Auto-continue
@@ -873,22 +752,23 @@ async def chat_completions(request: Request):
     model = body.get("model", "anthropic/claude-haiku-4-5")
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
-    tools = body.get("tools") or None
-    tool_choice = body.get("tool_choice", "auto")
     if not messages_raw or not isinstance(messages_raw, list):
         raise HTTPException(400, "messages must be a non-empty array")
-    has_tools = bool(tools) and tool_choice != "none"
-    messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice)
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
     if stream:
         return StreamingResponse(
-            _stream_with_auto_continue(messages, model, has_tools=has_tools),
             media_type="text/event-stream",
             headers={
                 "Cache-Control": "no-cache",
@@ -897,7 +777,7 @@ async def chat_completions(request: Request):
             },
         )
     else:
-        result = await _collect_with_auto_continue(messages, model, has_tools=has_tools)
         tool_calls = result.get("tool_calls")
         content = result.get("content")
@@ -951,7 +831,7 @@ async def list_models():
 async def root():
     return {
         "status": "ok",
-        "version": "5.0.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],

 Deploy to Hugging Face Spaces (Docker SDK)
 Features:
+- Tool/function calling support (always detects <tool_call_> tags in output)
 - Auto-continues when upstream hits the ~1K token output limit
 - Rotating proxy with aggressive retries for unstable IPs
 - SSE keep-alive comments during continuation gaps
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
+app = FastAPI(title="Haiku API", version="6.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
 # ── Tool Calling Support ─────────────────────────────────────────
 # Regex to parse <tool_call name="...">...</tool_call_> blocks
+# Matches <tool_call name="X">JSON</tool_call_>; whitespace before name= and around the JSON payload is tolerated
 _TOOL_CALL_RE = re.compile(
     r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
     re.DOTALL
 )
+# Also match incomplete tool calls (for auto-continue detection)
 _INCOMPLETE_TOOL_CALL_RE = re.compile(
+    r'<tool_call\s+name="[^"]+">\s*(.*?)$',
     re.DOTALL
 )
         return [], text
     tool_calls = []
     remaining_parts = []
     last_end = 0
             args_final = json.dumps(args_json)
         except json.JSONDecodeError:
             # Try to fix common issues
             args_cleaned = args_str.strip('`').strip()
             if args_cleaned.startswith('json'):
                 args_cleaned = args_cleaned[4:].strip()
 def _has_incomplete_tool_call(text: str) -> bool:  # True when more tool-call blocks were opened than closed
+    """Check if text has an opening <tool_call_> tag without a matching close."""
     opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))  # opening tags that carry a name="..." attribute
     closes = len(re.findall(r'</tool_call_>', text))  # literal </tool_call_> closing tags
     return opens > closes  # totals are compared over the whole text; tags are not paired positionally
+def _detect_tool_calls_in_text(text: str) -> bool:  # True when text holds at least one match of _TOOL_CALL_RE
+    """Quick check if text likely contains tool call patterns."""
+    return bool(_TOOL_CALL_RE.search(text))  # the pattern requires both the opening and the closing tag, so an unterminated block does not count
+# ── Message normalization ────────────────────────────────────────
 def _flatten_content_array(content: list) -> str:
     """Convert a content array to plain text."""
     return "\n".join(text_parts)
+def normalize_messages(messages: list[dict]) -> list[dict]:  # builds a new list of {"role", "content"} dicts with string content
+    """Normalize messages: handle content arrays, tool roles, tool_calls."""
+    result = []
+    for msg in messages:
+        role = msg.get("role", "user")  # a message without a role is treated as "user"
+        content = msg.get("content", "")
+        # Content supplied as a list is flattened to plain text by _flatten_content_array
+        if isinstance(content, list):
+            content = _flatten_content_array(content)
+        if content is None:  # explicit null content becomes an empty string
+            content = ""
+        content = str(content)  # any remaining non-string content is stringified
+        # "tool" role messages are re-labelled as user messages that carry the tool result text
+        if role == "tool":
+            tool_name = msg.get("name", "unknown_tool")
+            tool_call_id = msg.get("tool_call_id", "")
+            result.append({
+                "role": "user",
+                "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
+            })
+            continue
+        # Assistant messages with tool_calls are rendered as text: regular content first, then one tool-call block per call
+        if role == "assistant" and msg.get("tool_calls"):
+            parts = []
+            regular_content = content if content and content.strip() else ""  # whitespace-only content is dropped
+            if regular_content:
+                parts.append(regular_content)
+            for tc in msg["tool_calls"]:
+                func = tc.get("function", {})
+                name = func.get("name", "unknown")
+                args = func.get("arguments", "{}")
+                try:
+                    json.loads(args)  # validation only; the parsed value is discarded and the original string is reused
+                except (json.JSONDecodeError, TypeError):
+                    args = "{}"  # NOTE(review): non-string arguments (e.g. a dict) raise TypeError and are also replaced by "{}" — confirm intended
+                parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')  # blocks are joined with blank lines below
+            result.append({"role": "assistant", "content": "\n\n".join(parts)})
+            continue
+        # System messages whose content is empty or whitespace-only are skipped entirely
+        if role == "system" and not content.strip():
+            continue
+        result.append({"role": role, "content": content})  # every other message passes through with its role unchanged
+    return result
 # ── Headers ──────────────────────────────────────────────────────
     raise HTTPException(500, "Failed after retry")
+def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls: list[dict], remaining_text: str):  # builds the full SSE sequence up front; nothing is yielded lazily
+    """Generate OpenAI streaming chunks for tool calls. Returns list of SSE strings."""
+    chunks = []
+    for i, tc in enumerate(tool_calls):  # i becomes the tool call's "index" in every delta emitted for it
+        # Opening chunk for this call: role, id, type and function name, with an empty arguments string
+        sse_start = json.dumps({
+            "id": chunk_id,
+            "object": "chat.completion.chunk",
+            "created": created,
+            "model": model,
+            "choices": [{
+                "index": 0,
+                "delta": {
+                    "role": "assistant",  # repeated on the opening chunk of every tool call, not only the first
+                    "tool_calls": [{
+                        "index": i,
+                        "id": tc["id"],
+                        "type": "function",
+                        "function": {
+                            "name": tc["function"]["name"],
+                            "arguments": "",
+                        }
+                    }]
+                },
+                "finish_reason": None,
+            }],
+        })
+        chunks.append(f"data: {sse_start}\n\n")
+        # Argument chunks: the arguments string is cut into roughly three slices, one delta per slice
+        args = tc["function"]["arguments"]  # presumably a JSON string (it is sliced below) — verify against _parse_tool_calls
+        chunk_size = max(1, len(args) // 3)  # max(1, ...) keeps the range step non-zero for very short strings
+        for offset in range(0, len(args), chunk_size):  # an empty arguments string produces no argument chunks
+            arg_piece = args[offset:offset + chunk_size]
+            sse_arg = json.dumps({
+                "id": chunk_id,
+                "object": "chat.completion.chunk",
+                "created": created,
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "delta": {
+                        "tool_calls": [{
+                            "index": i,
+                            "function": {
+                                "arguments": arg_piece,
+                            }
+                        }]
+                    },
+                    "finish_reason": None,
+                }],
+            })
+            chunks.append(f"data: {sse_arg}\n\n")
+    # Non-blank text outside the tool-call blocks is sent as one content delta, after all tool calls
+    if remaining_text.strip():
+        sse_text = json.dumps({
+            "id": chunk_id,
+            "object": "chat.completion.chunk",
+            "created": created,
+            "model": model,
+            "choices": [{
+                "index": 0,
+                "delta": {"content": remaining_text},  # the unstripped text is sent; strip() is used only for the emptiness test
+                "finish_reason": None,
+            }],
+        })
+        chunks.append(f"data: {sse_text}\n\n")
+    # Closing chunk: empty delta with finish_reason "tool_calls"
+    sse_done = json.dumps({
+        "id": chunk_id,
+        "object": "chat.completion.chunk",
+        "created": created,
+        "model": model,
+        "choices": [{
+            "index": 0,
+            "delta": {},
+            "finish_reason": "tool_calls",
+        }],
+    })
+    chunks.append(f"data: {sse_done}\n\n")
+    chunks.append("data: [DONE]\n\n")  # the stream terminator is included here, so the caller must not append another
+    return chunks
+async def _stream_with_auto_continue(messages: list[dict], model: str):
     """Stream with real-time output, auto-continue, and keep-alive pings.
+    ALWAYS buffers the full response to detect <tool_call_> tags.
+    If tool calls are found, emits them as proper OpenAI tool_calls chunks.
+    If no tool calls, emits the text as regular content chunks.
     """
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
     created = int(time.time())
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
+        # Send keep-alive while we buffer
         yield ": thinking...\n\n"
         resp = None
         finish_reason = "stop"
         chunk_content = ""
+        # Buffer the full response (don't stream in real-time so we can detect tool calls)
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
                 finish_reason = fr
                 chunk_content += text
                 total_content += text
+                # Send keep-alive pings while buffering
+                yield ": streaming...\n\n"
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
+        # ALWAYS check for tool calls in the accumulated text
+        tool_calls, remaining_text = _parse_tool_calls(total_content)
+        if tool_calls:
+            print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
+            # Emit tool calls as proper OpenAI streaming chunks
+            for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
+                yield sse_chunk
+            return
+        # No tool calls found
+        if finish_reason == "stop":
+            # Stream the buffered text content as regular content chunks
+            chunk_sz = 50
+            for offset in range(0, len(total_content), chunk_sz):
+                piece = total_content[offset:offset + chunk_sz]
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
                     "model": model,
                     "choices": [{
                         "index": 0,
+                        "delta": {"content": piece},
+                        "finish_reason": None,
                     }],
                 })
                 yield f"data: {sse_data}\n\n"
+            # Final stop chunk
+            sse_data = json.dumps({
+                "id": chunk_id,
+                "object": "chat.completion.chunk",
+                "created": created,
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "delta": {},
+                    "finish_reason": "stop",
+                }],
+            })
+            yield f"data: {sse_data}\n\n"
+            yield "data: [DONE]\n\n"
+            return
         # Auto-continue for length-limited responses
         yield ": continuing...\n\n"
         if _has_incomplete_tool_call(chunk_content):
             conversation.append({"role": "assistant", "content": chunk_content})
             conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
 # ── Non-streaming with auto-continue ────────────────────────────
+async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
     """Collect the full response, auto-continuing if cut off.
+    Always checks for tool calls. Returns dict with 'content' and/or 'tool_calls'."""
     conversation = list(messages)
     full_content = ""
         full_content += content
         print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
+        # Always check for tool calls
+        tool_calls, remaining_text = _parse_tool_calls(full_content)
+        if tool_calls:
+            # If there are incomplete tool calls and we got cut off, continue
+            if _has_incomplete_tool_call(full_content) and finish_reason == "length":
+                pass  # fall through to auto-continue
+            else:
+                return {
                     "tool_calls": tool_calls,
                     "content": remaining_text if remaining_text.strip() else None,
                 }
         if finish_reason == "stop":
             return {"content": full_content, "tool_calls": None}
         # Auto-continue
     model = body.get("model", "anthropic/claude-haiku-4-5")
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
+    # Log request for debugging
+    tools_present = "tools" in body
+    functions_present = "functions" in body
+    print(f"[Request] model={model} stream={stream} tools={tools_present} functions={functions_present} msgs={len(messages_raw)}")
     if not messages_raw or not isinstance(messages_raw, list):
         raise HTTPException(400, "messages must be a non-empty array")
+    messages = normalize_messages(messages_raw)
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
     if stream:
         return StreamingResponse(
+            _stream_with_auto_continue(messages, model),
             media_type="text/event-stream",
             headers={
                 "Cache-Control": "no-cache",
             },
         )
     else:
+        result = await _collect_with_auto_continue(messages, model)
         tool_calls = result.get("tool_calls")
         content = result.get("content")
 async def root():
     return {
         "status": "ok",
+        "version": "6.0.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],