Spaces:

overwrite69
/

haiku-api

Sleeping

App Files Files Community

overwrite69 committed 5 days ago

Commit

aa3153c

verified ·

1 Parent(s): 33bf669

v8.1.0: fix tool call cutoff - never emit incomplete tool calls, auto-continue properly

Browse files

Files changed (1) hide show

app.py +100 -33

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
-app = FastAPI(title="Haiku API", version="8.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
@@ -318,6 +318,29 @@ def _has_incomplete_tool_call(text: str) -> bool:
     return False
 # ── Tool System Prompt Builder ──────────────────────────────────
 def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
@@ -759,7 +782,8 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
     """Stream with real-time output, auto-continue, and keep-alive pings.
     ALWAYS buffers the full response to detect tool call tags.
-    If tool calls are found, emits them as proper OpenAI tool_calls chunks.
     If no tool calls, emits the text as regular content chunks.
     """
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
@@ -778,7 +802,6 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
                 else:
                     resp = result
         except HTTPException as e:
-            # Send error as SSE then stop
             error_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
@@ -817,17 +840,40 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
-        # ALWAYS check for tool calls
         tool_calls, remaining_text = _parse_tool_calls(total_content)
-        if tool_calls:
-            print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
             for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
                 yield sse_chunk
             return
-        # No tool calls
         if finish_reason == "stop":
             chunk_sz = 50
             for offset in range(0, len(total_content), chunk_sz):
                 piece = total_content[offset:offset + chunk_sz]
@@ -859,32 +905,53 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
             yield "data: [DONE]\n\n"
             return
-        # Auto-continue
         yield ": continuing...\n\n"
-        if _has_incomplete_tool_call(chunk_content):
-            conversation.append({"role": "assistant", "content": chunk_content})
-            conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
-        else:
-            conversation.append({"role": "assistant", "content": chunk_content})
-            conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
-        print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
-    # Safety: max continuations reached
-    sse_data = json.dumps({
-        "id": chunk_id,
-        "object": "chat.completion.chunk",
-        "created": created,
-        "model": model,
-        "choices": [{
-            "index": 0,
-            "delta": {},
-            "finish_reason": "stop",
-        }],
-    })
-    yield f"data: {sse_data}\n\n"
-    yield "data: [DONE]\n\n"
 # ── Non-streaming with auto-continue ────────────────────────────
@@ -1042,7 +1109,7 @@ async def list_models():
 async def root():
     return {
         "status": "ok",
-        "version": "8.0.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],

 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
+app = FastAPI(title="Haiku API", version="8.1.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
     return False
+def _strip_incomplete_tool_tags(text: str) -> str:
+    """Remove incomplete tool call XML tags from text.
+    This prevents raw XML tags from leaking into delta.content
+    when auto-continue fails to complete a tool call."""
+    # Remove incomplete Anthropic XML blocks
+    # e.g. "<function_calls>\n<invoke name="Write">\n<parameter name="content">some unfinished..."
+    text = re.sub(
+        r'<function_calls>\s*<invoke[^>]*>.*',
+        '', text, flags=re.DOTALL
+    )
+    # Remove incomplete inline JSON tool calls
+    text = re.sub(
+        r'<(?:function_call|tool_call)\s+name="[^"]+">.*',
+        '', text, flags=re.DOTALL
+    )
+    # Remove any stray opening/closing tags
+    text = re.sub(r'</?function_calls>\s*', '', text)
+    text = re.sub(r'</?invoke[^>]*>\s*', '', text)
+    text = re.sub(r'</?parameter[^>]*>\s*', '', text)
+    text = re.sub(r'</?(?:function_call|tool_call)_?>\s*', '', text)
+    return text.strip()
 # ── Tool System Prompt Builder ──────────────────────────────────
 def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
     """Stream with real-time output, auto-continue, and keep-alive pings.
     ALWAYS buffers the full response to detect tool call tags.
+    If tool calls are found AND complete, emits them as proper OpenAI tool_calls chunks.
+    If tool calls are incomplete, auto-continues to collect the rest.
     If no tool calls, emits the text as regular content chunks.
     """
     chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
                 else:
                     resp = result
         except HTTPException as e:
             error_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
+        # Check for tool calls in the accumulated text
         tool_calls, remaining_text = _parse_tool_calls(total_content)
+        has_incomplete = _has_incomplete_tool_call(total_content)
+        print(f"[Chat] tool_calls={len(tool_calls)} incomplete={has_incomplete} finish={finish_reason}")
+        # ── Decision tree ──────────────────────────────────────────
+        #
+        # 1. If we have COMPLETE tool calls AND no incomplete tags → emit & done
+        # 2. If we have incomplete tool calls (regardless of complete ones) → auto-continue
+        # 3. If no tool calls and finish_reason == "stop" and no incomplete tags → emit text & done
+        # 4. If no tool calls and finish_reason == "stop" but HAS incomplete tags → auto-continue
+        #    (the upstream might report "stop" even when cut off mid-tag)
+        # 5. If finish_reason == "length" → auto-continue
+        if tool_calls and not has_incomplete:
+            # All tool calls are complete — emit them
+            print(f"[Chat] Emitting {len(tool_calls)} complete tool call(s)")
             for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
                 yield sse_chunk
             return
+        if has_incomplete:
+            # Incomplete tool calls detected — must auto-continue
+            print(f"[Chat] Incomplete tool call detected, auto-continuing...")
+            yield ": continuing...\n\n"
+            conversation.append({"role": "assistant", "content": chunk_content})
+            conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote. Just continue outputting the parameter values from where you stopped."})
+            print(f"[Chat] Auto-continue (incomplete) #{cont_num+1}, total so far: {len(total_content)} chars")
+            continue
+        # No tool calls and no incomplete tags
         if finish_reason == "stop":
+            # Regular text response — emit as content
             chunk_sz = 50
             for offset in range(0, len(total_content), chunk_sz):
                 piece = total_content[offset:offset + chunk_sz]
             yield "data: [DONE]\n\n"
             return
+        # finish_reason == "length" — auto-continue for regular text
         yield ": continuing...\n\n"
+        conversation.append({"role": "assistant", "content": chunk_content})
+        conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
+        print(f"[Chat] Auto-continue (length) #{cont_num+1}, total so far: {len(total_content)} chars")
+    # Safety: max continuations reached — try to emit whatever we have
+    tool_calls, remaining_text = _parse_tool_calls(total_content)
+    if tool_calls:
+        # Best-effort: emit whatever tool calls we managed to parse
+        print(f"[Chat] Max continuations reached, emitting {len(tool_calls)} partial tool call(s)")
+        for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
+            yield sse_chunk
+    else:
+        # Emit whatever text we have
+        # Strip any incomplete tool call XML from the output to avoid raw tags in content
+        clean_content = _strip_incomplete_tool_tags(total_content)
+        if clean_content.strip():
+            chunk_sz = 50
+            for offset in range(0, len(clean_content), chunk_sz):
+                piece = clean_content[offset:offset + chunk_sz]
+                sse_data = json.dumps({
+                    "id": chunk_id,
+                    "object": "chat.completion.chunk",
+                    "created": created,
+                    "model": model,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {"content": piece},
+                        "finish_reason": None,
+                    }],
+                })
+                yield f"data: {sse_data}\n\n"
+        sse_data = json.dumps({
+            "id": chunk_id,
+            "object": "chat.completion.chunk",
+            "created": created,
+            "model": model,
+            "choices": [{
+                "index": 0,
+                "delta": {},
+                "finish_reason": "stop",
+            }],
+        })
+        yield f"data: {sse_data}\n\n"
+        yield "data: [DONE]\n\n"
 # ── Non-streaming with auto-continue ────────────────────────────
 async def root():
     return {
         "status": "ok",
+        "version": "8.1.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],