Spaces:
Sleeping
Sleeping
v8.1.0: fix tool call cutoff - never emit incomplete tool calls, auto-continue properly
Browse files
app.py
CHANGED
|
@@ -26,7 +26,7 @@ from fastapi import FastAPI, HTTPException, Request
|
|
| 26 |
from fastapi.middleware.cors import CORSMiddleware
|
| 27 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 28 |
|
| 29 |
-
app = FastAPI(title="Haiku API", version="8.
|
| 30 |
|
| 31 |
# ── CORS ────────────────────────────────────────────────────────
|
| 32 |
app.add_middleware(
|
|
@@ -318,6 +318,29 @@ def _has_incomplete_tool_call(text: str) -> bool:
|
|
| 318 |
return False
|
| 319 |
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
# ── Tool System Prompt Builder ──────────────────────────────────
|
| 322 |
|
| 323 |
def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
|
|
@@ -759,7 +782,8 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 759 |
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
| 760 |
|
| 761 |
ALWAYS buffers the full response to detect tool call tags.
|
| 762 |
-
If tool calls are found, emits them as proper OpenAI tool_calls chunks.
|
|
|
|
| 763 |
If no tool calls, emits the text as regular content chunks.
|
| 764 |
"""
|
| 765 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
|
@@ -778,7 +802,6 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 778 |
else:
|
| 779 |
resp = result
|
| 780 |
except HTTPException as e:
|
| 781 |
-
# Send error as SSE then stop
|
| 782 |
error_data = json.dumps({
|
| 783 |
"id": chunk_id,
|
| 784 |
"object": "chat.completion.chunk",
|
|
@@ -817,17 +840,40 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 817 |
|
| 818 |
print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
|
| 819 |
|
| 820 |
-
#
|
| 821 |
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 825 |
for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
|
| 826 |
yield sse_chunk
|
| 827 |
return
|
| 828 |
|
| 829 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 830 |
if finish_reason == "stop":
|
|
|
|
| 831 |
chunk_sz = 50
|
| 832 |
for offset in range(0, len(total_content), chunk_sz):
|
| 833 |
piece = total_content[offset:offset + chunk_sz]
|
|
@@ -859,32 +905,53 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 859 |
yield "data: [DONE]\n\n"
|
| 860 |
return
|
| 861 |
|
| 862 |
-
#
|
| 863 |
yield ": continuing...\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 864 |
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
"
|
| 877 |
-
"
|
| 878 |
-
"created": created,
|
| 879 |
-
"model": model,
|
| 880 |
-
"choices": [{
|
| 881 |
-
"index": 0,
|
| 882 |
-
"delta": {},
|
| 883 |
-
"finish_reason": "stop",
|
| 884 |
-
}],
|
| 885 |
-
})
|
| 886 |
-
yield f"data: {sse_data}\n\n"
|
| 887 |
-
yield "data: [DONE]\n\n"
|
| 888 |
|
| 889 |
|
| 890 |
# ── Non-streaming with auto-continue ────────────────────────────
|
|
@@ -1042,7 +1109,7 @@ async def list_models():
|
|
| 1042 |
async def root():
|
| 1043 |
return {
|
| 1044 |
"status": "ok",
|
| 1045 |
-
"version": "8.
|
| 1046 |
"proxy": bool(PROXY_URL),
|
| 1047 |
"tool_calling": True,
|
| 1048 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|
|
|
|
| 26 |
from fastapi.middleware.cors import CORSMiddleware
|
| 27 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 28 |
|
| 29 |
+
app = FastAPI(title="Haiku API", version="8.1.0")
|
| 30 |
|
| 31 |
# ── CORS ────────────────────────────────────────────────────────
|
| 32 |
app.add_middleware(
|
|
|
|
| 318 |
return False
|
| 319 |
|
| 320 |
|
| 321 |
+
def _strip_incomplete_tool_tags(text: str) -> str:
|
| 322 |
+
"""Remove incomplete tool call XML tags from text.
|
| 323 |
+
This prevents raw XML tags from leaking into delta.content
|
| 324 |
+
when auto-continue fails to complete a tool call."""
|
| 325 |
+
# Remove incomplete Anthropic XML blocks
|
| 326 |
+
# e.g. "<function_calls>\n<invoke name="Write">\n<parameter name="content">some unfinished..."
|
| 327 |
+
text = re.sub(
|
| 328 |
+
r'<function_calls>\s*<invoke[^>]*>.*',
|
| 329 |
+
'', text, flags=re.DOTALL
|
| 330 |
+
)
|
| 331 |
+
# Remove incomplete inline JSON tool calls
|
| 332 |
+
text = re.sub(
|
| 333 |
+
r'<(?:function_call|tool_call)\s+name="[^"]+">.*',
|
| 334 |
+
'', text, flags=re.DOTALL
|
| 335 |
+
)
|
| 336 |
+
# Remove any stray opening/closing tags
|
| 337 |
+
text = re.sub(r'</?function_calls>\s*', '', text)
|
| 338 |
+
text = re.sub(r'</?invoke[^>]*>\s*', '', text)
|
| 339 |
+
text = re.sub(r'</?parameter[^>]*>\s*', '', text)
|
| 340 |
+
text = re.sub(r'</?(?:function_call|tool_call)_?>\s*', '', text)
|
| 341 |
+
return text.strip()
|
| 342 |
+
|
| 343 |
+
|
| 344 |
# ── Tool System Prompt Builder ──────────────────────────────────
|
| 345 |
|
| 346 |
def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
|
|
|
|
| 782 |
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
| 783 |
|
| 784 |
ALWAYS buffers the full response to detect tool call tags.
|
| 785 |
+
If tool calls are found AND complete, emits them as proper OpenAI tool_calls chunks.
|
| 786 |
+
If tool calls are incomplete, auto-continues to collect the rest.
|
| 787 |
If no tool calls, emits the text as regular content chunks.
|
| 788 |
"""
|
| 789 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
|
|
|
| 802 |
else:
|
| 803 |
resp = result
|
| 804 |
except HTTPException as e:
|
|
|
|
| 805 |
error_data = json.dumps({
|
| 806 |
"id": chunk_id,
|
| 807 |
"object": "chat.completion.chunk",
|
|
|
|
| 840 |
|
| 841 |
print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
|
| 842 |
|
| 843 |
+
# Check for tool calls in the accumulated text
|
| 844 |
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
| 845 |
+
has_incomplete = _has_incomplete_tool_call(total_content)
|
| 846 |
+
|
| 847 |
+
print(f"[Chat] tool_calls={len(tool_calls)} incomplete={has_incomplete} finish={finish_reason}")
|
| 848 |
+
|
| 849 |
+
# ── Decision tree ──────────────────────────────────────────
|
| 850 |
+
#
|
| 851 |
+
# 1. If we have COMPLETE tool calls AND no incomplete tags → emit & done
|
| 852 |
+
# 2. If we have incomplete tool calls (regardless of complete ones) → auto-continue
|
| 853 |
+
# 3. If no tool calls and finish_reason == "stop" and no incomplete tags → emit text & done
|
| 854 |
+
# 4. If no tool calls and finish_reason == "stop" but HAS incomplete tags → auto-continue
|
| 855 |
+
# (the upstream might report "stop" even when cut off mid-tag)
|
| 856 |
+
# 5. If finish_reason == "length" β auto-continue
|
| 857 |
+
|
| 858 |
+
if tool_calls and not has_incomplete:
|
| 859 |
+
# All tool calls are complete — emit them
|
| 860 |
+
print(f"[Chat] Emitting {len(tool_calls)} complete tool call(s)")
|
| 861 |
for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
|
| 862 |
yield sse_chunk
|
| 863 |
return
|
| 864 |
|
| 865 |
+
if has_incomplete:
|
| 866 |
+
# Incomplete tool calls detected — must auto-continue
|
| 867 |
+
print(f"[Chat] Incomplete tool call detected, auto-continuing...")
|
| 868 |
+
yield ": continuing...\n\n"
|
| 869 |
+
conversation.append({"role": "assistant", "content": chunk_content})
|
| 870 |
+
conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote. Just continue outputting the parameter values from where you stopped."})
|
| 871 |
+
print(f"[Chat] Auto-continue (incomplete) #{cont_num+1}, total so far: {len(total_content)} chars")
|
| 872 |
+
continue
|
| 873 |
+
|
| 874 |
+
# No tool calls and no incomplete tags
|
| 875 |
if finish_reason == "stop":
|
| 876 |
+
# Regular text response — emit as content
|
| 877 |
chunk_sz = 50
|
| 878 |
for offset in range(0, len(total_content), chunk_sz):
|
| 879 |
piece = total_content[offset:offset + chunk_sz]
|
|
|
|
| 905 |
yield "data: [DONE]\n\n"
|
| 906 |
return
|
| 907 |
|
| 908 |
+
# finish_reason == "length" — auto-continue for regular text
|
| 909 |
yield ": continuing...\n\n"
|
| 910 |
+
conversation.append({"role": "assistant", "content": chunk_content})
|
| 911 |
+
conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
|
| 912 |
+
print(f"[Chat] Auto-continue (length) #{cont_num+1}, total so far: {len(total_content)} chars")
|
| 913 |
+
|
| 914 |
+
# Safety: max continuations reached — try to emit whatever we have
|
| 915 |
+
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
| 916 |
+
if tool_calls:
|
| 917 |
+
# Best-effort: emit whatever tool calls we managed to parse
|
| 918 |
+
print(f"[Chat] Max continuations reached, emitting {len(tool_calls)} partial tool call(s)")
|
| 919 |
+
for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
|
| 920 |
+
yield sse_chunk
|
| 921 |
+
else:
|
| 922 |
+
# Emit whatever text we have
|
| 923 |
+
# Strip any incomplete tool call XML from the output to avoid raw tags in content
|
| 924 |
+
clean_content = _strip_incomplete_tool_tags(total_content)
|
| 925 |
+
if clean_content.strip():
|
| 926 |
+
chunk_sz = 50
|
| 927 |
+
for offset in range(0, len(clean_content), chunk_sz):
|
| 928 |
+
piece = clean_content[offset:offset + chunk_sz]
|
| 929 |
+
sse_data = json.dumps({
|
| 930 |
+
"id": chunk_id,
|
| 931 |
+
"object": "chat.completion.chunk",
|
| 932 |
+
"created": created,
|
| 933 |
+
"model": model,
|
| 934 |
+
"choices": [{
|
| 935 |
+
"index": 0,
|
| 936 |
+
"delta": {"content": piece},
|
| 937 |
+
"finish_reason": None,
|
| 938 |
+
}],
|
| 939 |
+
})
|
| 940 |
+
yield f"data: {sse_data}\n\n"
|
| 941 |
|
| 942 |
+
sse_data = json.dumps({
|
| 943 |
+
"id": chunk_id,
|
| 944 |
+
"object": "chat.completion.chunk",
|
| 945 |
+
"created": created,
|
| 946 |
+
"model": model,
|
| 947 |
+
"choices": [{
|
| 948 |
+
"index": 0,
|
| 949 |
+
"delta": {},
|
| 950 |
+
"finish_reason": "stop",
|
| 951 |
+
}],
|
| 952 |
+
})
|
| 953 |
+
yield f"data: {sse_data}\n\n"
|
| 954 |
+
yield "data: [DONE]\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
|
| 956 |
|
| 957 |
# ── Non-streaming with auto-continue ────────────────────────────
|
|
|
|
| 1109 |
async def root():
|
| 1110 |
return {
|
| 1111 |
"status": "ok",
|
| 1112 |
+
"version": "8.1.0",
|
| 1113 |
"proxy": bool(PROXY_URL),
|
| 1114 |
"tool_calling": True,
|
| 1115 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|