overwrite69 committed on
Commit
aa3153c
·
verified ·
1 Parent(s): 33bf669

v8.1.0: fix tool call cutoff - never emit incomplete tool calls, auto-continue properly

Browse files
Files changed (1) hide show
  1. app.py +100 -33
app.py CHANGED
@@ -26,7 +26,7 @@ from fastapi import FastAPI, HTTPException, Request
26
  from fastapi.middleware.cors import CORSMiddleware
27
  from fastapi.responses import StreamingResponse, JSONResponse
28
 
29
- app = FastAPI(title="Haiku API", version="8.0.0")
30
 
31
  # ── CORS ─────────────────────────────────────────────────────────
32
  app.add_middleware(
@@ -318,6 +318,29 @@ def _has_incomplete_tool_call(text: str) -> bool:
318
  return False
319
 
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  # ── Tool System Prompt Builder ──────────────────────────────────
322
 
323
  def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
@@ -759,7 +782,8 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
759
  """Stream with real-time output, auto-continue, and keep-alive pings.
760
 
761
  ALWAYS buffers the full response to detect tool call tags.
762
- If tool calls are found, emits them as proper OpenAI tool_calls chunks.
 
763
  If no tool calls, emits the text as regular content chunks.
764
  """
765
  chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
@@ -778,7 +802,6 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
778
  else:
779
  resp = result
780
  except HTTPException as e:
781
- # Send error as SSE then stop
782
  error_data = json.dumps({
783
  "id": chunk_id,
784
  "object": "chat.completion.chunk",
@@ -817,17 +840,40 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
817
 
818
  print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
819
 
820
- # ALWAYS check for tool calls
821
  tool_calls, remaining_text = _parse_tool_calls(total_content)
822
-
823
- if tool_calls:
824
- print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
 
 
 
 
 
 
 
 
 
 
 
 
 
825
  for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
826
  yield sse_chunk
827
  return
828
 
829
- # No tool calls
 
 
 
 
 
 
 
 
 
830
  if finish_reason == "stop":
 
831
  chunk_sz = 50
832
  for offset in range(0, len(total_content), chunk_sz):
833
  piece = total_content[offset:offset + chunk_sz]
@@ -859,32 +905,53 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
859
  yield "data: [DONE]\n\n"
860
  return
861
 
862
- # Auto-continue
863
  yield ": continuing...\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
864
 
865
- if _has_incomplete_tool_call(chunk_content):
866
- conversation.append({"role": "assistant", "content": chunk_content})
867
- conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
868
- else:
869
- conversation.append({"role": "assistant", "content": chunk_content})
870
- conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
871
-
872
- print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
873
-
874
- # Safety: max continuations reached
875
- sse_data = json.dumps({
876
- "id": chunk_id,
877
- "object": "chat.completion.chunk",
878
- "created": created,
879
- "model": model,
880
- "choices": [{
881
- "index": 0,
882
- "delta": {},
883
- "finish_reason": "stop",
884
- }],
885
- })
886
- yield f"data: {sse_data}\n\n"
887
- yield "data: [DONE]\n\n"
888
 
889
 
890
  # ── Non-streaming with auto-continue ────────────────────────────
@@ -1042,7 +1109,7 @@ async def list_models():
1042
  async def root():
1043
  return {
1044
  "status": "ok",
1045
- "version": "8.0.0",
1046
  "proxy": bool(PROXY_URL),
1047
  "tool_calling": True,
1048
  "endpoints": ["/v1/chat/completions", "/v1/models"],
 
26
  from fastapi.middleware.cors import CORSMiddleware
27
  from fastapi.responses import StreamingResponse, JSONResponse
28
 
29
+ app = FastAPI(title="Haiku API", version="8.1.0")
30
 
31
  # ── CORS ─────────────────────────────────────────────────────────
32
  app.add_middleware(
 
318
  return False
319
 
320
 
321
+ def _strip_incomplete_tool_tags(text: str) -> str:
322
+ """Remove incomplete tool call XML tags from text.
323
+ This prevents raw XML tags from leaking into delta.content
324
+ when auto-continue fails to complete a tool call."""
325
+ # Remove incomplete Anthropic XML blocks
326
+ # e.g. "<function_calls>\n<invoke name="Write">\n<parameter name="content">some unfinished..."
327
+ text = re.sub(
328
+ r'<function_calls>\s*<invoke[^>]*>.*',
329
+ '', text, flags=re.DOTALL
330
+ )
331
+ # Remove incomplete inline JSON tool calls
332
+ text = re.sub(
333
+ r'<(?:function_call|tool_call)\s+name="[^"]+">.*',
334
+ '', text, flags=re.DOTALL
335
+ )
336
+ # Remove any stray opening/closing tags
337
+ text = re.sub(r'</?function_calls>\s*', '', text)
338
+ text = re.sub(r'</?invoke[^>]*>\s*', '', text)
339
+ text = re.sub(r'</?parameter[^>]*>\s*', '', text)
340
+ text = re.sub(r'</?(?:function_call|tool_call)_?>\s*', '', text)
341
+ return text.strip()
342
+
343
+
344
  # ── Tool System Prompt Builder ──────────────────────────────────
345
 
346
  def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
 
782
  """Stream with real-time output, auto-continue, and keep-alive pings.
783
 
784
  ALWAYS buffers the full response to detect tool call tags.
785
+ If tool calls are found AND complete, emits them as proper OpenAI tool_calls chunks.
786
+ If tool calls are incomplete, auto-continues to collect the rest.
787
  If no tool calls, emits the text as regular content chunks.
788
  """
789
  chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
 
802
  else:
803
  resp = result
804
  except HTTPException as e:
 
805
  error_data = json.dumps({
806
  "id": chunk_id,
807
  "object": "chat.completion.chunk",
 
840
 
841
  print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
842
 
843
+ # Check for tool calls in the accumulated text
844
  tool_calls, remaining_text = _parse_tool_calls(total_content)
845
+ has_incomplete = _has_incomplete_tool_call(total_content)
846
+
847
+ print(f"[Chat] tool_calls={len(tool_calls)} incomplete={has_incomplete} finish={finish_reason}")
848
+
849
+ # ── Decision tree ──────────────────────────────────────────
850
+ #
851
+ # 1. If we have COMPLETE tool calls AND no incomplete tags → emit & done
852
+ # 2. If we have incomplete tool calls (regardless of complete ones) → auto-continue
853
+ # 3. If no tool calls and finish_reason == "stop" and no incomplete tags → emit text & done
854
+ # 4. If no tool calls and finish_reason == "stop" but HAS incomplete tags → auto-continue
855
+ # (the upstream might report "stop" even when cut off mid-tag)
856
+ # 5. If finish_reason == "length" β†’ auto-continue
857
+
858
+ if tool_calls and not has_incomplete:
859
+ # All tool calls are complete — emit them
860
+ print(f"[Chat] Emitting {len(tool_calls)} complete tool call(s)")
861
  for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
862
  yield sse_chunk
863
  return
864
 
865
+ if has_incomplete:
866
+ # Incomplete tool calls detected — must auto-continue
867
+ print(f"[Chat] Incomplete tool call detected, auto-continuing...")
868
+ yield ": continuing...\n\n"
869
+ conversation.append({"role": "assistant", "content": chunk_content})
870
+ conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote. Just continue outputting the parameter values from where you stopped."})
871
+ print(f"[Chat] Auto-continue (incomplete) #{cont_num+1}, total so far: {len(total_content)} chars")
872
+ continue
873
+
874
+ # No tool calls and no incomplete tags
875
  if finish_reason == "stop":
876
+ # Regular text response — emit as content
877
  chunk_sz = 50
878
  for offset in range(0, len(total_content), chunk_sz):
879
  piece = total_content[offset:offset + chunk_sz]
 
905
  yield "data: [DONE]\n\n"
906
  return
907
 
908
+ # finish_reason == "length" — auto-continue for regular text
909
  yield ": continuing...\n\n"
910
+ conversation.append({"role": "assistant", "content": chunk_content})
911
+ conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
912
+ print(f"[Chat] Auto-continue (length) #{cont_num+1}, total so far: {len(total_content)} chars")
913
+
914
+ # Safety: max continuations reached — try to emit whatever we have
915
+ tool_calls, remaining_text = _parse_tool_calls(total_content)
916
+ if tool_calls:
917
+ # Best-effort: emit whatever tool calls we managed to parse
918
+ print(f"[Chat] Max continuations reached, emitting {len(tool_calls)} partial tool call(s)")
919
+ for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
920
+ yield sse_chunk
921
+ else:
922
+ # Emit whatever text we have
923
+ # Strip any incomplete tool call XML from the output to avoid raw tags in content
924
+ clean_content = _strip_incomplete_tool_tags(total_content)
925
+ if clean_content.strip():
926
+ chunk_sz = 50
927
+ for offset in range(0, len(clean_content), chunk_sz):
928
+ piece = clean_content[offset:offset + chunk_sz]
929
+ sse_data = json.dumps({
930
+ "id": chunk_id,
931
+ "object": "chat.completion.chunk",
932
+ "created": created,
933
+ "model": model,
934
+ "choices": [{
935
+ "index": 0,
936
+ "delta": {"content": piece},
937
+ "finish_reason": None,
938
+ }],
939
+ })
940
+ yield f"data: {sse_data}\n\n"
941
 
942
+ sse_data = json.dumps({
943
+ "id": chunk_id,
944
+ "object": "chat.completion.chunk",
945
+ "created": created,
946
+ "model": model,
947
+ "choices": [{
948
+ "index": 0,
949
+ "delta": {},
950
+ "finish_reason": "stop",
951
+ }],
952
+ })
953
+ yield f"data: {sse_data}\n\n"
954
+ yield "data: [DONE]\n\n"
 
 
 
 
 
 
 
 
 
 
955
 
956
 
957
  # ── Non-streaming with auto-continue ────────────────────────────
 
1109
  async def root():
1110
  return {
1111
  "status": "ok",
1112
+ "version": "8.1.0",
1113
  "proxy": bool(PROXY_URL),
1114
  "tool_calling": True,
1115
  "endpoints": ["/v1/chat/completions", "/v1/models"],