Spaces:
Sleeping
Sleeping
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -3,7 +3,7 @@ Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
|
|
| 3 |
Deploy to Hugging Face Spaces (Docker SDK)
|
| 4 |
|
| 5 |
Features:
|
| 6 |
-
- Tool/function calling support (
|
| 7 |
- Auto-continues when upstream hits the ~1K token output limit
|
| 8 |
- Rotating proxy with aggressive retries for unstable IPs
|
| 9 |
- SSE keep-alive comments during continuation gaps
|
|
@@ -24,7 +24,7 @@ from fastapi import FastAPI, HTTPException, Request
|
|
| 24 |
from fastapi.middleware.cors import CORSMiddleware
|
| 25 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 26 |
|
| 27 |
-
app = FastAPI(title="Haiku API", version="
|
| 28 |
|
| 29 |
# ── CORS ─────────────────────────────────────────────────────────
|
| 30 |
app.add_middleware(
|
|
@@ -152,81 +152,16 @@ async def shutdown():
|
|
| 152 |
|
| 153 |
|
| 154 |
# ── Tool Calling Support ─────────────────────────────────────────
|
| 155 |
-
|
| 156 |
-
def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
|
| 157 |
-
"""Convert OpenAI tools format to a system prompt that instructs Claude
|
| 158 |
-
to output tool calls in a parseable format."""
|
| 159 |
-
|
| 160 |
-
tools_desc = []
|
| 161 |
-
for tool in tools:
|
| 162 |
-
func = tool.get("function", {})
|
| 163 |
-
name = func.get("name", "unknown")
|
| 164 |
-
desc = func.get("description", "No description")
|
| 165 |
-
params = func.get("parameters", {})
|
| 166 |
-
|
| 167 |
-
# Format parameters nicely
|
| 168 |
-
props = params.get("properties", {})
|
| 169 |
-
required = params.get("required", [])
|
| 170 |
-
param_lines = []
|
| 171 |
-
for pname, pdef in props.items():
|
| 172 |
-
ptype = pdef.get("type", "any")
|
| 173 |
-
pdesc = pdef.get("description", "")
|
| 174 |
-
req_flag = " (required)" if pname in required else " (optional)"
|
| 175 |
-
param_lines.append(f" - {pname}: {ptype}{req_flag} — {pdesc}")
|
| 176 |
-
|
| 177 |
-
params_text = "\n".join(param_lines) if param_lines else " (no parameters)"
|
| 178 |
-
tools_desc.append(f"### {name}\n{desc}\nParameters:\n{params_text}")
|
| 179 |
-
|
| 180 |
-
tools_text = "\n\n".join(tools_desc)
|
| 181 |
-
|
| 182 |
-
# Handle tool_choice
|
| 183 |
-
choice_instruction = ""
|
| 184 |
-
if tool_choice == "required":
|
| 185 |
-
choice_instruction = "\nIMPORTANT: You MUST call at least one tool. Do not respond with just text."
|
| 186 |
-
elif tool_choice == "none":
|
| 187 |
-
# Shouldn't reach here since we skip tool injection for "none"
|
| 188 |
-
choice_instruction = "\nDo NOT call any tools. Respond with text only."
|
| 189 |
-
elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
|
| 190 |
-
fname = tool_choice.get("function", {}).get("name", "")
|
| 191 |
-
choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
|
| 192 |
-
|
| 193 |
-
return f"""# Available Tools
|
| 194 |
-
|
| 195 |
-
You have access to the following tools that you can call:
|
| 196 |
-
|
| 197 |
-
{tools_text}
|
| 198 |
-
|
| 199 |
-
## Tool Call Format
|
| 200 |
-
|
| 201 |
-
When you want to call a tool, you MUST use EXACTLY this XML format — one block per tool call:
|
| 202 |
-
|
| 203 |
-
<tool_call name="FUNCTION_NAME">
|
| 204 |
-
{"{"}"param1": "value1", "param2": "value2"{"}"}
|
| 205 |
-
</tool_call_>
|
| 206 |
-
|
| 207 |
-
Example — calling the Write tool:
|
| 208 |
-
<tool_call name="Write">
|
| 209 |
-
{"{"}"file_path": "hello.txt", "content": "hello world"{"}"}
|
| 210 |
-
</tool_call_>
|
| 211 |
-
|
| 212 |
-
## Rules
|
| 213 |
-
- You may call multiple tools by using multiple <tool_call_> blocks in sequence
|
| 214 |
-
- The arguments inside the block MUST be valid JSON matching the tool's parameter schema
|
| 215 |
-
- If you need to call a tool, output ONLY <tool_call_> blocks — no explanatory text before or after
|
| 216 |
-
- If you don't need to call any tools, just respond normally with text (no <tool_call_> blocks)
|
| 217 |
-
- Do NOT wrap <tool_call_> blocks in markdown code blocks or any other formatting
|
| 218 |
-
{choice_instruction}"""
|
| 219 |
-
|
| 220 |
-
|
| 221 |
# Regex to parse <tool_call name="...">...</tool_call_> blocks
|
|
|
|
| 222 |
_TOOL_CALL_RE = re.compile(
|
| 223 |
r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
|
| 224 |
re.DOTALL
|
| 225 |
)
|
| 226 |
|
| 227 |
-
# Also
|
| 228 |
_INCOMPLETE_TOOL_CALL_RE = re.compile(
|
| 229 |
-
r'<tool_call\s+name="
|
| 230 |
re.DOTALL
|
| 231 |
)
|
| 232 |
|
|
@@ -243,7 +178,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
|
|
| 243 |
return [], text
|
| 244 |
|
| 245 |
tool_calls = []
|
| 246 |
-
# Collect text outside of tool call blocks
|
| 247 |
remaining_parts = []
|
| 248 |
|
| 249 |
last_end = 0
|
|
@@ -264,7 +198,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
|
|
| 264 |
args_final = json.dumps(args_json)
|
| 265 |
except json.JSONDecodeError:
|
| 266 |
# Try to fix common issues
|
| 267 |
-
# Sometimes Claude wraps args in markdown code block
|
| 268 |
args_cleaned = args_str.strip('`').strip()
|
| 269 |
if args_cleaned.startswith('json'):
|
| 270 |
args_cleaned = args_cleaned[4:].strip()
|
|
@@ -295,54 +228,18 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
|
|
| 295 |
|
| 296 |
|
| 297 |
def _has_incomplete_tool_call(text: str) -> bool:
|
| 298 |
-
"""Check if text has an opening <tool_call_>
|
| 299 |
opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
|
| 300 |
closes = len(re.findall(r'</tool_call_>', text))
|
| 301 |
return opens > closes
|
| 302 |
|
| 303 |
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
"""Normalize messages: handle content arrays, tool roles, tool_calls,
|
| 308 |
-
and inject tool definitions into system prompt if tools are provided."""
|
| 309 |
-
result = []
|
| 310 |
-
|
| 311 |
-
# If tools provided and tool_choice != "none", inject tool system prompt
|
| 312 |
-
inject_tools = tools and tool_choice != "none"
|
| 313 |
-
|
| 314 |
-
if inject_tools:
|
| 315 |
-
tool_system = _build_tool_system_prompt(tools, tool_choice)
|
| 316 |
-
else:
|
| 317 |
-
tool_system = None
|
| 318 |
-
|
| 319 |
-
system_injected = False
|
| 320 |
-
|
| 321 |
-
for msg in messages:
|
| 322 |
-
role = msg.get("role", "user")
|
| 323 |
-
|
| 324 |
-
# Inject tool system prompt before or as the first system message
|
| 325 |
-
if role == "system" and not system_injected and tool_system:
|
| 326 |
-
content = msg.get("content", "")
|
| 327 |
-
if isinstance(content, list):
|
| 328 |
-
content = _flatten_content_array(content)
|
| 329 |
-
content = str(content) if content else ""
|
| 330 |
-
combined = content + "\n\n" + tool_system if content.strip() else tool_system
|
| 331 |
-
result.append({"role": "system", "content": combined})
|
| 332 |
-
system_injected = True
|
| 333 |
-
continue
|
| 334 |
-
|
| 335 |
-
result.append(_normalize_one_message(msg))
|
| 336 |
-
|
| 337 |
-
# If no system message existed, add tool system prompt as first message
|
| 338 |
-
if tool_system and not system_injected:
|
| 339 |
-
result.insert(0, {"role": "system", "content": tool_system})
|
| 340 |
|
| 341 |
-
# Filter out empty system messages
|
| 342 |
-
result = [m for m in result if not (m.get("role") == "system" and not m.get("content", "").strip())]
|
| 343 |
-
|
| 344 |
-
return result
|
| 345 |
|
|
|
|
| 346 |
|
| 347 |
def _flatten_content_array(content: list) -> str:
|
| 348 |
"""Convert a content array to plain text."""
|
|
@@ -356,54 +253,60 @@ def _flatten_content_array(content: list) -> str:
|
|
| 356 |
return "\n".join(text_parts)
|
| 357 |
|
| 358 |
|
| 359 |
-
def
|
| 360 |
-
"""Normalize
|
| 361 |
-
|
| 362 |
-
content = msg.get("content", "")
|
| 363 |
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
content =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
# Handle tool role messages → convert to user message with tool result
|
| 373 |
-
if role == "tool":
|
| 374 |
-
tool_name = msg.get("name", "unknown_tool")
|
| 375 |
-
tool_call_id = msg.get("tool_call_id", "")
|
| 376 |
-
return {
|
| 377 |
-
"role": "user",
|
| 378 |
-
"content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
|
| 379 |
-
}
|
| 380 |
-
|
| 381 |
-
# Handle assistant messages with tool_calls → text with <tool_call_> blocks
|
| 382 |
-
if role == "assistant" and msg.get("tool_calls"):
|
| 383 |
-
parts = []
|
| 384 |
-
regular_content = content if content and content.strip() else ""
|
| 385 |
-
|
| 386 |
-
if regular_content:
|
| 387 |
-
parts.append(regular_content)
|
| 388 |
-
|
| 389 |
-
for tc in msg["tool_calls"]:
|
| 390 |
-
func = tc.get("function", {})
|
| 391 |
-
name = func.get("name", "unknown")
|
| 392 |
-
args = func.get("arguments", "{}")
|
| 393 |
-
# Validate args is valid JSON
|
| 394 |
-
try:
|
| 395 |
-
json.loads(args)
|
| 396 |
-
except (json.JSONDecodeError, TypeError):
|
| 397 |
-
args = "{}"
|
| 398 |
-
parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
|
| 399 |
|
| 400 |
-
|
|
|
|
| 401 |
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
|
| 408 |
|
| 409 |
# ── Headers ──────────────────────────────────────────────────────
|
|
@@ -578,12 +481,100 @@ async def _raw_call_streaming(messages: list[dict], model: str):
|
|
| 578 |
raise HTTPException(500, "Failed after retry")
|
| 579 |
|
| 580 |
|
| 581 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
"""
|
| 588 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
| 589 |
created = int(time.time())
|
|
@@ -591,6 +582,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
|
|
| 591 |
total_content = ""
|
| 592 |
|
| 593 |
for cont_num in range(MAX_CONTINUATIONS):
|
|
|
|
| 594 |
yield ": thinking...\n\n"
|
| 595 |
|
| 596 |
resp = None
|
|
@@ -606,6 +598,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
|
|
| 606 |
finish_reason = "stop"
|
| 607 |
chunk_content = ""
|
| 608 |
|
|
|
|
| 609 |
async for text, fr in _stream_one_response(resp):
|
| 610 |
if fr is not None:
|
| 611 |
finish_reason = fr
|
|
@@ -615,131 +608,27 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
|
|
| 615 |
chunk_content += text
|
| 616 |
total_content += text
|
| 617 |
|
| 618 |
-
#
|
| 619 |
-
|
| 620 |
-
sse_data = json.dumps({
|
| 621 |
-
"id": chunk_id,
|
| 622 |
-
"object": "chat.completion.chunk",
|
| 623 |
-
"created": created,
|
| 624 |
-
"model": model,
|
| 625 |
-
"choices": [{
|
| 626 |
-
"index": 0,
|
| 627 |
-
"delta": {"content": text},
|
| 628 |
-
"finish_reason": None,
|
| 629 |
-
}],
|
| 630 |
-
})
|
| 631 |
-
yield f"data: {sse_data}\n\n"
|
| 632 |
|
| 633 |
print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
|
| 634 |
|
| 635 |
-
#
|
| 636 |
-
|
| 637 |
-
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
| 638 |
-
|
| 639 |
-
if tool_calls:
|
| 640 |
-
# Emit tool calls as OpenAI streaming chunks
|
| 641 |
-
for i, tc in enumerate(tool_calls):
|
| 642 |
-
# First chunk: role + tool_call with id, name, and start of arguments
|
| 643 |
-
sse_start = json.dumps({
|
| 644 |
-
"id": chunk_id,
|
| 645 |
-
"object": "chat.completion.chunk",
|
| 646 |
-
"created": created,
|
| 647 |
-
"model": model,
|
| 648 |
-
"choices": [{
|
| 649 |
-
"index": 0,
|
| 650 |
-
"delta": {
|
| 651 |
-
"role": "assistant",
|
| 652 |
-
"tool_calls": [{
|
| 653 |
-
"index": i,
|
| 654 |
-
"id": tc["id"],
|
| 655 |
-
"type": "function",
|
| 656 |
-
"function": {
|
| 657 |
-
"name": tc["function"]["name"],
|
| 658 |
-
"arguments": "",
|
| 659 |
-
}
|
| 660 |
-
}]
|
| 661 |
-
},
|
| 662 |
-
"finish_reason": None,
|
| 663 |
-
}],
|
| 664 |
-
})
|
| 665 |
-
yield f"data: {sse_start}\n\n"
|
| 666 |
-
|
| 667 |
-
# Argument chunks — split into small pieces for streaming feel
|
| 668 |
-
args = tc["function"]["arguments"]
|
| 669 |
-
chunk_size = max(1, len(args) // 3)
|
| 670 |
-
for offset in range(0, len(args), chunk_size):
|
| 671 |
-
arg_piece = args[offset:offset + chunk_size]
|
| 672 |
-
sse_arg = json.dumps({
|
| 673 |
-
"id": chunk_id,
|
| 674 |
-
"object": "chat.completion.chunk",
|
| 675 |
-
"created": created,
|
| 676 |
-
"model": model,
|
| 677 |
-
"choices": [{
|
| 678 |
-
"index": 0,
|
| 679 |
-
"delta": {
|
| 680 |
-
"tool_calls": [{
|
| 681 |
-
"index": i,
|
| 682 |
-
"function": {
|
| 683 |
-
"arguments": arg_piece,
|
| 684 |
-
}
|
| 685 |
-
}]
|
| 686 |
-
},
|
| 687 |
-
"finish_reason": None,
|
| 688 |
-
}],
|
| 689 |
-
})
|
| 690 |
-
yield f"data: {sse_arg}\n\n"
|
| 691 |
-
|
| 692 |
-
# If there's remaining text alongside tool calls, emit it too
|
| 693 |
-
if remaining_text.strip():
|
| 694 |
-
sse_text = json.dumps({
|
| 695 |
-
"id": chunk_id,
|
| 696 |
-
"object": "chat.completion.chunk",
|
| 697 |
-
"created": created,
|
| 698 |
-
"model": model,
|
| 699 |
-
"choices": [{
|
| 700 |
-
"index": 0,
|
| 701 |
-
"delta": {"content": remaining_text},
|
| 702 |
-
"finish_reason": None,
|
| 703 |
-
}],
|
| 704 |
-
})
|
| 705 |
-
yield f"data: {sse_text}\n\n"
|
| 706 |
-
|
| 707 |
-
# Final chunk with finish_reason
|
| 708 |
-
sse_done = json.dumps({
|
| 709 |
-
"id": chunk_id,
|
| 710 |
-
"object": "chat.completion.chunk",
|
| 711 |
-
"created": created,
|
| 712 |
-
"model": model,
|
| 713 |
-
"choices": [{
|
| 714 |
-
"index": 0,
|
| 715 |
-
"delta": {},
|
| 716 |
-
"finish_reason": "tool_calls",
|
| 717 |
-
}],
|
| 718 |
-
})
|
| 719 |
-
yield f"data: {sse_done}\n\n"
|
| 720 |
-
yield "data: [DONE]\n\n"
|
| 721 |
-
return
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
piece = text_to_stream[offset:offset + chunk_sz]
|
| 730 |
-
sse_data = json.dumps({
|
| 731 |
-
"id": chunk_id,
|
| 732 |
-
"object": "chat.completion.chunk",
|
| 733 |
-
"created": created,
|
| 734 |
-
"model": model,
|
| 735 |
-
"choices": [{
|
| 736 |
-
"index": 0,
|
| 737 |
-
"delta": {"content": piece},
|
| 738 |
-
"finish_reason": None,
|
| 739 |
-
}],
|
| 740 |
-
})
|
| 741 |
-
yield f"data: {sse_data}\n\n"
|
| 742 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
sse_data = json.dumps({
|
| 744 |
"id": chunk_id,
|
| 745 |
"object": "chat.completion.chunk",
|
|
@@ -747,36 +636,31 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
|
|
| 747 |
"model": model,
|
| 748 |
"choices": [{
|
| 749 |
"index": 0,
|
| 750 |
-
"delta": {},
|
| 751 |
-
"finish_reason":
|
| 752 |
}],
|
| 753 |
})
|
| 754 |
yield f"data: {sse_data}\n\n"
|
| 755 |
-
yield "data: [DONE]\n\n"
|
| 756 |
-
return
|
| 757 |
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
"
|
| 766 |
-
"
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
yield "data: [DONE]\n\n"
|
| 774 |
-
return
|
| 775 |
|
| 776 |
# Auto-continue for length-limited responses
|
| 777 |
yield ": continuing...\n\n"
|
| 778 |
|
| 779 |
-
# Check if we're in the middle of a tool call
|
| 780 |
if _has_incomplete_tool_call(chunk_content):
|
| 781 |
conversation.append({"role": "assistant", "content": chunk_content})
|
| 782 |
conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
|
|
@@ -804,9 +688,9 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
|
|
| 804 |
|
| 805 |
# ── Non-streaming with auto-continue ────────────────────────────
|
| 806 |
|
| 807 |
-
async def _collect_with_auto_continue(messages: list[dict], model: str
|
| 808 |
"""Collect the full response, auto-continuing if cut off.
|
| 809 |
-
|
| 810 |
conversation = list(messages)
|
| 811 |
full_content = ""
|
| 812 |
|
|
@@ -825,25 +709,20 @@ async def _collect_with_auto_continue(messages: list[dict], model: str, has_tool
|
|
| 825 |
full_content += content
|
| 826 |
print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
|
| 827 |
|
| 828 |
-
#
|
| 829 |
-
|
| 830 |
-
tool_calls, remaining_text = _parse_tool_calls(full_content)
|
| 831 |
|
| 832 |
-
|
| 833 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
"tool_calls": tool_calls,
|
| 835 |
"content": remaining_text if remaining_text.strip() else None,
|
| 836 |
}
|
| 837 |
-
# If there are incomplete tool calls, continue
|
| 838 |
-
if _has_incomplete_tool_call(full_content) and finish_reason == "length":
|
| 839 |
-
pass # fall through to auto-continue
|
| 840 |
-
else:
|
| 841 |
-
return result
|
| 842 |
|
| 843 |
if finish_reason == "stop":
|
| 844 |
-
if has_tools:
|
| 845 |
-
# No tool calls found, return as text
|
| 846 |
-
return {"content": full_content, "tool_calls": None}
|
| 847 |
return {"content": full_content, "tool_calls": None}
|
| 848 |
|
| 849 |
# Auto-continue
|
|
@@ -873,22 +752,23 @@ async def chat_completions(request: Request):
|
|
| 873 |
model = body.get("model", "anthropic/claude-haiku-4-5")
|
| 874 |
messages_raw = body.get("messages", [])
|
| 875 |
stream = body.get("stream", False)
|
| 876 |
-
|
| 877 |
-
|
|
|
|
|
|
|
|
|
|
| 878 |
|
| 879 |
if not messages_raw or not isinstance(messages_raw, list):
|
| 880 |
raise HTTPException(400, "messages must be a non-empty array")
|
| 881 |
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice)
|
| 885 |
|
| 886 |
if not messages:
|
| 887 |
raise HTTPException(400, "No valid messages after normalization")
|
| 888 |
|
| 889 |
if stream:
|
| 890 |
return StreamingResponse(
|
| 891 |
-
_stream_with_auto_continue(messages, model
|
| 892 |
media_type="text/event-stream",
|
| 893 |
headers={
|
| 894 |
"Cache-Control": "no-cache",
|
|
@@ -897,7 +777,7 @@ async def chat_completions(request: Request):
|
|
| 897 |
},
|
| 898 |
)
|
| 899 |
else:
|
| 900 |
-
result = await _collect_with_auto_continue(messages, model
|
| 901 |
|
| 902 |
tool_calls = result.get("tool_calls")
|
| 903 |
content = result.get("content")
|
|
@@ -951,7 +831,7 @@ async def list_models():
|
|
| 951 |
async def root():
|
| 952 |
return {
|
| 953 |
"status": "ok",
|
| 954 |
-
"version": "
|
| 955 |
"proxy": bool(PROXY_URL),
|
| 956 |
"tool_calling": True,
|
| 957 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|
|
|
|
| 3 |
Deploy to Hugging Face Spaces (Docker SDK)
|
| 4 |
|
| 5 |
Features:
|
| 6 |
+
- Tool/function calling support (always detects <tool_call_> tags in output)
|
| 7 |
- Auto-continues when upstream hits the ~1K token output limit
|
| 8 |
- Rotating proxy with aggressive retries for unstable IPs
|
| 9 |
- SSE keep-alive comments during continuation gaps
|
|
|
|
| 24 |
from fastapi.middleware.cors import CORSMiddleware
|
| 25 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 26 |
|
| 27 |
+
app = FastAPI(title="Haiku API", version="6.0.0")
|
| 28 |
|
| 29 |
# ── CORS ─────────────────────────────────────────────────────────
|
| 30 |
app.add_middleware(
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
# ── Tool Calling Support ─────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
# Regex to parse <tool_call name="...">...</tool_call_> blocks
|
| 156 |
+
# Supports: <tool_call name="X">JSON</tool_call_> and variations
|
| 157 |
_TOOL_CALL_RE = re.compile(
|
| 158 |
r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
|
| 159 |
re.DOTALL
|
| 160 |
)
|
| 161 |
|
| 162 |
+
# Also match incomplete tool calls (for auto-continue detection)
|
| 163 |
_INCOMPLETE_TOOL_CALL_RE = re.compile(
|
| 164 |
+
r'<tool_call\s+name="[^"]+">\s*(.*?)$',
|
| 165 |
re.DOTALL
|
| 166 |
)
|
| 167 |
|
|
|
|
| 178 |
return [], text
|
| 179 |
|
| 180 |
tool_calls = []
|
|
|
|
| 181 |
remaining_parts = []
|
| 182 |
|
| 183 |
last_end = 0
|
|
|
|
| 198 |
args_final = json.dumps(args_json)
|
| 199 |
except json.JSONDecodeError:
|
| 200 |
# Try to fix common issues
|
|
|
|
| 201 |
args_cleaned = args_str.strip('`').strip()
|
| 202 |
if args_cleaned.startswith('json'):
|
| 203 |
args_cleaned = args_cleaned[4:].strip()
|
|
|
|
| 228 |
|
| 229 |
|
| 230 |
def _has_incomplete_tool_call(text: str) -> bool:
|
| 231 |
+
"""Check if text has an opening <tool_call_> tag without a matching close."""
|
| 232 |
opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
|
| 233 |
closes = len(re.findall(r'</tool_call_>', text))
|
| 234 |
return opens > closes
|
| 235 |
|
| 236 |
|
| 237 |
+
def _detect_tool_calls_in_text(text: str) -> bool:
|
| 238 |
+
"""Quick check if text likely contains tool call patterns."""
|
| 239 |
+
return bool(_TOOL_CALL_RE.search(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
# ── Message normalization ────────────────────────────────────────
|
| 243 |
|
| 244 |
def _flatten_content_array(content: list) -> str:
|
| 245 |
"""Convert a content array to plain text."""
|
|
|
|
| 253 |
return "\n".join(text_parts)
|
| 254 |
|
| 255 |
|
| 256 |
+
def normalize_messages(messages: list[dict]) -> list[dict]:
|
| 257 |
+
"""Normalize messages: handle content arrays, tool roles, tool_calls."""
|
| 258 |
+
result = []
|
|
|
|
| 259 |
|
| 260 |
+
for msg in messages:
|
| 261 |
+
role = msg.get("role", "user")
|
| 262 |
+
content = msg.get("content", "")
|
| 263 |
+
|
| 264 |
+
# Handle content arrays → plain text
|
| 265 |
+
if isinstance(content, list):
|
| 266 |
+
content = _flatten_content_array(content)
|
| 267 |
+
|
| 268 |
+
if content is None:
|
| 269 |
+
content = ""
|
| 270 |
+
content = str(content)
|
| 271 |
+
|
| 272 |
+
# Handle tool role messages → convert to user message with tool result
|
| 273 |
+
if role == "tool":
|
| 274 |
+
tool_name = msg.get("name", "unknown_tool")
|
| 275 |
+
tool_call_id = msg.get("tool_call_id", "")
|
| 276 |
+
result.append({
|
| 277 |
+
"role": "user",
|
| 278 |
+
"content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
|
| 279 |
+
})
|
| 280 |
+
continue
|
| 281 |
|
| 282 |
+
# Handle assistant messages with tool_calls → text with <tool_call_> blocks
|
| 283 |
+
if role == "assistant" and msg.get("tool_calls"):
|
| 284 |
+
parts = []
|
| 285 |
+
regular_content = content if content and content.strip() else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
+
if regular_content:
|
| 288 |
+
parts.append(regular_content)
|
| 289 |
|
| 290 |
+
for tc in msg["tool_calls"]:
|
| 291 |
+
func = tc.get("function", {})
|
| 292 |
+
name = func.get("name", "unknown")
|
| 293 |
+
args = func.get("arguments", "{}")
|
| 294 |
+
try:
|
| 295 |
+
json.loads(args)
|
| 296 |
+
except (json.JSONDecodeError, TypeError):
|
| 297 |
+
args = "{}"
|
| 298 |
+
parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
|
| 299 |
|
| 300 |
+
result.append({"role": "assistant", "content": "\n\n".join(parts)})
|
| 301 |
+
continue
|
| 302 |
+
|
| 303 |
+
# System messages with empty content get filtered out
|
| 304 |
+
if role == "system" and not content.strip():
|
| 305 |
+
continue
|
| 306 |
+
|
| 307 |
+
result.append({"role": role, "content": content})
|
| 308 |
+
|
| 309 |
+
return result
|
| 310 |
|
| 311 |
|
| 312 |
# ── Headers ──────────────────────────────────────────────────────
|
|
|
|
| 481 |
raise HTTPException(500, "Failed after retry")
|
| 482 |
|
| 483 |
|
| 484 |
+
def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls: list[dict], remaining_text: str):
|
| 485 |
+
"""Generate OpenAI streaming chunks for tool calls. Returns list of SSE strings."""
|
| 486 |
+
chunks = []
|
| 487 |
+
|
| 488 |
+
for i, tc in enumerate(tool_calls):
|
| 489 |
+
# First chunk: role + tool_call with id, name, and start of arguments
|
| 490 |
+
sse_start = json.dumps({
|
| 491 |
+
"id": chunk_id,
|
| 492 |
+
"object": "chat.completion.chunk",
|
| 493 |
+
"created": created,
|
| 494 |
+
"model": model,
|
| 495 |
+
"choices": [{
|
| 496 |
+
"index": 0,
|
| 497 |
+
"delta": {
|
| 498 |
+
"role": "assistant",
|
| 499 |
+
"tool_calls": [{
|
| 500 |
+
"index": i,
|
| 501 |
+
"id": tc["id"],
|
| 502 |
+
"type": "function",
|
| 503 |
+
"function": {
|
| 504 |
+
"name": tc["function"]["name"],
|
| 505 |
+
"arguments": "",
|
| 506 |
+
}
|
| 507 |
+
}]
|
| 508 |
+
},
|
| 509 |
+
"finish_reason": None,
|
| 510 |
+
}],
|
| 511 |
+
})
|
| 512 |
+
chunks.append(f"data: {sse_start}\n\n")
|
| 513 |
+
|
| 514 |
+
# Argument chunks — split into small pieces for streaming feel
|
| 515 |
+
args = tc["function"]["arguments"]
|
| 516 |
+
chunk_size = max(1, len(args) // 3)
|
| 517 |
+
for offset in range(0, len(args), chunk_size):
|
| 518 |
+
arg_piece = args[offset:offset + chunk_size]
|
| 519 |
+
sse_arg = json.dumps({
|
| 520 |
+
"id": chunk_id,
|
| 521 |
+
"object": "chat.completion.chunk",
|
| 522 |
+
"created": created,
|
| 523 |
+
"model": model,
|
| 524 |
+
"choices": [{
|
| 525 |
+
"index": 0,
|
| 526 |
+
"delta": {
|
| 527 |
+
"tool_calls": [{
|
| 528 |
+
"index": i,
|
| 529 |
+
"function": {
|
| 530 |
+
"arguments": arg_piece,
|
| 531 |
+
}
|
| 532 |
+
}]
|
| 533 |
+
},
|
| 534 |
+
"finish_reason": None,
|
| 535 |
+
}],
|
| 536 |
+
})
|
| 537 |
+
chunks.append(f"data: {sse_arg}\n\n")
|
| 538 |
+
|
| 539 |
+
# If there's remaining text alongside tool calls, emit it too
|
| 540 |
+
if remaining_text.strip():
|
| 541 |
+
sse_text = json.dumps({
|
| 542 |
+
"id": chunk_id,
|
| 543 |
+
"object": "chat.completion.chunk",
|
| 544 |
+
"created": created,
|
| 545 |
+
"model": model,
|
| 546 |
+
"choices": [{
|
| 547 |
+
"index": 0,
|
| 548 |
+
"delta": {"content": remaining_text},
|
| 549 |
+
"finish_reason": None,
|
| 550 |
+
}],
|
| 551 |
+
})
|
| 552 |
+
chunks.append(f"data: {sse_text}\n\n")
|
| 553 |
+
|
| 554 |
+
# Final chunk with finish_reason
|
| 555 |
+
sse_done = json.dumps({
|
| 556 |
+
"id": chunk_id,
|
| 557 |
+
"object": "chat.completion.chunk",
|
| 558 |
+
"created": created,
|
| 559 |
+
"model": model,
|
| 560 |
+
"choices": [{
|
| 561 |
+
"index": 0,
|
| 562 |
+
"delta": {},
|
| 563 |
+
"finish_reason": "tool_calls",
|
| 564 |
+
}],
|
| 565 |
+
})
|
| 566 |
+
chunks.append(f"data: {sse_done}\n\n")
|
| 567 |
+
chunks.append("data: [DONE]\n\n")
|
| 568 |
+
|
| 569 |
+
return chunks
|
| 570 |
+
|
| 571 |
+
|
| 572 |
+
async def _stream_with_auto_continue(messages: list[dict], model: str):
|
| 573 |
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
| 574 |
|
| 575 |
+
ALWAYS buffers the full response to detect <tool_call_> tags.
|
| 576 |
+
If tool calls are found, emits them as proper OpenAI tool_calls chunks.
|
| 577 |
+
If no tool calls, emits the text as regular content chunks.
|
| 578 |
"""
|
| 579 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
| 580 |
created = int(time.time())
|
|
|
|
| 582 |
total_content = ""
|
| 583 |
|
| 584 |
for cont_num in range(MAX_CONTINUATIONS):
|
| 585 |
+
# Send keep-alive while we buffer
|
| 586 |
yield ": thinking...\n\n"
|
| 587 |
|
| 588 |
resp = None
|
|
|
|
| 598 |
finish_reason = "stop"
|
| 599 |
chunk_content = ""
|
| 600 |
|
| 601 |
+
# Buffer the full response (don't stream in real-time so we can detect tool calls)
|
| 602 |
async for text, fr in _stream_one_response(resp):
|
| 603 |
if fr is not None:
|
| 604 |
finish_reason = fr
|
|
|
|
| 608 |
chunk_content += text
|
| 609 |
total_content += text
|
| 610 |
|
| 611 |
+
# Send keep-alive pings while buffering
|
| 612 |
+
yield ": streaming...\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
|
| 614 |
print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
|
| 615 |
|
| 616 |
+
# ALWAYS check for tool calls in the accumulated text
|
| 617 |
+
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
+
if tool_calls:
|
| 620 |
+
print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
|
| 621 |
+
# Emit tool calls as proper OpenAI streaming chunks
|
| 622 |
+
for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
|
| 623 |
+
yield sse_chunk
|
| 624 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
+
# No tool calls found
|
| 627 |
+
if finish_reason == "stop":
|
| 628 |
+
# Stream the buffered text content as regular content chunks
|
| 629 |
+
chunk_sz = 50
|
| 630 |
+
for offset in range(0, len(total_content), chunk_sz):
|
| 631 |
+
piece = total_content[offset:offset + chunk_sz]
|
| 632 |
sse_data = json.dumps({
|
| 633 |
"id": chunk_id,
|
| 634 |
"object": "chat.completion.chunk",
|
|
|
|
| 636 |
"model": model,
|
| 637 |
"choices": [{
|
| 638 |
"index": 0,
|
| 639 |
+
"delta": {"content": piece},
|
| 640 |
+
"finish_reason": None,
|
| 641 |
}],
|
| 642 |
})
|
| 643 |
yield f"data: {sse_data}\n\n"
|
|
|
|
|
|
|
| 644 |
|
| 645 |
+
# Final stop chunk
|
| 646 |
+
sse_data = json.dumps({
|
| 647 |
+
"id": chunk_id,
|
| 648 |
+
"object": "chat.completion.chunk",
|
| 649 |
+
"created": created,
|
| 650 |
+
"model": model,
|
| 651 |
+
"choices": [{
|
| 652 |
+
"index": 0,
|
| 653 |
+
"delta": {},
|
| 654 |
+
"finish_reason": "stop",
|
| 655 |
+
}],
|
| 656 |
+
})
|
| 657 |
+
yield f"data: {sse_data}\n\n"
|
| 658 |
+
yield "data: [DONE]\n\n"
|
| 659 |
+
return
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# Auto-continue for length-limited responses
|
| 662 |
yield ": continuing...\n\n"
|
| 663 |
|
|
|
|
| 664 |
if _has_incomplete_tool_call(chunk_content):
|
| 665 |
conversation.append({"role": "assistant", "content": chunk_content})
|
| 666 |
conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
|
|
|
|
| 688 |
|
| 689 |
# ── Non-streaming with auto-continue ────────────────────────────
|
| 690 |
|
| 691 |
+
async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
|
| 692 |
"""Collect the full response, auto-continuing if cut off.
|
| 693 |
+
Always checks for tool calls. Returns dict with 'content' and/or 'tool_calls'."""
|
| 694 |
conversation = list(messages)
|
| 695 |
full_content = ""
|
| 696 |
|
|
|
|
| 709 |
full_content += content
|
| 710 |
print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
|
| 711 |
|
| 712 |
+
# Always check for tool calls
|
| 713 |
+
tool_calls, remaining_text = _parse_tool_calls(full_content)
|
|
|
|
| 714 |
|
| 715 |
+
if tool_calls:
|
| 716 |
+
# If there are incomplete tool calls and we got cut off, continue
|
| 717 |
+
if _has_incomplete_tool_call(full_content) and finish_reason == "length":
|
| 718 |
+
pass # fall through to auto-continue
|
| 719 |
+
else:
|
| 720 |
+
return {
|
| 721 |
"tool_calls": tool_calls,
|
| 722 |
"content": remaining_text if remaining_text.strip() else None,
|
| 723 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
|
| 725 |
if finish_reason == "stop":
|
|
|
|
|
|
|
|
|
|
| 726 |
return {"content": full_content, "tool_calls": None}
|
| 727 |
|
| 728 |
# Auto-continue
|
|
|
|
| 752 |
model = body.get("model", "anthropic/claude-haiku-4-5")
|
| 753 |
messages_raw = body.get("messages", [])
|
| 754 |
stream = body.get("stream", False)
|
| 755 |
+
|
| 756 |
+
# Log request for debugging
|
| 757 |
+
tools_present = "tools" in body
|
| 758 |
+
functions_present = "functions" in body
|
| 759 |
+
print(f"[Request] model={model} stream={stream} tools={tools_present} functions={functions_present} msgs={len(messages_raw)}")
|
| 760 |
|
| 761 |
if not messages_raw or not isinstance(messages_raw, list):
|
| 762 |
raise HTTPException(400, "messages must be a non-empty array")
|
| 763 |
|
| 764 |
+
messages = normalize_messages(messages_raw)
|
|
|
|
|
|
|
| 765 |
|
| 766 |
if not messages:
|
| 767 |
raise HTTPException(400, "No valid messages after normalization")
|
| 768 |
|
| 769 |
if stream:
|
| 770 |
return StreamingResponse(
|
| 771 |
+
_stream_with_auto_continue(messages, model),
|
| 772 |
media_type="text/event-stream",
|
| 773 |
headers={
|
| 774 |
"Cache-Control": "no-cache",
|
|
|
|
| 777 |
},
|
| 778 |
)
|
| 779 |
else:
|
| 780 |
+
result = await _collect_with_auto_continue(messages, model)
|
| 781 |
|
| 782 |
tool_calls = result.get("tool_calls")
|
| 783 |
content = result.get("content")
|
|
|
|
| 831 |
async def root():
|
| 832 |
return {
|
| 833 |
"status": "ok",
|
| 834 |
+
"version": "6.0.0",
|
| 835 |
"proxy": bool(PROXY_URL),
|
| 836 |
"tool_calling": True,
|
| 837 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|