overwrite69 committed on
Commit
f5092e2
·
verified ·
1 Parent(s): 3d49d68

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +205 -325
app.py CHANGED
@@ -3,7 +3,7 @@ Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
3
  Deploy to Hugging Face Spaces (Docker SDK)
4
 
5
  Features:
6
- - Tool/function calling support (converts OpenAI tools → system prompt, parses output)
7
  - Auto-continues when upstream hits the ~1K token output limit
8
  - Rotating proxy with aggressive retries for unstable IPs
9
  - SSE keep-alive comments during continuation gaps
@@ -24,7 +24,7 @@ from fastapi import FastAPI, HTTPException, Request
24
  from fastapi.middleware.cors import CORSMiddleware
25
  from fastapi.responses import StreamingResponse, JSONResponse
26
 
27
- app = FastAPI(title="Haiku API", version="5.0.0")
28
 
29
  # ── CORS ─────────────────────────────────────────────────────────
30
  app.add_middleware(
@@ -152,81 +152,16 @@ async def shutdown():
152
 
153
 
154
  # ── Tool Calling Support ─────────────────────────────────────────
155
-
156
- def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
157
- """Convert OpenAI tools format to a system prompt that instructs Claude
158
- to output tool calls in a parseable format."""
159
-
160
- tools_desc = []
161
- for tool in tools:
162
- func = tool.get("function", {})
163
- name = func.get("name", "unknown")
164
- desc = func.get("description", "No description")
165
- params = func.get("parameters", {})
166
-
167
- # Format parameters nicely
168
- props = params.get("properties", {})
169
- required = params.get("required", [])
170
- param_lines = []
171
- for pname, pdef in props.items():
172
- ptype = pdef.get("type", "any")
173
- pdesc = pdef.get("description", "")
174
- req_flag = " (required)" if pname in required else " (optional)"
175
- param_lines.append(f" - {pname}: {ptype}{req_flag} β€” {pdesc}")
176
-
177
- params_text = "\n".join(param_lines) if param_lines else " (no parameters)"
178
- tools_desc.append(f"### {name}\n{desc}\nParameters:\n{params_text}")
179
-
180
- tools_text = "\n\n".join(tools_desc)
181
-
182
- # Handle tool_choice
183
- choice_instruction = ""
184
- if tool_choice == "required":
185
- choice_instruction = "\nIMPORTANT: You MUST call at least one tool. Do not respond with just text."
186
- elif tool_choice == "none":
187
- # Shouldn't reach here since we skip tool injection for "none"
188
- choice_instruction = "\nDo NOT call any tools. Respond with text only."
189
- elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
190
- fname = tool_choice.get("function", {}).get("name", "")
191
- choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
192
-
193
- return f"""# Available Tools
194
-
195
- You have access to the following tools that you can call:
196
-
197
- {tools_text}
198
-
199
- ## Tool Call Format
200
-
201
- When you want to call a tool, you MUST use EXACTLY this XML format β€” one block per tool call:
202
-
203
- <tool_call name="FUNCTION_NAME">
204
- {"{"}"param1": "value1", "param2": "value2"{"}"}
205
- </tool_call_>
206
-
207
- Example β€” calling the Write tool:
208
- <tool_call name="Write">
209
- {"{"}"file_path": "hello.txt", "content": "hello world"{"}"}
210
- </tool_call_>
211
-
212
- ## Rules
213
- - You may call multiple tools by using multiple <tool_call_> blocks in sequence
214
- - The arguments inside the block MUST be valid JSON matching the tool's parameter schema
215
- - If you need to call a tool, output ONLY <tool_call_> blocks β€” no explanatory text before or after
216
- - If you don't need to call any tools, just respond normally with text (no <tool_call_> blocks)
217
- - Do NOT wrap <tool_call_> blocks in markdown code blocks or any other formatting
218
- {choice_instruction}"""
219
-
220
-
221
  # Regex to parse <tool_call name="...">...</tool_call_> blocks
 
222
  _TOOL_CALL_RE = re.compile(
223
  r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
224
  re.DOTALL
225
  )
226
 
227
- # Also try matching incomplete tool calls (for auto-continue detection)
228
  _INCOMPLETE_TOOL_CALL_RE = re.compile(
229
- r'<tool_call\s+name="([^"]+)">\s*(.*?)$',
230
  re.DOTALL
231
  )
232
 
@@ -243,7 +178,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
243
  return [], text
244
 
245
  tool_calls = []
246
- # Collect text outside of tool call blocks
247
  remaining_parts = []
248
 
249
  last_end = 0
@@ -264,7 +198,6 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
264
  args_final = json.dumps(args_json)
265
  except json.JSONDecodeError:
266
  # Try to fix common issues
267
- # Sometimes Claude wraps args in markdown code block
268
  args_cleaned = args_str.strip('`').strip()
269
  if args_cleaned.startswith('json'):
270
  args_cleaned = args_cleaned[4:].strip()
@@ -295,54 +228,18 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
295
 
296
 
297
  def _has_incomplete_tool_call(text: str) -> bool:
298
- """Check if text has an opening <tool_call_>> tag without a matching close."""
299
  opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
300
  closes = len(re.findall(r'</tool_call_>', text))
301
  return opens > closes
302
 
303
 
304
- # ── Message normalization ────────────────────────────────────────
305
-
306
- def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choice=None) -> list[dict]:
307
- """Normalize messages: handle content arrays, tool roles, tool_calls,
308
- and inject tool definitions into system prompt if tools are provided."""
309
- result = []
310
-
311
- # If tools provided and tool_choice != "none", inject tool system prompt
312
- inject_tools = tools and tool_choice != "none"
313
-
314
- if inject_tools:
315
- tool_system = _build_tool_system_prompt(tools, tool_choice)
316
- else:
317
- tool_system = None
318
-
319
- system_injected = False
320
-
321
- for msg in messages:
322
- role = msg.get("role", "user")
323
-
324
- # Inject tool system prompt before or as the first system message
325
- if role == "system" and not system_injected and tool_system:
326
- content = msg.get("content", "")
327
- if isinstance(content, list):
328
- content = _flatten_content_array(content)
329
- content = str(content) if content else ""
330
- combined = content + "\n\n" + tool_system if content.strip() else tool_system
331
- result.append({"role": "system", "content": combined})
332
- system_injected = True
333
- continue
334
-
335
- result.append(_normalize_one_message(msg))
336
-
337
- # If no system message existed, add tool system prompt as first message
338
- if tool_system and not system_injected:
339
- result.insert(0, {"role": "system", "content": tool_system})
340
 
341
- # Filter out empty system messages
342
- result = [m for m in result if not (m.get("role") == "system" and not m.get("content", "").strip())]
343
-
344
- return result
345
 
 
346
 
347
  def _flatten_content_array(content: list) -> str:
348
  """Convert a content array to plain text."""
@@ -356,54 +253,60 @@ def _flatten_content_array(content: list) -> str:
356
  return "\n".join(text_parts)
357
 
358
 
359
- def _normalize_one_message(msg: dict) -> dict:
360
- """Normalize a single message for chatgpt.org API."""
361
- role = msg.get("role", "user")
362
- content = msg.get("content", "")
363
 
364
- # Handle content arrays → plain text
365
- if isinstance(content, list):
366
- content = _flatten_content_array(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
- if content is None:
369
- content = ""
370
- content = str(content)
371
-
372
- # Handle tool role messages → convert to user message with tool result
373
- if role == "tool":
374
- tool_name = msg.get("name", "unknown_tool")
375
- tool_call_id = msg.get("tool_call_id", "")
376
- return {
377
- "role": "user",
378
- "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
379
- }
380
-
381
- # Handle assistant messages with tool_calls → text with <tool_call_> blocks
382
- if role == "assistant" and msg.get("tool_calls"):
383
- parts = []
384
- regular_content = content if content and content.strip() else ""
385
-
386
- if regular_content:
387
- parts.append(regular_content)
388
-
389
- for tc in msg["tool_calls"]:
390
- func = tc.get("function", {})
391
- name = func.get("name", "unknown")
392
- args = func.get("arguments", "{}")
393
- # Validate args is valid JSON
394
- try:
395
- json.loads(args)
396
- except (json.JSONDecodeError, TypeError):
397
- args = "{}"
398
- parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
399
 
400
- return {"role": "assistant", "content": "\n\n".join(parts)}
 
401
 
402
- # System messages with empty content get filtered out later
403
- if role == "system" and not content.strip():
404
- return {"role": "system", "content": ""}
 
 
 
 
 
 
405
 
406
- return {"role": role, "content": content}
 
 
 
 
 
 
 
 
 
407
 
408
 
409
  # ── Headers ──────────────────────────────────────────────────────
@@ -578,12 +481,100 @@ async def _raw_call_streaming(messages: list[dict], model: str):
578
  raise HTTPException(500, "Failed after retry")
579
 
580
 
581
- async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
  """Stream with real-time output, auto-continue, and keep-alive pings.
583
 
584
- When has_tools is True, we buffer the full response to properly detect
585
- and format tool calls, sending keep-alive pings while buffering.
586
- When has_tools is False, we stream text in real-time.
587
  """
588
  chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
589
  created = int(time.time())
@@ -591,6 +582,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
591
  total_content = ""
592
 
593
  for cont_num in range(MAX_CONTINUATIONS):
 
594
  yield ": thinking...\n\n"
595
 
596
  resp = None
@@ -606,6 +598,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
606
  finish_reason = "stop"
607
  chunk_content = ""
608
 
 
609
  async for text, fr in _stream_one_response(resp):
610
  if fr is not None:
611
  finish_reason = fr
@@ -615,131 +608,27 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
615
  chunk_content += text
616
  total_content += text
617
 
618
- # If no tools, stream text in real-time
619
- if not has_tools:
620
- sse_data = json.dumps({
621
- "id": chunk_id,
622
- "object": "chat.completion.chunk",
623
- "created": created,
624
- "model": model,
625
- "choices": [{
626
- "index": 0,
627
- "delta": {"content": text},
628
- "finish_reason": None,
629
- }],
630
- })
631
- yield f"data: {sse_data}\n\n"
632
 
633
  print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
634
 
635
- # Check for tool calls
636
- if has_tools:
637
- tool_calls, remaining_text = _parse_tool_calls(total_content)
638
-
639
- if tool_calls:
640
- # Emit tool calls as OpenAI streaming chunks
641
- for i, tc in enumerate(tool_calls):
642
- # First chunk: role + tool_call with id, name, and start of arguments
643
- sse_start = json.dumps({
644
- "id": chunk_id,
645
- "object": "chat.completion.chunk",
646
- "created": created,
647
- "model": model,
648
- "choices": [{
649
- "index": 0,
650
- "delta": {
651
- "role": "assistant",
652
- "tool_calls": [{
653
- "index": i,
654
- "id": tc["id"],
655
- "type": "function",
656
- "function": {
657
- "name": tc["function"]["name"],
658
- "arguments": "",
659
- }
660
- }]
661
- },
662
- "finish_reason": None,
663
- }],
664
- })
665
- yield f"data: {sse_start}\n\n"
666
-
667
- # Argument chunks — split into small pieces for streaming feel
668
- args = tc["function"]["arguments"]
669
- chunk_size = max(1, len(args) // 3)
670
- for offset in range(0, len(args), chunk_size):
671
- arg_piece = args[offset:offset + chunk_size]
672
- sse_arg = json.dumps({
673
- "id": chunk_id,
674
- "object": "chat.completion.chunk",
675
- "created": created,
676
- "model": model,
677
- "choices": [{
678
- "index": 0,
679
- "delta": {
680
- "tool_calls": [{
681
- "index": i,
682
- "function": {
683
- "arguments": arg_piece,
684
- }
685
- }]
686
- },
687
- "finish_reason": None,
688
- }],
689
- })
690
- yield f"data: {sse_arg}\n\n"
691
-
692
- # If there's remaining text alongside tool calls, emit it too
693
- if remaining_text.strip():
694
- sse_text = json.dumps({
695
- "id": chunk_id,
696
- "object": "chat.completion.chunk",
697
- "created": created,
698
- "model": model,
699
- "choices": [{
700
- "index": 0,
701
- "delta": {"content": remaining_text},
702
- "finish_reason": None,
703
- }],
704
- })
705
- yield f"data: {sse_text}\n\n"
706
-
707
- # Final chunk with finish_reason
708
- sse_done = json.dumps({
709
- "id": chunk_id,
710
- "object": "chat.completion.chunk",
711
- "created": created,
712
- "model": model,
713
- "choices": [{
714
- "index": 0,
715
- "delta": {},
716
- "finish_reason": "tool_calls",
717
- }],
718
- })
719
- yield f"data: {sse_done}\n\n"
720
- yield "data: [DONE]\n\n"
721
- return
722
 
723
- # No tool calls found — if text is complete, stream it as content
724
- if finish_reason == "stop":
725
- # Stream the buffered text content as chunks
726
- text_to_stream = total_content
727
- chunk_sz = 50 # characters per streaming chunk
728
- for offset in range(0, len(text_to_stream), chunk_sz):
729
- piece = text_to_stream[offset:offset + chunk_sz]
730
- sse_data = json.dumps({
731
- "id": chunk_id,
732
- "object": "chat.completion.chunk",
733
- "created": created,
734
- "model": model,
735
- "choices": [{
736
- "index": 0,
737
- "delta": {"content": piece},
738
- "finish_reason": None,
739
- }],
740
- })
741
- yield f"data: {sse_data}\n\n"
742
 
 
 
 
 
 
 
743
  sse_data = json.dumps({
744
  "id": chunk_id,
745
  "object": "chat.completion.chunk",
@@ -747,36 +636,31 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
747
  "model": model,
748
  "choices": [{
749
  "index": 0,
750
- "delta": {},
751
- "finish_reason": "stop",
752
  }],
753
  })
754
  yield f"data: {sse_data}\n\n"
755
- yield "data: [DONE]\n\n"
756
- return
757
 
758
- else:
759
- # No tools — original behavior
760
- if finish_reason == "stop":
761
- sse_data = json.dumps({
762
- "id": chunk_id,
763
- "object": "chat.completion.chunk",
764
- "created": created,
765
- "model": model,
766
- "choices": [{
767
- "index": 0,
768
- "delta": {},
769
- "finish_reason": "stop",
770
- }],
771
- })
772
- yield f"data: {sse_data}\n\n"
773
- yield "data: [DONE]\n\n"
774
- return
775
 
776
  # Auto-continue for length-limited responses
777
  yield ": continuing...\n\n"
778
 
779
- # Check if we're in the middle of a tool call
780
  if _has_incomplete_tool_call(chunk_content):
781
  conversation.append({"role": "assistant", "content": chunk_content})
782
  conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
@@ -804,9 +688,9 @@ async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools
804
 
805
  # ── Non-streaming with auto-continue ────────────────────────────
806
 
807
- async def _collect_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False) -> dict:
808
  """Collect the full response, auto-continuing if cut off.
809
- Returns a dict with either 'content' or 'tool_calls' key."""
810
  conversation = list(messages)
811
  full_content = ""
812
 
@@ -825,25 +709,20 @@ async def _collect_with_auto_continue(messages: list[dict], model: str, has_tool
825
  full_content += content
826
  print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
827
 
828
- # Check for tool calls if tools were provided
829
- if has_tools:
830
- tool_calls, remaining_text = _parse_tool_calls(full_content)
831
 
832
- if tool_calls:
833
- result = {
 
 
 
 
834
  "tool_calls": tool_calls,
835
  "content": remaining_text if remaining_text.strip() else None,
836
  }
837
- # If there are incomplete tool calls, continue
838
- if _has_incomplete_tool_call(full_content) and finish_reason == "length":
839
- pass # fall through to auto-continue
840
- else:
841
- return result
842
 
843
  if finish_reason == "stop":
844
- if has_tools:
845
- # No tool calls found, return as text
846
- return {"content": full_content, "tool_calls": None}
847
  return {"content": full_content, "tool_calls": None}
848
 
849
  # Auto-continue
@@ -873,22 +752,23 @@ async def chat_completions(request: Request):
873
  model = body.get("model", "anthropic/claude-haiku-4-5")
874
  messages_raw = body.get("messages", [])
875
  stream = body.get("stream", False)
876
- tools = body.get("tools") or None
877
- tool_choice = body.get("tool_choice", "auto")
 
 
 
878
 
879
  if not messages_raw or not isinstance(messages_raw, list):
880
  raise HTTPException(400, "messages must be a non-empty array")
881
 
882
- has_tools = bool(tools) and tool_choice != "none"
883
-
884
- messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice)
885
 
886
  if not messages:
887
  raise HTTPException(400, "No valid messages after normalization")
888
 
889
  if stream:
890
  return StreamingResponse(
891
- _stream_with_auto_continue(messages, model, has_tools=has_tools),
892
  media_type="text/event-stream",
893
  headers={
894
  "Cache-Control": "no-cache",
@@ -897,7 +777,7 @@ async def chat_completions(request: Request):
897
  },
898
  )
899
  else:
900
- result = await _collect_with_auto_continue(messages, model, has_tools=has_tools)
901
 
902
  tool_calls = result.get("tool_calls")
903
  content = result.get("content")
@@ -951,7 +831,7 @@ async def list_models():
951
  async def root():
952
  return {
953
  "status": "ok",
954
- "version": "5.0.0",
955
  "proxy": bool(PROXY_URL),
956
  "tool_calling": True,
957
  "endpoints": ["/v1/chat/completions", "/v1/models"],
 
3
  Deploy to Hugging Face Spaces (Docker SDK)
4
 
5
  Features:
6
+ - Tool/function calling support (always detects <tool_call_> tags in output)
7
  - Auto-continues when upstream hits the ~1K token output limit
8
  - Rotating proxy with aggressive retries for unstable IPs
9
  - SSE keep-alive comments during continuation gaps
 
24
  from fastapi.middleware.cors import CORSMiddleware
25
  from fastapi.responses import StreamingResponse, JSONResponse
26
 
27
+ app = FastAPI(title="Haiku API", version="6.0.0")
28
 
29
  # ── CORS ─────────────────────────────────────────────────────────
30
  app.add_middleware(
 
152
 
153
 
154
  # ── Tool Calling Support ─────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # Regex to parse <tool_call name="...">...</tool_call_> blocks
156
+ # Supports: <tool_call name="X">JSON</tool_call_> and variations
157
  _TOOL_CALL_RE = re.compile(
158
  r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
159
  re.DOTALL
160
  )
161
 
162
+ # Also match incomplete tool calls (for auto-continue detection)
163
  _INCOMPLETE_TOOL_CALL_RE = re.compile(
164
+ r'<tool_call\s+name="[^"]+">\s*(.*?)$',
165
  re.DOTALL
166
  )
167
 
 
178
  return [], text
179
 
180
  tool_calls = []
 
181
  remaining_parts = []
182
 
183
  last_end = 0
 
198
  args_final = json.dumps(args_json)
199
  except json.JSONDecodeError:
200
  # Try to fix common issues
 
201
  args_cleaned = args_str.strip('`').strip()
202
  if args_cleaned.startswith('json'):
203
  args_cleaned = args_cleaned[4:].strip()
 
228
 
229
 
230
  def _has_incomplete_tool_call(text: str) -> bool:
231
+ """Check if text has an opening <tool_call_> tag without a matching close."""
232
  opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
233
  closes = len(re.findall(r'</tool_call_>', text))
234
  return opens > closes
235
 
236
 
237
+ def _detect_tool_calls_in_text(text: str) -> bool:
238
+ """Quick check if text likely contains tool call patterns."""
239
+ return bool(_TOOL_CALL_RE.search(text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
 
 
 
 
241
 
242
+ # ── Message normalization ────────────────────────────────────────
243
 
244
  def _flatten_content_array(content: list) -> str:
245
  """Convert a content array to plain text."""
 
253
  return "\n".join(text_parts)
254
 
255
 
256
+ def normalize_messages(messages: list[dict]) -> list[dict]:
257
+ """Normalize messages: handle content arrays, tool roles, tool_calls."""
258
+ result = []
 
259
 
260
+ for msg in messages:
261
+ role = msg.get("role", "user")
262
+ content = msg.get("content", "")
263
+
264
+ # Handle content arrays → plain text
265
+ if isinstance(content, list):
266
+ content = _flatten_content_array(content)
267
+
268
+ if content is None:
269
+ content = ""
270
+ content = str(content)
271
+
272
+ # Handle tool role messages → convert to user message with tool result
273
+ if role == "tool":
274
+ tool_name = msg.get("name", "unknown_tool")
275
+ tool_call_id = msg.get("tool_call_id", "")
276
+ result.append({
277
+ "role": "user",
278
+ "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
279
+ })
280
+ continue
281
 
282
+ # Handle assistant messages with tool_calls → text with <tool_call_> blocks
283
+ if role == "assistant" and msg.get("tool_calls"):
284
+ parts = []
285
+ regular_content = content if content and content.strip() else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
+ if regular_content:
288
+ parts.append(regular_content)
289
 
290
+ for tc in msg["tool_calls"]:
291
+ func = tc.get("function", {})
292
+ name = func.get("name", "unknown")
293
+ args = func.get("arguments", "{}")
294
+ try:
295
+ json.loads(args)
296
+ except (json.JSONDecodeError, TypeError):
297
+ args = "{}"
298
+ parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
299
 
300
+ result.append({"role": "assistant", "content": "\n\n".join(parts)})
301
+ continue
302
+
303
+ # System messages with empty content get filtered out
304
+ if role == "system" and not content.strip():
305
+ continue
306
+
307
+ result.append({"role": role, "content": content})
308
+
309
+ return result
310
 
311
 
312
  # ── Headers ──────────────────────────────────────────────────────
 
481
  raise HTTPException(500, "Failed after retry")
482
 
483
 
484
+ def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls: list[dict], remaining_text: str):
485
+ """Generate OpenAI streaming chunks for tool calls. Returns list of SSE strings."""
486
+ chunks = []
487
+
488
+ for i, tc in enumerate(tool_calls):
489
+ # First chunk: role + tool_call with id, name, and start of arguments
490
+ sse_start = json.dumps({
491
+ "id": chunk_id,
492
+ "object": "chat.completion.chunk",
493
+ "created": created,
494
+ "model": model,
495
+ "choices": [{
496
+ "index": 0,
497
+ "delta": {
498
+ "role": "assistant",
499
+ "tool_calls": [{
500
+ "index": i,
501
+ "id": tc["id"],
502
+ "type": "function",
503
+ "function": {
504
+ "name": tc["function"]["name"],
505
+ "arguments": "",
506
+ }
507
+ }]
508
+ },
509
+ "finish_reason": None,
510
+ }],
511
+ })
512
+ chunks.append(f"data: {sse_start}\n\n")
513
+
514
+ # Argument chunks — split into small pieces for streaming feel
515
+ args = tc["function"]["arguments"]
516
+ chunk_size = max(1, len(args) // 3)
517
+ for offset in range(0, len(args), chunk_size):
518
+ arg_piece = args[offset:offset + chunk_size]
519
+ sse_arg = json.dumps({
520
+ "id": chunk_id,
521
+ "object": "chat.completion.chunk",
522
+ "created": created,
523
+ "model": model,
524
+ "choices": [{
525
+ "index": 0,
526
+ "delta": {
527
+ "tool_calls": [{
528
+ "index": i,
529
+ "function": {
530
+ "arguments": arg_piece,
531
+ }
532
+ }]
533
+ },
534
+ "finish_reason": None,
535
+ }],
536
+ })
537
+ chunks.append(f"data: {sse_arg}\n\n")
538
+
539
+ # If there's remaining text alongside tool calls, emit it too
540
+ if remaining_text.strip():
541
+ sse_text = json.dumps({
542
+ "id": chunk_id,
543
+ "object": "chat.completion.chunk",
544
+ "created": created,
545
+ "model": model,
546
+ "choices": [{
547
+ "index": 0,
548
+ "delta": {"content": remaining_text},
549
+ "finish_reason": None,
550
+ }],
551
+ })
552
+ chunks.append(f"data: {sse_text}\n\n")
553
+
554
+ # Final chunk with finish_reason
555
+ sse_done = json.dumps({
556
+ "id": chunk_id,
557
+ "object": "chat.completion.chunk",
558
+ "created": created,
559
+ "model": model,
560
+ "choices": [{
561
+ "index": 0,
562
+ "delta": {},
563
+ "finish_reason": "tool_calls",
564
+ }],
565
+ })
566
+ chunks.append(f"data: {sse_done}\n\n")
567
+ chunks.append("data: [DONE]\n\n")
568
+
569
+ return chunks
570
+
571
+
572
+ async def _stream_with_auto_continue(messages: list[dict], model: str):
573
  """Stream with real-time output, auto-continue, and keep-alive pings.
574
 
575
+ ALWAYS buffers the full response to detect <tool_call_> tags.
576
+ If tool calls are found, emits them as proper OpenAI tool_calls chunks.
577
+ If no tool calls, emits the text as regular content chunks.
578
  """
579
  chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
580
  created = int(time.time())
 
582
  total_content = ""
583
 
584
  for cont_num in range(MAX_CONTINUATIONS):
585
+ # Send keep-alive while we buffer
586
  yield ": thinking...\n\n"
587
 
588
  resp = None
 
598
  finish_reason = "stop"
599
  chunk_content = ""
600
 
601
+ # Buffer the full response (don't stream in real-time so we can detect tool calls)
602
  async for text, fr in _stream_one_response(resp):
603
  if fr is not None:
604
  finish_reason = fr
 
608
  chunk_content += text
609
  total_content += text
610
 
611
+ # Send keep-alive pings while buffering
612
+ yield ": streaming...\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
613
 
614
  print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
615
 
616
+ # ALWAYS check for tool calls in the accumulated text
617
+ tool_calls, remaining_text = _parse_tool_calls(total_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
+ if tool_calls:
620
+ print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
621
+ # Emit tool calls as proper OpenAI streaming chunks
622
+ for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
623
+ yield sse_chunk
624
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
+ # No tool calls found
627
+ if finish_reason == "stop":
628
+ # Stream the buffered text content as regular content chunks
629
+ chunk_sz = 50
630
+ for offset in range(0, len(total_content), chunk_sz):
631
+ piece = total_content[offset:offset + chunk_sz]
632
  sse_data = json.dumps({
633
  "id": chunk_id,
634
  "object": "chat.completion.chunk",
 
636
  "model": model,
637
  "choices": [{
638
  "index": 0,
639
+ "delta": {"content": piece},
640
+ "finish_reason": None,
641
  }],
642
  })
643
  yield f"data: {sse_data}\n\n"
 
 
644
 
645
+ # Final stop chunk
646
+ sse_data = json.dumps({
647
+ "id": chunk_id,
648
+ "object": "chat.completion.chunk",
649
+ "created": created,
650
+ "model": model,
651
+ "choices": [{
652
+ "index": 0,
653
+ "delta": {},
654
+ "finish_reason": "stop",
655
+ }],
656
+ })
657
+ yield f"data: {sse_data}\n\n"
658
+ yield "data: [DONE]\n\n"
659
+ return
 
 
660
 
661
  # Auto-continue for length-limited responses
662
  yield ": continuing...\n\n"
663
 
 
664
  if _has_incomplete_tool_call(chunk_content):
665
  conversation.append({"role": "assistant", "content": chunk_content})
666
  conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
 
688
 
689
  # ── Non-streaming with auto-continue ────────────────────────────
690
 
691
+ async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
692
  """Collect the full response, auto-continuing if cut off.
693
+ Always checks for tool calls. Returns dict with 'content' and/or 'tool_calls'."""
694
  conversation = list(messages)
695
  full_content = ""
696
 
 
709
  full_content += content
710
  print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
711
 
712
+ # Always check for tool calls
713
+ tool_calls, remaining_text = _parse_tool_calls(full_content)
 
714
 
715
+ if tool_calls:
716
+ # If there are incomplete tool calls and we got cut off, continue
717
+ if _has_incomplete_tool_call(full_content) and finish_reason == "length":
718
+ pass # fall through to auto-continue
719
+ else:
720
+ return {
721
  "tool_calls": tool_calls,
722
  "content": remaining_text if remaining_text.strip() else None,
723
  }
 
 
 
 
 
724
 
725
  if finish_reason == "stop":
 
 
 
726
  return {"content": full_content, "tool_calls": None}
727
 
728
  # Auto-continue
 
752
  model = body.get("model", "anthropic/claude-haiku-4-5")
753
  messages_raw = body.get("messages", [])
754
  stream = body.get("stream", False)
755
+
756
+ # Log request for debugging
757
+ tools_present = "tools" in body
758
+ functions_present = "functions" in body
759
+ print(f"[Request] model={model} stream={stream} tools={tools_present} functions={functions_present} msgs={len(messages_raw)}")
760
 
761
  if not messages_raw or not isinstance(messages_raw, list):
762
  raise HTTPException(400, "messages must be a non-empty array")
763
 
764
+ messages = normalize_messages(messages_raw)
 
 
765
 
766
  if not messages:
767
  raise HTTPException(400, "No valid messages after normalization")
768
 
769
  if stream:
770
  return StreamingResponse(
771
+ _stream_with_auto_continue(messages, model),
772
  media_type="text/event-stream",
773
  headers={
774
  "Cache-Control": "no-cache",
 
777
  },
778
  )
779
  else:
780
+ result = await _collect_with_auto_continue(messages, model)
781
 
782
  tool_calls = result.get("tool_calls")
783
  content = result.get("content")
 
831
  async def root():
832
  return {
833
  "status": "ok",
834
+ "version": "6.0.0",
835
  "proxy": bool(PROXY_URL),
836
  "tool_calling": True,
837
  "endpoints": ["/v1/chat/completions", "/v1/models"],