Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
README.md
CHANGED
|
@@ -11,6 +11,24 @@ app_port: 7860
|
|
| 11 |
|
| 12 |
OpenAI-compatible API proxy for Claude Haiku 4.5 via chatgpt.org.
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
## Usage
|
| 15 |
|
| 16 |
### Chat Completions (non-streaming)
|
|
@@ -36,6 +54,32 @@ curl https://YOUR_SPACE.hf.space/v1/chat/completions \
|
|
| 36 |
}'
|
| 37 |
```
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
### With OpenAI Python SDK
|
| 40 |
|
| 41 |
```python
|
|
@@ -64,7 +108,9 @@ curl https://YOUR_SPACE.hf.space/v1/models
|
|
| 64 |
|
| 65 |
| Endpoint | Description |
|
| 66 |
|---|---|
|
| 67 |
-
| `POST /v1/chat/completions` | OpenAI-compatible chat completions |
|
|
|
|
| 68 |
| `GET /v1/models` | List available models |
|
| 69 |
| `GET /health` | Health check |
|
| 70 |
| `GET /debug/session` | Session debug info |
|
|
|
|
|
|
| 11 |
|
| 12 |
OpenAI-compatible API proxy for Claude Haiku 4.5 via chatgpt.org.
|
| 13 |
|
| 14 |
+
Supports **tool/function calling**, auto-continue for the 1K token limit, rotating proxy, and SSE keep-alive.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
- **Tool/Function Calling**: Full OpenAI-compatible tool calling support. Converts `tools` definitions to system prompts, parses Claude's output for `<tool_call_>` blocks, and returns properly formatted `tool_calls` responses.
|
| 19 |
+
- **Auto-Continue**: When the upstream 1K token limit is hit, automatically continues the response with "Continue" messages.
|
| 20 |
+
- **SSE Keep-Alive**: Sends keep-alive comments during continuation gaps to prevent socket timeouts.
|
| 21 |
+
- **Rotating Proxy**: Supports unstable rotating proxies with automatic retries on connection failures.
|
| 22 |
+
- **Message Normalization**: Handles Orchids.app's content array format and converts it to plain text.
|
| 23 |
+
|
| 24 |
+
## Environment Variables
|
| 25 |
+
|
| 26 |
+
| Variable | Description | Default |
|
| 27 |
+
|---|---|---|
|
| 28 |
+
| `PROXY_URL` | Rotating proxy URL (e.g. `http://user:pass@proxy.op.wtf:32424`) | `""` (direct) |
|
| 29 |
+
|
| 30 |
+
Set these in HF Spaces > Settings > Variables and Secrets.
|
| 31 |
+
|
| 32 |
## Usage
|
| 33 |
|
| 34 |
### Chat Completions (non-streaming)
|
|
|
|
| 54 |
}'
|
| 55 |
```
|
| 56 |
|
| 57 |
+
### With Tool Calling
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
curl https://YOUR_SPACE.hf.space/v1/chat/completions \
|
| 61 |
+
-H "Content-Type: application/json" \
|
| 62 |
+
-d '{
|
| 63 |
+
"model": "anthropic/claude-haiku-4-5",
|
| 64 |
+
"messages": [{"role": "user", "content": "Create a file called hello.txt with hello world"}],
|
| 65 |
+
"tools": [{
|
| 66 |
+
"type": "function",
|
| 67 |
+
"function": {
|
| 68 |
+
"name": "Write",
|
| 69 |
+
"description": "Write content to a file",
|
| 70 |
+
"parameters": {
|
| 71 |
+
"type": "object",
|
| 72 |
+
"properties": {
|
| 73 |
+
"file_path": {"type": "string", "description": "Path to the file"},
|
| 74 |
+
"content": {"type": "string", "description": "Content to write"}
|
| 75 |
+
},
|
| 76 |
+
"required": ["file_path", "content"]
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
}]
|
| 80 |
+
}'
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
### With OpenAI Python SDK
|
| 84 |
|
| 85 |
```python
|
|
|
|
| 108 |
|
| 109 |
| Endpoint | Description |
|
| 110 |
|---|---|
|
| 111 |
+
| `POST /v1/chat/completions` | OpenAI-compatible chat completions (with tool calling) |
|
| 112 |
+
| `POST /chat/completions` | Same, without /v1 prefix |
|
| 113 |
| `GET /v1/models` | List available models |
|
| 114 |
| `GET /health` | Health check |
|
| 115 |
| `GET /debug/session` | Session debug info |
|
| 116 |
+
| `GET /debug/refresh` | Force session refresh |
|
app.py
CHANGED
|
@@ -2,9 +2,12 @@
|
|
| 2 |
Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
|
| 3 |
Deploy to Hugging Face Spaces (Docker SDK)
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import asyncio
|
|
@@ -21,7 +24,7 @@ from fastapi import FastAPI, HTTPException, Request
|
|
| 21 |
from fastapi.middleware.cors import CORSMiddleware
|
| 22 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 23 |
|
| 24 |
-
app = FastAPI(title="Haiku API", version="
|
| 25 |
|
| 26 |
# ββ CORS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
app.add_middleware(
|
|
@@ -70,10 +73,8 @@ class SessionState:
|
|
| 70 |
if self.cookies and (now - self.last_refresh) < self.refresh_interval:
|
| 71 |
return
|
| 72 |
|
| 73 |
-
# Try multiple times with proxy rotation (new IP each request)
|
| 74 |
for attempt in range(PROXY_MAX_RETRIES):
|
| 75 |
try:
|
| 76 |
-
# Create fresh client for each attempt (gets new proxy IP)
|
| 77 |
if PROXY_URL and attempt > 0:
|
| 78 |
try:
|
| 79 |
await client.aclose()
|
|
@@ -119,7 +120,7 @@ class SessionState:
|
|
| 119 |
self.csrf_token = csrf
|
| 120 |
self.last_refresh = now
|
| 121 |
print(f"[Session] OK β CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())} (attempt {attempt+1})")
|
| 122 |
-
return
|
| 123 |
|
| 124 |
except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
|
| 125 |
print(f"[Session] Proxy error attempt #{attempt+1}: {type(e).__name__}: {e}")
|
|
@@ -150,37 +151,263 @@ async def shutdown():
|
|
| 150 |
await http_client.aclose()
|
| 151 |
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
# ββ Message normalization ββββββββββββββββββββββββββββββββββββββββ
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
| 156 |
result = []
|
| 157 |
-
for msg in messages:
|
| 158 |
-
role = msg.get("role", "user")
|
| 159 |
-
content = msg.get("content", "")
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
for part in content:
|
| 164 |
-
if isinstance(part, str):
|
| 165 |
-
text_parts.append(part)
|
| 166 |
-
elif isinstance(part, dict):
|
| 167 |
-
if part.get("type") == "text":
|
| 168 |
-
text_parts.append(part.get("text", ""))
|
| 169 |
-
content = "\n".join(text_parts)
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
continue
|
| 178 |
|
| 179 |
-
result.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
return result
|
| 182 |
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
def _headers() -> dict:
|
| 185 |
h = {
|
| 186 |
"Accept": "*/*",
|
|
@@ -197,6 +424,7 @@ def _headers() -> dict:
|
|
| 197 |
|
| 198 |
|
| 199 |
# ββ Proxy-aware request with retry ββββββββββββββββββββββββββββββ
|
|
|
|
| 200 |
async def _proxy_post(url: str, **kwargs) -> httpx.Response:
|
| 201 |
"""POST with proxy retry logic. Creates new client on each retry to get fresh IP."""
|
| 202 |
global http_client
|
|
@@ -204,54 +432,26 @@ async def _proxy_post(url: str, **kwargs) -> httpx.Response:
|
|
| 204 |
for attempt in range(PROXY_MAX_RETRIES):
|
| 205 |
try:
|
| 206 |
resp = await http_client.post(url, **kwargs)
|
| 207 |
-
|
| 208 |
-
# Proxy returned a non-connection error β return it
|
| 209 |
return resp
|
| 210 |
|
| 211 |
except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
|
| 212 |
print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")
|
| 213 |
-
# Recreate client with new proxy IP
|
| 214 |
if PROXY_URL:
|
| 215 |
try:
|
| 216 |
await http_client.aclose()
|
| 217 |
except:
|
| 218 |
pass
|
| 219 |
http_client = _make_client()
|
| 220 |
-
# Re-apply session cookies
|
| 221 |
await asyncio.sleep(PROXY_RETRY_DELAY)
|
| 222 |
else:
|
| 223 |
await asyncio.sleep(2)
|
| 224 |
continue
|
| 225 |
|
| 226 |
-
# All retries exhausted β return last attempt anyway
|
| 227 |
return await http_client.post(url, **kwargs)
|
| 228 |
|
| 229 |
|
| 230 |
-
async def _proxy_get(url: str, **kwargs) -> httpx.Response:
|
| 231 |
-
"""GET with proxy retry logic."""
|
| 232 |
-
global http_client
|
| 233 |
-
|
| 234 |
-
for attempt in range(PROXY_MAX_RETRIES):
|
| 235 |
-
try:
|
| 236 |
-
resp = await http_client.get(url, **kwargs)
|
| 237 |
-
return resp
|
| 238 |
-
except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
|
| 239 |
-
print(f"[Proxy] GET error #{attempt+1}: {type(e).__name__}")
|
| 240 |
-
if PROXY_URL:
|
| 241 |
-
try:
|
| 242 |
-
await http_client.aclose()
|
| 243 |
-
except:
|
| 244 |
-
pass
|
| 245 |
-
http_client = _make_client()
|
| 246 |
-
await asyncio.sleep(PROXY_RETRY_DELAY)
|
| 247 |
-
else:
|
| 248 |
-
await asyncio.sleep(2)
|
| 249 |
-
continue
|
| 250 |
-
|
| 251 |
-
return await http_client.get(url, **kwargs)
|
| 252 |
-
|
| 253 |
-
|
| 254 |
# ββ Raw call with retries βββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 255 |
async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
|
| 256 |
"""Make a single POST to chatgpt.org/api/chat with full retry logic."""
|
| 257 |
await session.refresh(http_client)
|
|
@@ -268,7 +468,7 @@ async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
|
|
| 268 |
)
|
| 269 |
|
| 270 |
if resp.status_code == 419 and attempt == 0:
|
| 271 |
-
print("[Chat] 419
|
| 272 |
session.last_refresh = 0
|
| 273 |
await session.refresh(http_client)
|
| 274 |
break
|
|
@@ -293,7 +493,8 @@ async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
|
|
| 293 |
|
| 294 |
|
| 295 |
async def _stream_one_response(resp):
|
| 296 |
-
"""Stream a single upstream SSE response in real-time.
|
|
|
|
| 297 |
finish_reason = None
|
| 298 |
|
| 299 |
async for raw_line in resp.aiter_lines():
|
|
@@ -331,14 +532,15 @@ async def _stream_one_response(resp):
|
|
| 331 |
# ββ Streaming with auto-continue ββββββββββββββββββββββββββββββββ
|
| 332 |
MAX_CONTINUATIONS = 20
|
| 333 |
|
|
|
|
| 334 |
async def _raw_call_streaming(messages: list[dict], model: str):
|
| 335 |
-
"""Like _raw_call but yields SSE keep-alive comments during retries
|
|
|
|
| 336 |
await session.refresh(http_client)
|
| 337 |
payload = {"model": model, "messages": messages}
|
| 338 |
|
| 339 |
for attempt in range(2): # CSRF retry
|
| 340 |
for rate_attempt in range(3): # 429 retry
|
| 341 |
-
# Keep-alive before request
|
| 342 |
yield ": thinking...\n\n"
|
| 343 |
|
| 344 |
resp = await _proxy_post(
|
|
@@ -349,7 +551,7 @@ async def _raw_call_streaming(messages: list[dict], model: str):
|
|
| 349 |
)
|
| 350 |
|
| 351 |
if resp.status_code == 419 and attempt == 0:
|
| 352 |
-
print("[Chat] 419
|
| 353 |
session.last_refresh = 0
|
| 354 |
await session.refresh(http_client)
|
| 355 |
break
|
|
@@ -376,8 +578,13 @@ async def _raw_call_streaming(messages: list[dict], model: str):
|
|
| 376 |
raise HTTPException(500, "Failed after retry")
|
| 377 |
|
| 378 |
|
| 379 |
-
async def _stream_with_auto_continue(messages: list[dict], model: str):
|
| 380 |
-
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
| 382 |
created = int(time.time())
|
| 383 |
conversation = list(messages)
|
|
@@ -407,6 +614,132 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 407 |
if text:
|
| 408 |
chunk_content += text
|
| 409 |
total_content += text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
sse_data = json.dumps({
|
| 411 |
"id": chunk_id,
|
| 412 |
"object": "chat.completion.chunk",
|
|
@@ -414,38 +747,46 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 414 |
"model": model,
|
| 415 |
"choices": [{
|
| 416 |
"index": 0,
|
| 417 |
-
"delta": {
|
| 418 |
-
"finish_reason":
|
| 419 |
}],
|
| 420 |
})
|
| 421 |
yield f"data: {sse_data}\n\n"
|
|
|
|
|
|
|
| 422 |
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
|
|
|
|
| 441 |
yield ": continuing...\n\n"
|
| 442 |
|
| 443 |
-
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
|
| 447 |
|
| 448 |
-
# Safety
|
| 449 |
sse_data = json.dumps({
|
| 450 |
"id": chunk_id,
|
| 451 |
"object": "chat.completion.chunk",
|
|
@@ -462,8 +803,10 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
|
|
| 462 |
|
| 463 |
|
| 464 |
# ββ Non-streaming with auto-continue ββββββββββββββββββββββββββββ
|
| 465 |
-
|
| 466 |
-
|
|
|
|
|
|
|
| 467 |
conversation = list(messages)
|
| 468 |
full_content = ""
|
| 469 |
|
|
@@ -482,16 +825,40 @@ async def _collect_with_auto_continue(messages: list[dict], model: str) -> str:
|
|
| 482 |
full_content += content
|
| 483 |
print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
if finish_reason == "stop":
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
-
return full_content
|
| 492 |
|
| 493 |
|
| 494 |
# ββ OpenAI-compatible endpoint ββββββββββββββββββββββββββββββββββ
|
|
|
|
| 495 |
@app.post("/v1/chat/completions")
|
| 496 |
@app.post("/chat/completions")
|
| 497 |
async def chat_completions(request: Request):
|
|
@@ -506,18 +873,22 @@ async def chat_completions(request: Request):
|
|
| 506 |
model = body.get("model", "anthropic/claude-haiku-4-5")
|
| 507 |
messages_raw = body.get("messages", [])
|
| 508 |
stream = body.get("stream", False)
|
|
|
|
|
|
|
| 509 |
|
| 510 |
if not messages_raw or not isinstance(messages_raw, list):
|
| 511 |
raise HTTPException(400, "messages must be a non-empty array")
|
| 512 |
|
| 513 |
-
|
|
|
|
|
|
|
| 514 |
|
| 515 |
if not messages:
|
| 516 |
raise HTTPException(400, "No valid messages after normalization")
|
| 517 |
|
| 518 |
if stream:
|
| 519 |
return StreamingResponse(
|
| 520 |
-
_stream_with_auto_continue(messages, model),
|
| 521 |
media_type="text/event-stream",
|
| 522 |
headers={
|
| 523 |
"Cache-Control": "no-cache",
|
|
@@ -526,22 +897,45 @@ async def chat_completions(request: Request):
|
|
| 526 |
},
|
| 527 |
)
|
| 528 |
else:
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
"
|
| 537 |
-
"
|
| 538 |
-
"
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
|
| 544 |
# ββ Models / Health βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 545 |
@app.get("/v1/models")
|
| 546 |
@app.get("/models")
|
| 547 |
async def list_models():
|
|
@@ -557,8 +951,9 @@ async def list_models():
|
|
| 557 |
async def root():
|
| 558 |
return {
|
| 559 |
"status": "ok",
|
| 560 |
-
"version": "
|
| 561 |
"proxy": bool(PROXY_URL),
|
|
|
|
| 562 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|
| 563 |
}
|
| 564 |
|
|
|
|
| 2 |
Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat
|
| 3 |
Deploy to Hugging Face Spaces (Docker SDK)
|
| 4 |
|
| 5 |
+
Features:
|
| 6 |
+
- Tool/function calling support (converts OpenAI tools β system prompt, parses output)
|
| 7 |
+
- Auto-continues when upstream hits the ~1K token output limit
|
| 8 |
+
- Rotating proxy with aggressive retries for unstable IPs
|
| 9 |
+
- SSE keep-alive comments during continuation gaps
|
| 10 |
+
- Message normalization for Orchids.app compatibility
|
| 11 |
"""
|
| 12 |
|
| 13 |
import asyncio
|
|
|
|
| 24 |
from fastapi.middleware.cors import CORSMiddleware
|
| 25 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 26 |
|
| 27 |
+
app = FastAPI(title="Haiku API", version="5.0.0")
|
| 28 |
|
| 29 |
# ββ CORS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
app.add_middleware(
|
|
|
|
| 73 |
if self.cookies and (now - self.last_refresh) < self.refresh_interval:
|
| 74 |
return
|
| 75 |
|
|
|
|
| 76 |
for attempt in range(PROXY_MAX_RETRIES):
|
| 77 |
try:
|
|
|
|
| 78 |
if PROXY_URL and attempt > 0:
|
| 79 |
try:
|
| 80 |
await client.aclose()
|
|
|
|
| 120 |
self.csrf_token = csrf
|
| 121 |
self.last_refresh = now
|
| 122 |
print(f"[Session] OK β CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())} (attempt {attempt+1})")
|
| 123 |
+
return
|
| 124 |
|
| 125 |
except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
|
| 126 |
print(f"[Session] Proxy error attempt #{attempt+1}: {type(e).__name__}: {e}")
|
|
|
|
| 151 |
await http_client.aclose()
|
| 152 |
|
| 153 |
|
| 154 |
+
# ββ Tool Calling Support βββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
|
| 156 |
+
def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
|
| 157 |
+
"""Convert OpenAI tools format to a system prompt that instructs Claude
|
| 158 |
+
to output tool calls in a parseable format."""
|
| 159 |
+
|
| 160 |
+
tools_desc = []
|
| 161 |
+
for tool in tools:
|
| 162 |
+
func = tool.get("function", {})
|
| 163 |
+
name = func.get("name", "unknown")
|
| 164 |
+
desc = func.get("description", "No description")
|
| 165 |
+
params = func.get("parameters", {})
|
| 166 |
+
|
| 167 |
+
# Format parameters nicely
|
| 168 |
+
props = params.get("properties", {})
|
| 169 |
+
required = params.get("required", [])
|
| 170 |
+
param_lines = []
|
| 171 |
+
for pname, pdef in props.items():
|
| 172 |
+
ptype = pdef.get("type", "any")
|
| 173 |
+
pdesc = pdef.get("description", "")
|
| 174 |
+
req_flag = " (required)" if pname in required else " (optional)"
|
| 175 |
+
param_lines.append(f" - {pname}: {ptype}{req_flag} β {pdesc}")
|
| 176 |
+
|
| 177 |
+
params_text = "\n".join(param_lines) if param_lines else " (no parameters)"
|
| 178 |
+
tools_desc.append(f"### {name}\n{desc}\nParameters:\n{params_text}")
|
| 179 |
+
|
| 180 |
+
tools_text = "\n\n".join(tools_desc)
|
| 181 |
+
|
| 182 |
+
# Handle tool_choice
|
| 183 |
+
choice_instruction = ""
|
| 184 |
+
if tool_choice == "required":
|
| 185 |
+
choice_instruction = "\nIMPORTANT: You MUST call at least one tool. Do not respond with just text."
|
| 186 |
+
elif tool_choice == "none":
|
| 187 |
+
# Shouldn't reach here since we skip tool injection for "none"
|
| 188 |
+
choice_instruction = "\nDo NOT call any tools. Respond with text only."
|
| 189 |
+
elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
|
| 190 |
+
fname = tool_choice.get("function", {}).get("name", "")
|
| 191 |
+
choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
|
| 192 |
+
|
| 193 |
+
return f"""# Available Tools
|
| 194 |
+
|
| 195 |
+
You have access to the following tools that you can call:
|
| 196 |
+
|
| 197 |
+
{tools_text}
|
| 198 |
+
|
| 199 |
+
## Tool Call Format
|
| 200 |
+
|
| 201 |
+
When you want to call a tool, you MUST use EXACTLY this XML format β one block per tool call:
|
| 202 |
+
|
| 203 |
+
<tool_call name="FUNCTION_NAME">
|
| 204 |
+
{"{"}"param1": "value1", "param2": "value2"{"}"}
|
| 205 |
+
</tool_call_>
|
| 206 |
+
|
| 207 |
+
Example β calling the Write tool:
|
| 208 |
+
<tool_call name="Write">
|
| 209 |
+
{"{"}"file_path": "hello.txt", "content": "hello world"{"}"}
|
| 210 |
+
</tool_call_>
|
| 211 |
+
|
| 212 |
+
## Rules
|
| 213 |
+
- You may call multiple tools by using multiple <tool_call_> blocks in sequence
|
| 214 |
+
- The arguments inside the block MUST be valid JSON matching the tool's parameter schema
|
| 215 |
+
- If you need to call a tool, output ONLY <tool_call_> blocks β no explanatory text before or after
|
| 216 |
+
- If you don't need to call any tools, just respond normally with text (no <tool_call_> blocks)
|
| 217 |
+
- Do NOT wrap <tool_call_> blocks in markdown code blocks or any other formatting
|
| 218 |
+
{choice_instruction}"""
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# Regex to parse <tool_call name="...">...</tool_call_> blocks
|
| 222 |
+
_TOOL_CALL_RE = re.compile(
|
| 223 |
+
r'<tool_call\s+name="([^"]+)">\s*(.*?)\s*</tool_call_>',
|
| 224 |
+
re.DOTALL
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
# Also try matching incomplete tool calls (for auto-continue detection)
|
| 228 |
+
_INCOMPLETE_TOOL_CALL_RE = re.compile(
|
| 229 |
+
r'<tool_call\s+name="([^"]+)">\s*(.*?)$',
|
| 230 |
+
re.DOTALL
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
|
| 235 |
+
"""Parse tool calls from model text output.
|
| 236 |
+
|
| 237 |
+
Returns (tool_calls, remaining_text) where tool_calls is in OpenAI format.
|
| 238 |
+
If no tool calls found, returns ([], original_text).
|
| 239 |
+
"""
|
| 240 |
+
matches = list(_TOOL_CALL_RE.finditer(text))
|
| 241 |
+
|
| 242 |
+
if not matches:
|
| 243 |
+
return [], text
|
| 244 |
+
|
| 245 |
+
tool_calls = []
|
| 246 |
+
# Collect text outside of tool call blocks
|
| 247 |
+
remaining_parts = []
|
| 248 |
+
|
| 249 |
+
last_end = 0
|
| 250 |
+
for match in matches:
|
| 251 |
+
# Text before this tool call
|
| 252 |
+
if match.start() > last_end:
|
| 253 |
+
before = text[last_end:match.start()].strip()
|
| 254 |
+
if before:
|
| 255 |
+
remaining_parts.append(before)
|
| 256 |
+
last_end = match.end()
|
| 257 |
+
|
| 258 |
+
func_name = match.group(1)
|
| 259 |
+
args_str = match.group(2).strip()
|
| 260 |
+
|
| 261 |
+
# Try to parse arguments as JSON
|
| 262 |
+
try:
|
| 263 |
+
args_json = json.loads(args_str)
|
| 264 |
+
args_final = json.dumps(args_json)
|
| 265 |
+
except json.JSONDecodeError:
|
| 266 |
+
# Try to fix common issues
|
| 267 |
+
# Sometimes Claude wraps args in markdown code block
|
| 268 |
+
args_cleaned = args_str.strip('`').strip()
|
| 269 |
+
if args_cleaned.startswith('json'):
|
| 270 |
+
args_cleaned = args_cleaned[4:].strip()
|
| 271 |
+
try:
|
| 272 |
+
args_json = json.loads(args_cleaned)
|
| 273 |
+
args_final = json.dumps(args_json)
|
| 274 |
+
except json.JSONDecodeError:
|
| 275 |
+
# Last resort: wrap the raw text as an argument
|
| 276 |
+
args_final = json.dumps({"raw_input": args_str})
|
| 277 |
+
|
| 278 |
+
tool_calls.append({
|
| 279 |
+
"id": f"call_{uuid.uuid4().hex[:24]}",
|
| 280 |
+
"type": "function",
|
| 281 |
+
"function": {
|
| 282 |
+
"name": func_name,
|
| 283 |
+
"arguments": args_final,
|
| 284 |
+
}
|
| 285 |
+
})
|
| 286 |
+
|
| 287 |
+
# Text after the last tool call
|
| 288 |
+
if last_end < len(text):
|
| 289 |
+
after = text[last_end:].strip()
|
| 290 |
+
if after:
|
| 291 |
+
remaining_parts.append(after)
|
| 292 |
+
|
| 293 |
+
remaining_text = "\n".join(remaining_parts)
|
| 294 |
+
return tool_calls, remaining_text
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def _has_incomplete_tool_call(text: str) -> bool:
|
| 298 |
+
"""Check if text has an opening <tool_call_>> tag without a matching close."""
|
| 299 |
+
opens = len(re.findall(r'<tool_call\s+name="[^"]+">', text))
|
| 300 |
+
closes = len(re.findall(r'</tool_call_>', text))
|
| 301 |
+
return opens > closes
|
| 302 |
+
|
| 303 |
+
|
| 304 |
# ββ Message normalization ββββββββββββββββββββββββββββββββββββββββ
|
| 305 |
+
|
| 306 |
+
def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choice=None) -> list[dict]:
|
| 307 |
+
"""Normalize messages: handle content arrays, tool roles, tool_calls,
|
| 308 |
+
and inject tool definitions into system prompt if tools are provided."""
|
| 309 |
result = []
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
+
# If tools provided and tool_choice != "none", inject tool system prompt
|
| 312 |
+
inject_tools = tools and tool_choice != "none"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
+
if inject_tools:
|
| 315 |
+
tool_system = _build_tool_system_prompt(tools, tool_choice)
|
| 316 |
+
else:
|
| 317 |
+
tool_system = None
|
| 318 |
|
| 319 |
+
system_injected = False
|
| 320 |
|
| 321 |
+
for msg in messages:
|
| 322 |
+
role = msg.get("role", "user")
|
| 323 |
+
|
| 324 |
+
# Inject tool system prompt before or as the first system message
|
| 325 |
+
if role == "system" and not system_injected and tool_system:
|
| 326 |
+
content = msg.get("content", "")
|
| 327 |
+
if isinstance(content, list):
|
| 328 |
+
content = _flatten_content_array(content)
|
| 329 |
+
content = str(content) if content else ""
|
| 330 |
+
combined = content + "\n\n" + tool_system if content.strip() else tool_system
|
| 331 |
+
result.append({"role": "system", "content": combined})
|
| 332 |
+
system_injected = True
|
| 333 |
continue
|
| 334 |
|
| 335 |
+
result.append(_normalize_one_message(msg))
|
| 336 |
+
|
| 337 |
+
# If no system message existed, add tool system prompt as first message
|
| 338 |
+
if tool_system and not system_injected:
|
| 339 |
+
result.insert(0, {"role": "system", "content": tool_system})
|
| 340 |
+
|
| 341 |
+
# Filter out empty system messages
|
| 342 |
+
result = [m for m in result if not (m.get("role") == "system" and not m.get("content", "").strip())]
|
| 343 |
|
| 344 |
return result
|
| 345 |
|
| 346 |
|
| 347 |
+
def _flatten_content_array(content: list) -> str:
|
| 348 |
+
"""Convert a content array to plain text."""
|
| 349 |
+
text_parts = []
|
| 350 |
+
for part in content:
|
| 351 |
+
if isinstance(part, str):
|
| 352 |
+
text_parts.append(part)
|
| 353 |
+
elif isinstance(part, dict):
|
| 354 |
+
if part.get("type") == "text":
|
| 355 |
+
text_parts.append(part.get("text", ""))
|
| 356 |
+
return "\n".join(text_parts)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def _normalize_one_message(msg: dict) -> dict:
|
| 360 |
+
"""Normalize a single message for chatgpt.org API."""
|
| 361 |
+
role = msg.get("role", "user")
|
| 362 |
+
content = msg.get("content", "")
|
| 363 |
+
|
| 364 |
+
# Handle content arrays β plain text
|
| 365 |
+
if isinstance(content, list):
|
| 366 |
+
content = _flatten_content_array(content)
|
| 367 |
+
|
| 368 |
+
if content is None:
|
| 369 |
+
content = ""
|
| 370 |
+
content = str(content)
|
| 371 |
+
|
| 372 |
+
# Handle tool role messages β convert to user message with tool result
|
| 373 |
+
if role == "tool":
|
| 374 |
+
tool_name = msg.get("name", "unknown_tool")
|
| 375 |
+
tool_call_id = msg.get("tool_call_id", "")
|
| 376 |
+
return {
|
| 377 |
+
"role": "user",
|
| 378 |
+
"content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}"
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
# Handle assistant messages with tool_calls β text with <tool_call_> blocks
|
| 382 |
+
if role == "assistant" and msg.get("tool_calls"):
|
| 383 |
+
parts = []
|
| 384 |
+
regular_content = content if content and content.strip() else ""
|
| 385 |
+
|
| 386 |
+
if regular_content:
|
| 387 |
+
parts.append(regular_content)
|
| 388 |
+
|
| 389 |
+
for tc in msg["tool_calls"]:
|
| 390 |
+
func = tc.get("function", {})
|
| 391 |
+
name = func.get("name", "unknown")
|
| 392 |
+
args = func.get("arguments", "{}")
|
| 393 |
+
# Validate args is valid JSON
|
| 394 |
+
try:
|
| 395 |
+
json.loads(args)
|
| 396 |
+
except (json.JSONDecodeError, TypeError):
|
| 397 |
+
args = "{}"
|
| 398 |
+
parts.append(f'<tool_call name="{name}">\n{args}\n</tool_call_>')
|
| 399 |
+
|
| 400 |
+
return {"role": "assistant", "content": "\n\n".join(parts)}
|
| 401 |
+
|
| 402 |
+
# System messages with empty content get filtered out later
|
| 403 |
+
if role == "system" and not content.strip():
|
| 404 |
+
return {"role": "system", "content": ""}
|
| 405 |
+
|
| 406 |
+
return {"role": role, "content": content}
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
# ββ Headers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 410 |
+
|
| 411 |
def _headers() -> dict:
|
| 412 |
h = {
|
| 413 |
"Accept": "*/*",
|
|
|
|
| 424 |
|
| 425 |
|
| 426 |
# ββ Proxy-aware request with retry ββββββββββββββββββββββββββββββ
|
| 427 |
+
|
| 428 |
async def _proxy_post(url: str, **kwargs) -> httpx.Response:
|
| 429 |
"""POST with proxy retry logic. Creates new client on each retry to get fresh IP."""
|
| 430 |
global http_client
|
|
|
|
| 432 |
for attempt in range(PROXY_MAX_RETRIES):
|
| 433 |
try:
|
| 434 |
resp = await http_client.post(url, **kwargs)
|
|
|
|
|
|
|
| 435 |
return resp
|
| 436 |
|
| 437 |
except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
|
| 438 |
print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")
|
|
|
|
| 439 |
if PROXY_URL:
|
| 440 |
try:
|
| 441 |
await http_client.aclose()
|
| 442 |
except:
|
| 443 |
pass
|
| 444 |
http_client = _make_client()
|
|
|
|
| 445 |
await asyncio.sleep(PROXY_RETRY_DELAY)
|
| 446 |
else:
|
| 447 |
await asyncio.sleep(2)
|
| 448 |
continue
|
| 449 |
|
|
|
|
| 450 |
return await http_client.post(url, **kwargs)
|
| 451 |
|
| 452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
# ββ Raw call with retries βββββββββββββββββββββββββββββββββββββββ
|
| 454 |
+
|
| 455 |
async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
|
| 456 |
"""Make a single POST to chatgpt.org/api/chat with full retry logic."""
|
| 457 |
await session.refresh(http_client)
|
|
|
|
| 468 |
)
|
| 469 |
|
| 470 |
if resp.status_code == 419 and attempt == 0:
|
| 471 |
+
print("[Chat] 419 -> refreshing session...")
|
| 472 |
session.last_refresh = 0
|
| 473 |
await session.refresh(http_client)
|
| 474 |
break
|
|
|
|
| 493 |
|
| 494 |
|
| 495 |
async def _stream_one_response(resp):
|
| 496 |
+
"""Stream a single upstream SSE response in real-time.
|
| 497 |
+
Yields (text, finish_reason) tuples. finish_reason is None for text chunks."""
|
| 498 |
finish_reason = None
|
| 499 |
|
| 500 |
async for raw_line in resp.aiter_lines():
|
|
|
|
| 532 |
# ββ Streaming with auto-continue ββββββββββββββββββββββββββββββββ
|
| 533 |
MAX_CONTINUATIONS = 20
|
| 534 |
|
| 535 |
+
|
| 536 |
async def _raw_call_streaming(messages: list[dict], model: str):
|
| 537 |
+
"""Like _raw_call but yields SSE keep-alive comments during retries,
|
| 538 |
+
then yields the httpx.Response object."""
|
| 539 |
await session.refresh(http_client)
|
| 540 |
payload = {"model": model, "messages": messages}
|
| 541 |
|
| 542 |
for attempt in range(2): # CSRF retry
|
| 543 |
for rate_attempt in range(3): # 429 retry
|
|
|
|
| 544 |
yield ": thinking...\n\n"
|
| 545 |
|
| 546 |
resp = await _proxy_post(
|
|
|
|
| 551 |
)
|
| 552 |
|
| 553 |
if resp.status_code == 419 and attempt == 0:
|
| 554 |
+
print("[Chat] 419 -> refreshing session...")
|
| 555 |
session.last_refresh = 0
|
| 556 |
await session.refresh(http_client)
|
| 557 |
break
|
|
|
|
| 578 |
raise HTTPException(500, "Failed after retry")
|
| 579 |
|
| 580 |
|
| 581 |
+
async def _stream_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False):
|
| 582 |
+
"""Stream with real-time output, auto-continue, and keep-alive pings.
|
| 583 |
+
|
| 584 |
+
When has_tools is True, we buffer the full response to properly detect
|
| 585 |
+
and format tool calls, sending keep-alive pings while buffering.
|
| 586 |
+
When has_tools is False, we stream text in real-time.
|
| 587 |
+
"""
|
| 588 |
chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
| 589 |
created = int(time.time())
|
| 590 |
conversation = list(messages)
|
|
|
|
| 614 |
if text:
|
| 615 |
chunk_content += text
|
| 616 |
total_content += text
|
| 617 |
+
|
| 618 |
+
# If no tools, stream text in real-time
|
| 619 |
+
if not has_tools:
|
| 620 |
+
sse_data = json.dumps({
|
| 621 |
+
"id": chunk_id,
|
| 622 |
+
"object": "chat.completion.chunk",
|
| 623 |
+
"created": created,
|
| 624 |
+
"model": model,
|
| 625 |
+
"choices": [{
|
| 626 |
+
"index": 0,
|
| 627 |
+
"delta": {"content": text},
|
| 628 |
+
"finish_reason": None,
|
| 629 |
+
}],
|
| 630 |
+
})
|
| 631 |
+
yield f"data: {sse_data}\n\n"
|
| 632 |
+
|
| 633 |
+
print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
|
| 634 |
+
|
| 635 |
+
# Check for tool calls
|
| 636 |
+
if has_tools:
|
| 637 |
+
tool_calls, remaining_text = _parse_tool_calls(total_content)
|
| 638 |
+
|
| 639 |
+
if tool_calls:
|
| 640 |
+
# Emit tool calls as OpenAI streaming chunks
|
| 641 |
+
for i, tc in enumerate(tool_calls):
|
| 642 |
+
# First chunk: role + tool_call with id, name, and start of arguments
|
| 643 |
+
sse_start = json.dumps({
|
| 644 |
+
"id": chunk_id,
|
| 645 |
+
"object": "chat.completion.chunk",
|
| 646 |
+
"created": created,
|
| 647 |
+
"model": model,
|
| 648 |
+
"choices": [{
|
| 649 |
+
"index": 0,
|
| 650 |
+
"delta": {
|
| 651 |
+
"role": "assistant",
|
| 652 |
+
"tool_calls": [{
|
| 653 |
+
"index": i,
|
| 654 |
+
"id": tc["id"],
|
| 655 |
+
"type": "function",
|
| 656 |
+
"function": {
|
| 657 |
+
"name": tc["function"]["name"],
|
| 658 |
+
"arguments": "",
|
| 659 |
+
}
|
| 660 |
+
}]
|
| 661 |
+
},
|
| 662 |
+
"finish_reason": None,
|
| 663 |
+
}],
|
| 664 |
+
})
|
| 665 |
+
yield f"data: {sse_start}\n\n"
|
| 666 |
+
|
| 667 |
+
# Argument chunks β split into small pieces for streaming feel
|
| 668 |
+
args = tc["function"]["arguments"]
|
| 669 |
+
chunk_size = max(1, len(args) // 3)
|
| 670 |
+
for offset in range(0, len(args), chunk_size):
|
| 671 |
+
arg_piece = args[offset:offset + chunk_size]
|
| 672 |
+
sse_arg = json.dumps({
|
| 673 |
+
"id": chunk_id,
|
| 674 |
+
"object": "chat.completion.chunk",
|
| 675 |
+
"created": created,
|
| 676 |
+
"model": model,
|
| 677 |
+
"choices": [{
|
| 678 |
+
"index": 0,
|
| 679 |
+
"delta": {
|
| 680 |
+
"tool_calls": [{
|
| 681 |
+
"index": i,
|
| 682 |
+
"function": {
|
| 683 |
+
"arguments": arg_piece,
|
| 684 |
+
}
|
| 685 |
+
}]
|
| 686 |
+
},
|
| 687 |
+
"finish_reason": None,
|
| 688 |
+
}],
|
| 689 |
+
})
|
| 690 |
+
yield f"data: {sse_arg}\n\n"
|
| 691 |
+
|
| 692 |
+
# If there's remaining text alongside tool calls, emit it too
|
| 693 |
+
if remaining_text.strip():
|
| 694 |
+
sse_text = json.dumps({
|
| 695 |
+
"id": chunk_id,
|
| 696 |
+
"object": "chat.completion.chunk",
|
| 697 |
+
"created": created,
|
| 698 |
+
"model": model,
|
| 699 |
+
"choices": [{
|
| 700 |
+
"index": 0,
|
| 701 |
+
"delta": {"content": remaining_text},
|
| 702 |
+
"finish_reason": None,
|
| 703 |
+
}],
|
| 704 |
+
})
|
| 705 |
+
yield f"data: {sse_text}\n\n"
|
| 706 |
+
|
| 707 |
+
# Final chunk with finish_reason
|
| 708 |
+
sse_done = json.dumps({
|
| 709 |
+
"id": chunk_id,
|
| 710 |
+
"object": "chat.completion.chunk",
|
| 711 |
+
"created": created,
|
| 712 |
+
"model": model,
|
| 713 |
+
"choices": [{
|
| 714 |
+
"index": 0,
|
| 715 |
+
"delta": {},
|
| 716 |
+
"finish_reason": "tool_calls",
|
| 717 |
+
}],
|
| 718 |
+
})
|
| 719 |
+
yield f"data: {sse_done}\n\n"
|
| 720 |
+
yield "data: [DONE]\n\n"
|
| 721 |
+
return
|
| 722 |
+
|
| 723 |
+
# No tool calls found β if text is complete, stream it as content
|
| 724 |
+
if finish_reason == "stop":
|
| 725 |
+
# Stream the buffered text content as chunks
|
| 726 |
+
text_to_stream = total_content
|
| 727 |
+
chunk_sz = 50 # characters per streaming chunk
|
| 728 |
+
for offset in range(0, len(text_to_stream), chunk_sz):
|
| 729 |
+
piece = text_to_stream[offset:offset + chunk_sz]
|
| 730 |
+
sse_data = json.dumps({
|
| 731 |
+
"id": chunk_id,
|
| 732 |
+
"object": "chat.completion.chunk",
|
| 733 |
+
"created": created,
|
| 734 |
+
"model": model,
|
| 735 |
+
"choices": [{
|
| 736 |
+
"index": 0,
|
| 737 |
+
"delta": {"content": piece},
|
| 738 |
+
"finish_reason": None,
|
| 739 |
+
}],
|
| 740 |
+
})
|
| 741 |
+
yield f"data: {sse_data}\n\n"
|
| 742 |
+
|
| 743 |
sse_data = json.dumps({
|
| 744 |
"id": chunk_id,
|
| 745 |
"object": "chat.completion.chunk",
|
|
|
|
| 747 |
"model": model,
|
| 748 |
"choices": [{
|
| 749 |
"index": 0,
|
| 750 |
+
"delta": {},
|
| 751 |
+
"finish_reason": "stop",
|
| 752 |
}],
|
| 753 |
})
|
| 754 |
yield f"data: {sse_data}\n\n"
|
| 755 |
+
yield "data: [DONE]\n\n"
|
| 756 |
+
return
|
| 757 |
|
| 758 |
+
else:
|
| 759 |
+
# No tools β original behavior
|
| 760 |
+
if finish_reason == "stop":
|
| 761 |
+
sse_data = json.dumps({
|
| 762 |
+
"id": chunk_id,
|
| 763 |
+
"object": "chat.completion.chunk",
|
| 764 |
+
"created": created,
|
| 765 |
+
"model": model,
|
| 766 |
+
"choices": [{
|
| 767 |
+
"index": 0,
|
| 768 |
+
"delta": {},
|
| 769 |
+
"finish_reason": "stop",
|
| 770 |
+
}],
|
| 771 |
+
})
|
| 772 |
+
yield f"data: {sse_data}\n\n"
|
| 773 |
+
yield "data: [DONE]\n\n"
|
| 774 |
+
return
|
| 775 |
|
| 776 |
+
# Auto-continue for length-limited responses
|
| 777 |
yield ": continuing...\n\n"
|
| 778 |
|
| 779 |
+
# Check if we're in the middle of a tool call
|
| 780 |
+
if _has_incomplete_tool_call(chunk_content):
|
| 781 |
+
conversation.append({"role": "assistant", "content": chunk_content})
|
| 782 |
+
conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
|
| 783 |
+
else:
|
| 784 |
+
conversation.append({"role": "assistant", "content": chunk_content})
|
| 785 |
+
conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
|
| 786 |
|
| 787 |
print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
|
| 788 |
|
| 789 |
+
# Safety: max continuations reached
|
| 790 |
sse_data = json.dumps({
|
| 791 |
"id": chunk_id,
|
| 792 |
"object": "chat.completion.chunk",
|
|
|
|
| 803 |
|
| 804 |
|
| 805 |
# ββ Non-streaming with auto-continue ββββββββββββββββββββββββββββ
|
| 806 |
+
|
| 807 |
+
async def _collect_with_auto_continue(messages: list[dict], model: str, has_tools: bool = False) -> dict:
|
| 808 |
+
"""Collect the full response, auto-continuing if cut off.
|
| 809 |
+
Returns a dict with either 'content' or 'tool_calls' key."""
|
| 810 |
conversation = list(messages)
|
| 811 |
full_content = ""
|
| 812 |
|
|
|
|
| 825 |
full_content += content
|
| 826 |
print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}")
|
| 827 |
|
| 828 |
+
# Check for tool calls if tools were provided
|
| 829 |
+
if has_tools:
|
| 830 |
+
tool_calls, remaining_text = _parse_tool_calls(full_content)
|
| 831 |
+
|
| 832 |
+
if tool_calls:
|
| 833 |
+
result = {
|
| 834 |
+
"tool_calls": tool_calls,
|
| 835 |
+
"content": remaining_text if remaining_text.strip() else None,
|
| 836 |
+
}
|
| 837 |
+
# If there are incomplete tool calls, continue
|
| 838 |
+
if _has_incomplete_tool_call(full_content) and finish_reason == "length":
|
| 839 |
+
pass # fall through to auto-continue
|
| 840 |
+
else:
|
| 841 |
+
return result
|
| 842 |
+
|
| 843 |
if finish_reason == "stop":
|
| 844 |
+
if has_tools:
|
| 845 |
+
# No tool calls found, return as text
|
| 846 |
+
return {"content": full_content, "tool_calls": None}
|
| 847 |
+
return {"content": full_content, "tool_calls": None}
|
| 848 |
|
| 849 |
+
# Auto-continue
|
| 850 |
+
if _has_incomplete_tool_call(content):
|
| 851 |
+
conversation.append({"role": "assistant", "content": content})
|
| 852 |
+
conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."})
|
| 853 |
+
else:
|
| 854 |
+
conversation.append({"role": "assistant", "content": content})
|
| 855 |
+
conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
|
| 856 |
|
| 857 |
+
return {"content": full_content, "tool_calls": None}
|
| 858 |
|
| 859 |
|
| 860 |
# ββ OpenAI-compatible endpoint ββββββββββββββββββββββββββββββββββ
|
| 861 |
+
|
| 862 |
@app.post("/v1/chat/completions")
|
| 863 |
@app.post("/chat/completions")
|
| 864 |
async def chat_completions(request: Request):
|
|
|
|
| 873 |
model = body.get("model", "anthropic/claude-haiku-4-5")
|
| 874 |
messages_raw = body.get("messages", [])
|
| 875 |
stream = body.get("stream", False)
|
| 876 |
+
tools = body.get("tools") or None
|
| 877 |
+
tool_choice = body.get("tool_choice", "auto")
|
| 878 |
|
| 879 |
if not messages_raw or not isinstance(messages_raw, list):
|
| 880 |
raise HTTPException(400, "messages must be a non-empty array")
|
| 881 |
|
| 882 |
+
has_tools = bool(tools) and tool_choice != "none"
|
| 883 |
+
|
| 884 |
+
messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice)
|
| 885 |
|
| 886 |
if not messages:
|
| 887 |
raise HTTPException(400, "No valid messages after normalization")
|
| 888 |
|
| 889 |
if stream:
|
| 890 |
return StreamingResponse(
|
| 891 |
+
_stream_with_auto_continue(messages, model, has_tools=has_tools),
|
| 892 |
media_type="text/event-stream",
|
| 893 |
headers={
|
| 894 |
"Cache-Control": "no-cache",
|
|
|
|
| 897 |
},
|
| 898 |
)
|
| 899 |
else:
|
| 900 |
+
result = await _collect_with_auto_continue(messages, model, has_tools=has_tools)
|
| 901 |
+
|
| 902 |
+
tool_calls = result.get("tool_calls")
|
| 903 |
+
content = result.get("content")
|
| 904 |
+
|
| 905 |
+
if tool_calls:
|
| 906 |
+
return JSONResponse({
|
| 907 |
+
"id": f"chatcmpl-{int(time.time())}",
|
| 908 |
+
"object": "chat.completion",
|
| 909 |
+
"created": int(time.time()),
|
| 910 |
+
"model": model,
|
| 911 |
+
"choices": [{
|
| 912 |
+
"index": 0,
|
| 913 |
+
"message": {
|
| 914 |
+
"role": "assistant",
|
| 915 |
+
"content": content,
|
| 916 |
+
"tool_calls": tool_calls,
|
| 917 |
+
},
|
| 918 |
+
"finish_reason": "tool_calls",
|
| 919 |
+
}],
|
| 920 |
+
"usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
|
| 921 |
+
})
|
| 922 |
+
else:
|
| 923 |
+
return JSONResponse({
|
| 924 |
+
"id": f"chatcmpl-{int(time.time())}",
|
| 925 |
+
"object": "chat.completion",
|
| 926 |
+
"created": int(time.time()),
|
| 927 |
+
"model": model,
|
| 928 |
+
"choices": [{
|
| 929 |
+
"index": 0,
|
| 930 |
+
"message": {"role": "assistant", "content": content or ""},
|
| 931 |
+
"finish_reason": "stop",
|
| 932 |
+
}],
|
| 933 |
+
"usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
|
| 934 |
+
})
|
| 935 |
|
| 936 |
|
| 937 |
# ββ Models / Health βββββββββββββββββββββββββββββββββββββββββββββ
|
| 938 |
+
|
| 939 |
@app.get("/v1/models")
|
| 940 |
@app.get("/models")
|
| 941 |
async def list_models():
|
|
|
|
| 951 |
async def root():
|
| 952 |
return {
|
| 953 |
"status": "ok",
|
| 954 |
+
"version": "5.0.0",
|
| 955 |
"proxy": bool(PROXY_URL),
|
| 956 |
+
"tool_calling": True,
|
| 957 |
"endpoints": ["/v1/chat/completions", "/v1/models"],
|
| 958 |
}
|
| 959 |
|