""" Haiku API - OpenAI-compatible proxy for chatgpt.org/claude/chat Deploy to Hugging Face Spaces (Docker SDK) Features: - Tool/function calling support (always detects tool call tags in output) - Auto-continues when upstream hits the ~1K token output limit - Rotating proxy with direct-connection fallback - SSE keep-alive comments during continuation gaps - Message normalization for Orchids.app compatibility - Robust error handling with proper JSON error responses """ import asyncio import json import os import re import time import uuid import traceback from typing import Optional from urllib.parse import unquote import httpx from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse, JSONResponse app = FastAPI(title="Haiku API", version="8.1.0") # ── CORS ───────────────────────────────────────────────────────── app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # ── Proxy Config ───────────────────────────────────────────────── PROXY_URL = os.environ.get("PROXY_URL", "") PROXY_MAX_RETRIES = 4 # rotating proxy: try a few IPs PROXY_RETRY_DELAY = 1 # seconds between proxy retries CONNECT_TIMEOUT = 10.0 # short connect timeout READ_TIMEOUT = 120.0 # long read timeout (for streaming responses) def _make_client(use_proxy: bool = True) -> httpx.AsyncClient: """Create an httpx client, with or without proxy.""" kwargs = dict( verify=False, timeout=httpx.Timeout(READ_TIMEOUT, connect=CONNECT_TIMEOUT), ) if use_proxy and PROXY_URL: kwargs["proxy"] = PROXY_URL return httpx.AsyncClient(**kwargs) # ── Session State ──────────────────────────────────────────────── class SessionState: def __init__(self): self.xsrf_token: Optional[str] = None self.csrf_token: Optional[str] = None self.cookies: Optional[httpx.Cookies] = None self.last_refresh: float = 0 self.refresh_interval: float = 600 self._lock = asyncio.Lock() async def refresh(self, client: httpx.AsyncClient): async with self._lock: now = time.time() if self.cookies and (now - self.last_refresh) < self.refresh_interval: return # Try with proxy first, then fallback to direct for use_proxy in [True, False]: if use_proxy and not PROXY_URL: continue working_client = client for attempt in range(PROXY_MAX_RETRIES if use_proxy else 2): try: if attempt > 0: try: await working_client.aclose() except: pass working_client = _make_client(use_proxy=use_proxy) resp = await working_client.get( "https://chatgpt.org/claude/chat", follow_redirects=True, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", }, timeout=20.0, ) if resp.status_code != 200: print(f"[Session] GET returned {resp.status_code} (proxy={use_proxy}, attempt {attempt+1})") await asyncio.sleep(PROXY_RETRY_DELAY) continue new_cookies = httpx.Cookies() for name, value in resp.cookies.items(): new_cookies.set(name, value, domain="chatgpt.org") for header in resp.headers.get_list("set-cookie"): parts = header.split(";")[0] if "=" in parts: k, v = parts.split("=", 1) new_cookies.set(k.strip(), v.strip(), domain="chatgpt.org") xsrf = new_cookies.get("XSRF-TOKEN", domain="chatgpt.org") if xsrf: xsrf = unquote(xsrf) csrf = None m = re.search(r'{"file_path": "hello.js", "content": "hi"} # {"file_path": "hello.js"} # # Format 2 — Anthropic XML (Claude's native format): # # # hello.js # console.log("hi") # # # Regex for Format 1: inline JSON tool calls _TOOL_CALL_INLINE_RE = re.compile( r'<(?:function_call|tool_call)\s+name="([^"]+)">\s*(.*?)\s*', re.DOTALL ) # Regex for Format 2: Anthropic XML function_calls blocks _ANTHROPIC_FC_BLOCK_RE = re.compile( r'\s*(.*?)\s*', re.DOTALL ) # Within a block, match each ... _ANTHROPIC_INVOKE_RE = re.compile( r'\s*(.*?)\s*', re.DOTALL ) # Within an invoke, match each value _ANTHROPIC_PARAM_RE = re.compile( r'(.*?)', re.DOTALL ) def _parse_json_args(args_str: str) -> str: """Try to parse arguments as JSON, with fallbacks. Returns JSON string.""" try: args_json = json.loads(args_str) return json.dumps(args_json) except json.JSONDecodeError: pass # Try to fix common issues args_cleaned = args_str.strip('`').strip() if args_cleaned.startswith('json'): args_cleaned = args_cleaned[4:].strip() try: args_json = json.loads(args_cleaned) return json.dumps(args_json) except json.JSONDecodeError: pass # Last resort: wrap the raw text as an argument return json.dumps({"raw_input": args_str}) def _parse_tool_calls(text: str) -> tuple[list[dict], str]: """Parse tool calls from model text output. Supports two formats: 1. Inline JSON: JSON 2. Anthropic XML: v Returns (tool_calls, remaining_text) where tool_calls is in OpenAI format. If no tool calls found, returns ([], original_text). """ tool_calls = [] consumed_spans = [] # --- Format 1: Inline JSON tool calls --- for match in _TOOL_CALL_INLINE_RE.finditer(text): func_name = match.group(1) args_str = match.group(2).strip() args_final = _parse_json_args(args_str) tool_calls.append({ "id": f"call_{uuid.uuid4().hex[:24]}", "type": "function", "function": { "name": func_name, "arguments": args_final, } }) consumed_spans.append((match.start(), match.end())) # --- Format 2: Anthropic XML function_calls --- for block_match in _ANTHROPIC_FC_BLOCK_RE.finditer(text): block_text = block_match.group(1) consumed_spans.append((block_match.start(), block_match.end())) for invoke_match in _ANTHROPIC_INVOKE_RE.finditer(block_text): func_name = invoke_match.group(1) invoke_body = invoke_match.group(2) params = {} for param_match in _ANTHROPIC_PARAM_RE.finditer(invoke_body): param_name = param_match.group(1) param_value = param_match.group(2) try: params[param_name] = json.loads(param_value) except (json.JSONDecodeError, ValueError): params[param_name] = param_value tool_calls.append({ "id": f"call_{uuid.uuid4().hex[:24]}", "type": "function", "function": { "name": func_name, "arguments": json.dumps(params), } }) if not tool_calls: return [], text # Extract remaining text (not part of any tool call) remaining_parts = [] prev_end = 0 for start, end in sorted(consumed_spans): if start > prev_end: chunk = text[prev_end:start].strip() if chunk: remaining_parts.append(chunk) prev_end = max(prev_end, end) if prev_end < len(text): chunk = text[prev_end:].strip() if chunk: remaining_parts.append(chunk) remaining_text = "\n".join(remaining_parts) return tool_calls, remaining_text def _has_incomplete_tool_call(text: str) -> bool: """Check if text has an opening tool call tag without a matching close.""" # Inline format inline_opens = len(re.findall(r'<(?:function_call|tool_call)\s+name="[^"]+">', text)) inline_closes = len(re.findall(r'', text)) if inline_opens > inline_closes: return True # Anthropic XML format if text.count('') > text.count(''): return True invoke_opens = len(re.findall(r'', text)) if invoke_opens > text.count(''): return True return False def _strip_incomplete_tool_tags(text: str) -> str: """Remove incomplete tool call XML tags from text. This prevents raw XML tags from leaking into delta.content when auto-continue fails to complete a tool call.""" # Remove incomplete Anthropic XML blocks # e.g. "\n\nsome unfinished..." text = re.sub( r'\s*]*>.*', '', text, flags=re.DOTALL ) # Remove incomplete inline JSON tool calls text = re.sub( r'<(?:function_call|tool_call)\s+name="[^"]+">.*', '', text, flags=re.DOTALL ) # Remove any stray opening/closing tags text = re.sub(r'\s*', '', text) text = re.sub(r']*>\s*', '', text) text = re.sub(r']*>\s*', '', text) text = re.sub(r'\s*', '', text) return text.strip() # ── Tool System Prompt Builder ────────────────────────────────── def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str: """Convert OpenAI tools/functions format to a system prompt using Anthropic XML format — the format Claude natively understands.""" invoke_blocks = [] tool_names = [] for tool in tools: if "function" in tool: func = tool["function"] else: func = tool name = func.get("name", "unknown") desc = func.get("description", "No description") params = func.get("parameters", {}) tool_names.append(name) props = params.get("properties", {}) required = params.get("required", []) param_lines = [] for pname, pdef in props.items(): ptype = pdef.get("type", "any") pdesc = pdef.get("description", "") req = " (required)" if pname in required else "" param_lines.append(f'{ptype}{req} — {pdesc}') params_xml = '\n'.join(param_lines) if param_lines else '' invoke_blocks.append(f""" {desc} Parameters: {params_xml} """) tools_xml = '\n\n'.join(invoke_blocks) choice_instruction = "" if tool_choice == "required": choice_instruction = "\nIMPORTANT: You MUST call at least one tool." elif tool_choice == "none": return "" elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function": fname = tool_choice.get("function", {}).get("name", "") choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function." return f"""In this environment you have access to a set of tools you can use to answer the user's question. {tools_xml} ## Tool Call Format When you need to call a tool, use this EXACT XML format: value You may call multiple tools by using multiple blocks inside a single block, or by using multiple blocks. - The parameter values should be the actual values, NOT JSON-encoded strings - Do NOT wrap tool calls in markdown code blocks - If you don't need to call any tools, just respond normally with text{choice_instruction}""" # ── Message normalization ──────────────────────────────────────── def _flatten_content_array(content: list) -> str: """Convert a content array to plain text.""" text_parts = [] for part in content: if isinstance(part, str): text_parts.append(part) elif isinstance(part, dict): if part.get("type") == "text": text_parts.append(part.get("text", "")) return "\n".join(text_parts) def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choice=None) -> list[dict]: """Normalize messages: handle content arrays, tool roles, tool_calls, and inject tool definitions into system prompt if tools are provided.""" result = [] tool_system = None if tools and tool_choice != "none": tool_system = _build_tool_system_prompt(tools, tool_choice) system_injected = False for msg in messages: role = msg.get("role", "user") content = msg.get("content", "") if isinstance(content, list): content = _flatten_content_array(content) if content is None: content = "" content = str(content) # Handle tool role messages if role == "tool": tool_name = msg.get("name", "unknown_tool") tool_call_id = msg.get("tool_call_id", "") result.append({ "role": "user", "content": f"[Tool Result for {tool_name} (id: {tool_call_id})]:\n{content}" }) continue # Handle assistant messages with tool_calls if role == "assistant" and msg.get("tool_calls"): parts = [] regular_content = content if content and content.strip() else "" if regular_content: parts.append(regular_content) invoke_parts = [] for tc in msg["tool_calls"]: func = tc.get("function", {}) name = func.get("name", "unknown") args = func.get("arguments", "{}") try: args_json = json.loads(args) except (json.JSONDecodeError, TypeError): args_json = {} invoke_lines = [f''] for k, v in args_json.items(): invoke_lines.append(f'{v}') invoke_lines.append('') invoke_parts.append('\n'.join(invoke_lines)) fc_content = '\n' + '\n'.join(invoke_parts) + '\n' combined = regular_content + '\n\n' + fc_content if regular_content else fc_content result.append({"role": "assistant", "content": combined}) continue # Inject tool system prompt into the first system message if role == "system" and not system_injected and tool_system: combined = content + '\n\n' + tool_system if content.strip() else tool_system result.append({"role": "system", "content": combined}) system_injected = True continue if role == "system" and not content.strip(): continue result.append({"role": role, "content": content}) if tool_system and not system_injected: result.insert(0, {"role": "system", "content": tool_system}) return result # ── Headers ────────────────────────────────────────────────────── def _headers() -> dict: h = { "Accept": "*/*", "Content-Type": "application/json", "Origin": "https://chatgpt.org", "Referer": "https://chatgpt.org/claude/chat", "X-Requested-With": "XMLHttpRequest", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36", } csrf = session.csrf_token or session.xsrf_token if csrf: h["X-CSRF-TOKEN"] = csrf return h # ── Proxy-aware request with retry + direct fallback ────────────── async def _proxy_post(url: str, **kwargs) -> httpx.Response: """POST with proxy retry logic, falling back to direct connection.""" global http_client # Try with proxy first if PROXY_URL: for attempt in range(PROXY_MAX_RETRIES): try: resp = await http_client.post(url, **kwargs) return resp except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e: print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}") try: await http_client.aclose() except: pass http_client = _make_client(use_proxy=True) await asyncio.sleep(PROXY_RETRY_DELAY) continue # Fallback: try direct connection print("[Proxy] Falling back to direct connection") direct_client = _make_client(use_proxy=False) try: resp = await direct_client.post(url, **kwargs) return resp finally: await direct_client.aclose() # ── Raw call with retries ─────────────────────────────────────── async def _raw_call(messages: list[dict], model: str) -> httpx.Response: """Make a single POST to chatgpt.org/api/chat with full retry logic.""" await session.refresh(http_client) payload = {"model": model, "messages": messages} for attempt in range(2): # CSRF retry for rate_attempt in range(3): # 429 retry try: resp = await _proxy_post( "https://chatgpt.org/api/chat", json=payload, headers=_headers(), cookies=session.cookies, ) except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e: print(f"[Chat] Connection failed: {type(e).__name__}") session.last_refresh = 0 raise HTTPException(502, f"Cannot reach upstream: {type(e).__name__}") if resp.status_code == 419 and attempt == 0: print("[Chat] 419 -> refreshing session...") session.last_refresh = 0 await session.refresh(http_client) break if resp.status_code == 429: wait_time = (rate_attempt + 1) * 10 print(f"[Chat] 429 rate limited, waiting {wait_time}s (attempt {rate_attempt+1}/3)...") session.last_refresh = 0 await session.refresh(http_client) if rate_attempt < 2: await asyncio.sleep(wait_time) continue raise HTTPException(429, f"Rate limited by upstream after {rate_attempt+1} retries") if resp.status_code != 200: session.last_refresh = 0 raise HTTPException(resp.status_code, f"Upstream {resp.status_code}: {resp.text[:300]}") return resp raise HTTPException(500, "Failed after retry") async def _stream_one_response(resp): """Stream a single upstream SSE response in real-time. Yields (text, finish_reason) tuples. finish_reason is None for text chunks.""" finish_reason = None try: async for raw_line in resp.aiter_lines(): line = raw_line.strip() if not line or line.startswith(":"): continue if not line.startswith("data: "): continue payload_str = line[6:] if payload_str.strip() == "[DONE]": break try: chunk = json.loads(payload_str) except json.JSONDecodeError: continue for choice in chunk.get("choices", []): delta = choice.get("delta", {}) c = delta.get("content", "") if c: yield c, None fr = choice.get("finish_reason") if fr: if fr in ("stop", "end_turn"): finish_reason = "stop" elif fr in ("length", "max_tokens"): finish_reason = "length" except (httpx.ReadError, httpx.RemoteProtocolError) as e: print(f"[Stream] Connection lost during streaming: {type(e).__name__}") except Exception as e: print(f"[Stream] Error during streaming: {type(e).__name__}: {e}") yield "", finish_reason # ── Streaming with auto-continue ──────────────────────────────── MAX_CONTINUATIONS = 20 async def _raw_call_streaming(messages: list[dict], model: str): """Like _raw_call but yields SSE keep-alive comments during retries, then yields the httpx.Response object.""" await session.refresh(http_client) payload = {"model": model, "messages": messages} for attempt in range(2): # CSRF retry for rate_attempt in range(3): # 429 retry yield ": thinking...\n\n" try: resp = await _proxy_post( "https://chatgpt.org/api/chat", json=payload, headers=_headers(), cookies=session.cookies, ) except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e: print(f"[Chat] Connection failed: {type(e).__name__}") session.last_refresh = 0 raise HTTPException(502, f"Cannot reach upstream: {type(e).__name__}") if resp.status_code == 419 and attempt == 0: print("[Chat] 419 -> refreshing session...") session.last_refresh = 0 await session.refresh(http_client) break if resp.status_code == 429: wait_time = (rate_attempt + 1) * 10 print(f"[Chat] 429 rate limited, waiting {wait_time}s (attempt {rate_attempt+1}/3)...") session.last_refresh = 0 await session.refresh(http_client) if rate_attempt < 2: for _ in range(wait_time): yield ": retrying...\n\n" await asyncio.sleep(1) continue raise HTTPException(429, f"Rate limited after {rate_attempt+1} retries") if resp.status_code != 200: session.last_refresh = 0 raise HTTPException(resp.status_code, f"Upstream {resp.status_code}: {resp.text[:300]}") yield resp return raise HTTPException(500, "Failed after retry") def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls: list[dict], remaining_text: str): """Generate OpenAI streaming chunks for tool calls. Returns list of SSE strings.""" chunks = [] for i, tc in enumerate(tool_calls): # First chunk: role + tool_call with id, name, and start of arguments sse_start = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": { "role": "assistant", "tool_calls": [{ "index": i, "id": tc["id"], "type": "function", "function": { "name": tc["function"]["name"], "arguments": "", } }] }, "finish_reason": None, }], }) chunks.append(f"data: {sse_start}\n\n") # Argument chunks args = tc["function"]["arguments"] chunk_size = max(1, len(args) // 3) for offset in range(0, len(args), chunk_size): arg_piece = args[offset:offset + chunk_size] sse_arg = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": { "tool_calls": [{ "index": i, "function": { "arguments": arg_piece, } }] }, "finish_reason": None, }], }) chunks.append(f"data: {sse_arg}\n\n") # Remaining text alongside tool calls if remaining_text.strip(): sse_text = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {"content": remaining_text}, "finish_reason": None, }], }) chunks.append(f"data: {sse_text}\n\n") # Final chunk with finish_reason sse_done = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {}, "finish_reason": "tool_calls", }], }) chunks.append(f"data: {sse_done}\n\n") chunks.append("data: [DONE]\n\n") return chunks async def _stream_with_auto_continue(messages: list[dict], model: str): """Stream with real-time output, auto-continue, and keep-alive pings. ALWAYS buffers the full response to detect tool call tags. If tool calls are found AND complete, emits them as proper OpenAI tool_calls chunks. If tool calls are incomplete, auto-continues to collect the rest. If no tool calls, emits the text as regular content chunks. """ chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}" created = int(time.time()) conversation = list(messages) total_content = "" for cont_num in range(MAX_CONTINUATIONS): yield ": thinking...\n\n" resp = None try: async for result in _raw_call_streaming(conversation, model): if isinstance(result, str): yield result else: resp = result except HTTPException as e: error_data = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {"content": f"\n\n[Error: {e.detail}]"}, "finish_reason": None, }], }) yield f"data: {error_data}\n\n" yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n" yield "data: [DONE]\n\n" return if resp is None: yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {'content': '[Error: No response from upstream]'}, 'finish_reason': None}]})}\n\n" yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n" yield "data: [DONE]\n\n" return finish_reason = "stop" chunk_content = "" # Buffer the full response async for text, fr in _stream_one_response(resp): if fr is not None: finish_reason = fr continue if text: chunk_content += text total_content += text yield ": streaming...\n\n" print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}") # Check for tool calls in the accumulated text tool_calls, remaining_text = _parse_tool_calls(total_content) has_incomplete = _has_incomplete_tool_call(total_content) print(f"[Chat] tool_calls={len(tool_calls)} incomplete={has_incomplete} finish={finish_reason}") # ── Decision tree ────────────────────────────────────────── # # 1. If we have COMPLETE tool calls AND no incomplete tags → emit & done # 2. If we have incomplete tool calls (regardless of complete ones) → auto-continue # 3. If no tool calls and finish_reason == "stop" and no incomplete tags → emit text & done # 4. If no tool calls and finish_reason == "stop" but HAS incomplete tags → auto-continue # (the upstream might report "stop" even when cut off mid-tag) # 5. If finish_reason == "length" → auto-continue if tool_calls and not has_incomplete: # All tool calls are complete — emit them print(f"[Chat] Emitting {len(tool_calls)} complete tool call(s)") for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text): yield sse_chunk return if has_incomplete: # Incomplete tool calls detected — must auto-continue print(f"[Chat] Incomplete tool call detected, auto-continuing...") yield ": continuing...\n\n" conversation.append({"role": "assistant", "content": chunk_content}) conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote. Just continue outputting the parameter values from where you stopped."}) print(f"[Chat] Auto-continue (incomplete) #{cont_num+1}, total so far: {len(total_content)} chars") continue # No tool calls and no incomplete tags if finish_reason == "stop": # Regular text response — emit as content chunk_sz = 50 for offset in range(0, len(total_content), chunk_sz): piece = total_content[offset:offset + chunk_sz] sse_data = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {"content": piece}, "finish_reason": None, }], }) yield f"data: {sse_data}\n\n" sse_data = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {}, "finish_reason": "stop", }], }) yield f"data: {sse_data}\n\n" yield "data: [DONE]\n\n" return # finish_reason == "length" — auto-continue for regular text yield ": continuing...\n\n" conversation.append({"role": "assistant", "content": chunk_content}) conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."}) print(f"[Chat] Auto-continue (length) #{cont_num+1}, total so far: {len(total_content)} chars") # Safety: max continuations reached — try to emit whatever we have tool_calls, remaining_text = _parse_tool_calls(total_content) if tool_calls: # Best-effort: emit whatever tool calls we managed to parse print(f"[Chat] Max continuations reached, emitting {len(tool_calls)} partial tool call(s)") for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text): yield sse_chunk else: # Emit whatever text we have # Strip any incomplete tool call XML from the output to avoid raw tags in content clean_content = _strip_incomplete_tool_tags(total_content) if clean_content.strip(): chunk_sz = 50 for offset in range(0, len(clean_content), chunk_sz): piece = clean_content[offset:offset + chunk_sz] sse_data = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {"content": piece}, "finish_reason": None, }], }) yield f"data: {sse_data}\n\n" sse_data = json.dumps({ "id": chunk_id, "object": "chat.completion.chunk", "created": created, "model": model, "choices": [{ "index": 0, "delta": {}, "finish_reason": "stop", }], }) yield f"data: {sse_data}\n\n" yield "data: [DONE]\n\n" # ── Non-streaming with auto-continue ──────────────────────────── async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict: """Collect the full response, auto-continuing if cut off.""" conversation = list(messages) full_content = "" for cont_num in range(MAX_CONTINUATIONS): resp = await _raw_call(conversation, model) content = "" finish_reason = "stop" async for text, fr in _stream_one_response(resp): if fr is not None: finish_reason = fr continue if text: content += text full_content += content print(f"[Chat] Collect #{cont_num+1}: {len(content)} chars, finish={finish_reason}") # Always check for tool calls tool_calls, remaining_text = _parse_tool_calls(full_content) if tool_calls: if _has_incomplete_tool_call(full_content) and finish_reason == "length": pass else: return { "tool_calls": tool_calls, "content": remaining_text if remaining_text.strip() else None, } if finish_reason == "stop": return {"content": full_content, "tool_calls": None} # Auto-continue if _has_incomplete_tool_call(content): conversation.append({"role": "assistant", "content": content}) conversation.append({"role": "user", "content": "Continue the tool call exactly from where you left off. Do not repeat the opening tag or any arguments you already wrote."}) else: conversation.append({"role": "assistant", "content": content}) conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."}) return {"content": full_content, "tool_calls": None} # ── OpenAI-compatible endpoint ────────────────────────────────── @app.post("/v1/chat/completions") @app.post("/chat/completions") async def chat_completions(request: Request): try: body = await request.json() except Exception: raise HTTPException(400, "Invalid JSON") if not isinstance(body, dict): raise HTTPException(400, "Body must be a JSON object") model = body.get("model", "anthropic/claude-haiku-4-5") messages_raw = body.get("messages", []) stream = body.get("stream", False) # Extract tools tools = body.get("tools") or body.get("functions") or None tool_choice = body.get("tool_choice", "auto") # Convert old 'functions' format if tools and "function" not in tools[0] and "name" in tools[0]: tools = [{"type": "function", "function": f} for f in tools] print(f"[Request] model={model} stream={stream} tools={bool(tools)} tool_choice={tool_choice} msgs={len(messages_raw)}") if not messages_raw or not isinstance(messages_raw, list): raise HTTPException(400, "messages must be a non-empty array") messages = normalize_messages(messages_raw, tools=tools, tool_choice=tool_choice) if not messages: raise HTTPException(400, "No valid messages after normalization") try: if stream: return StreamingResponse( _stream_with_auto_continue(messages, model), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no", }, ) else: result = await _collect_with_auto_continue(messages, model) tool_calls = result.get("tool_calls") content = result.get("content") if tool_calls: return JSONResponse({ "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", "created": int(time.time()), "model": model, "choices": [{ "index": 0, "message": { "role": "assistant", "content": content, "tool_calls": tool_calls, }, "finish_reason": "tool_calls", }], "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, }) else: return JSONResponse({ "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", "created": int(time.time()), "model": model, "choices": [{ "index": 0, "message": {"role": "assistant", "content": content or ""}, "finish_reason": "stop", }], "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, }) except HTTPException: raise except Exception as e: print(f"[Request] Unhandled error: {type(e).__name__}: {e}") print(traceback.format_exc()) raise HTTPException(500, f"Internal error: {type(e).__name__}") # ── Models / Health ───────────────────────────────────────────── @app.get("/v1/models") @app.get("/models") async def list_models(): return JSONResponse({ "object": "list", "data": [ {"id": "anthropic/claude-haiku-4-5", "object": "model", "owned_by": "anthropic"}, ], }) @app.get("/") async def root(): return { "status": "ok", "version": "8.1.0", "proxy": bool(PROXY_URL), "tool_calling": True, "endpoints": ["/v1/chat/completions", "/v1/models"], } @app.get("/health") async def health(): return { "status": "ok", "session_active": bool(session.cookies), "proxy": bool(PROXY_URL), } @app.get("/debug/refresh") async def force_refresh(): global http_client session.last_refresh = 0 result = await session.refresh(http_client) if result is not None: http_client = result return { "refreshed": True, "has_cookies": bool(session.cookies), "has_csrf": bool(session.csrf_token), "proxy": bool(PROXY_URL), } @app.get("/debug/session") async def debug_session(): return { "has_cookies": bool(session.cookies), "cookie_names": list(session.cookies.keys()) if session.cookies else [], "has_csrf": bool(session.csrf_token), "has_xsrf": bool(session.xsrf_token), "last_refresh_ago": int(time.time() - session.last_refresh) if session.last_refresh else None, "proxy": bool(PROXY_URL), } if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)