Spaces:

overwrite69
/

haiku-api

Sleeping

App Files Community

overwrite69 committed 14 days ago

Commit

d1a0754

verified ·

1 Parent(s): 1c15fbd

v8.0.0: proxy fallback, better timeouts, error handling

Browse files

Files changed (1) hide show

app.py +250 -216

app.py CHANGED Viewed

@@ -5,9 +5,10 @@ Deploy to Hugging Face Spaces (Docker SDK)
 Features:
 - Tool/function calling support (always detects tool call tags in output)
 - Auto-continues when upstream hits the ~1K token output limit
-- Rotating proxy with aggressive retries for unstable IPs
 - SSE keep-alive comments during continuation gaps
 - Message normalization for Orchids.app compatibility
 """
 import asyncio
@@ -16,6 +17,7 @@ import os
 import re
 import time
 import uuid
 from typing import Optional
 from urllib.parse import unquote
@@ -24,7 +26,7 @@ from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
-app = FastAPI(title="Haiku API", version="7.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
@@ -38,21 +40,20 @@ app.add_middleware(
 # ── Proxy Config ─────────────────────────────────────────────────
 PROXY_URL = os.environ.get("PROXY_URL", "")
-PROXY_MAX_RETRIES = 6  # rotating proxy: try many IPs since ~half are dead
 PROXY_RETRY_DELAY = 1  # seconds between proxy retries
-def _make_client() -> httpx.AsyncClient:
     """Create an httpx client, with or without proxy."""
     kwargs = dict(
         verify=False,
-        timeout=httpx.Timeout(120.0, connect=15.0),
     )
-    if PROXY_URL:
         kwargs["proxy"] = PROXY_URL
-        print(f"[Proxy] Using rotating proxy: {PROXY_URL.split('@')[-1]}")
-    else:
-        print("[Proxy] No proxy configured, direct connection")
     return httpx.AsyncClient(**kwargs)
@@ -72,65 +73,72 @@ class SessionState:
             if self.cookies and (now - self.last_refresh) < self.refresh_interval:
                 return
-            for attempt in range(PROXY_MAX_RETRIES):
-                try:
-                    if PROXY_URL and attempt > 0:
-                        try:
-                            await client.aclose()
-                        except:
-                            pass
-                        client = _make_client()
-                    resp = await client.get(
-                        "https://chatgpt.org/claude/chat",
-                        follow_redirects=True,
-                        headers={
-                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36",
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                        },
-                        timeout=30.0,
-                    )
-                    if resp.status_code != 200:
-                        print(f"[Session] GET returned {resp.status_code}, retry #{attempt+1}")
                         await asyncio.sleep(PROXY_RETRY_DELAY)
                         continue
-                    new_cookies = httpx.Cookies()
-                    for name, value in resp.cookies.items():
-                        new_cookies.set(name, value, domain="chatgpt.org")
-                    for header in resp.headers.get_list("set-cookie"):
-                        parts = header.split(";")[0]
-                        if "=" in parts:
-                            k, v = parts.split("=", 1)
-                            new_cookies.set(k.strip(), v.strip(), domain="chatgpt.org")
-                    xsrf = new_cookies.get("XSRF-TOKEN", domain="chatgpt.org")
-                    if xsrf:
-                        xsrf = unquote(xsrf)
-                    csrf = None
-                    m = re.search(r'<meta\s+name="csrf-token"\s+content="([^"]+)"', resp.text)
-                    if m:
-                        csrf = m.group(1)
-                    self.cookies = new_cookies
-                    self.xsrf_token = xsrf
-                    self.csrf_token = csrf
-                    self.last_refresh = now
-                    print(f"[Session] OK — CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())} (attempt {attempt+1})")
-                    return
-                except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
-                    print(f"[Session] Proxy error attempt #{attempt+1}: {type(e).__name__}: {e}")
-                    await asyncio.sleep(PROXY_RETRY_DELAY)
-                    continue
-                except Exception as e:
-                    print(f"[Session] Error attempt #{attempt+1}: {e}")
-                    await asyncio.sleep(PROXY_RETRY_DELAY)
-                    continue
-            print("[Session] WARNING: All refresh attempts failed")
 session = SessionState()
@@ -141,8 +149,10 @@ http_client: Optional[httpx.AsyncClient] = None
 @app.on_event("startup")
 async def startup():
     global http_client
-    http_client = _make_client()
-    await session.refresh(http_client)
 @app.on_event("shutdown")
 async def shutdown():
@@ -172,7 +182,6 @@ _TOOL_CALL_INLINE_RE = re.compile(
 )
 # Regex for Format 2: Anthropic XML function_calls blocks
-# Matches: <function_calls>...</function_calls>  (the whole block)
 _ANTHROPIC_FC_BLOCK_RE = re.compile(
     r'<function_calls>\s*(.*?)\s*</function_calls>',
     re.DOTALL
@@ -224,7 +233,7 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
     If no tool calls found, returns ([], original_text).
     """
     tool_calls = []
-    consumed_spans = []  # (start, end) of text that was part of tool calls
     # --- Format 1: Inline JSON tool calls ---
     for match in _TOOL_CALL_INLINE_RE.finditer(text):
@@ -251,12 +260,10 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
             func_name = invoke_match.group(1)
             invoke_body = invoke_match.group(2)
-            # Parse parameters into a dict
             params = {}
             for param_match in _ANTHROPIC_PARAM_RE.finditer(invoke_body):
                 param_name = param_match.group(1)
                 param_value = param_match.group(2)
-                # Try to parse as JSON value (for numbers, bools, etc.)
                 try:
                     params[param_name] = json.loads(param_value)
                 except (json.JSONDecodeError, ValueError):
@@ -294,8 +301,7 @@ def _parse_tool_calls(text: str) -> tuple[list[dict], str]:
 def _has_incomplete_tool_call(text: str) -> bool:
-    """Check if text has an opening tool call tag without a matching close.
-    Checks both inline and Anthropic XML formats."""
     # Inline format
     inline_opens = len(re.findall(r'<(?:function_call|tool_call)\s+name="[^"]+">', text))
     inline_closes = len(re.findall(r'</(?:function_call|tool_call)_?>', text))
@@ -303,15 +309,10 @@ def _has_incomplete_tool_call(text: str) -> bool:
         return True
     # Anthropic XML format
-    fc_opens = text.count('<function_calls>')
-    fc_closes = text.count('</function_calls>')
-    if fc_opens > fc_closes:
         return True
-    # Also check for incomplete <invoke> tags
     invoke_opens = len(re.findall(r'<invoke\s+name="[^"]+">', text))
-    invoke_closes = text.count('</invoke>')
-    if invoke_opens > invoke_closes:
         return True
     return False
@@ -327,11 +328,10 @@ def _build_tool_system_prompt(tools: list[dict], tool_choice=None) -> str:
     tool_names = []
     for tool in tools:
-        # Support both 'tools' and 'functions' (old OpenAI) formats
         if "function" in tool:
             func = tool["function"]
         else:
-            func = tool  # old format: tool IS the function definition
         name = func.get("name", "unknown")
         desc = func.get("description", "No description")
@@ -356,12 +356,11 @@ Parameters:
     tools_xml = '\n\n'.join(invoke_blocks)
-    # Handle tool_choice
     choice_instruction = ""
     if tool_choice == "required":
         choice_instruction = "\nIMPORTANT: You MUST call at least one tool."
     elif tool_choice == "none":
-        return ""  # No tools injected
     elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
         fname = tool_choice.get("function", {}).get("name", "")
         choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
@@ -403,7 +402,6 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
     and inject tool definitions into system prompt if tools are provided."""
     result = []
-    # Build tool system prompt if tools are provided
     tool_system = None
     if tools and tool_choice != "none":
         tool_system = _build_tool_system_prompt(tools, tool_choice)
@@ -414,7 +412,6 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
         role = msg.get("role", "user")
         content = msg.get("content", "")
-        # Handle content arrays → plain text
         if isinstance(content, list):
             content = _flatten_content_array(content)
@@ -422,7 +419,7 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
             content = ""
         content = str(content)
-        # Handle tool role messages → convert to user message with tool result
         if role == "tool":
             tool_name = msg.get("name", "unknown_tool")
             tool_call_id = msg.get("tool_call_id", "")
@@ -432,7 +429,7 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
             })
             continue
-        # Handle assistant messages with tool_calls → text with function_call blocks
         if role == "assistant" and msg.get("tool_calls"):
             parts = []
             regular_content = content if content and content.strip() else ""
@@ -449,14 +446,12 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
                     args_json = json.loads(args)
                 except (json.JSONDecodeError, TypeError):
                     args_json = {}
-                # Convert to Anthropic XML format (matches Claude's native output)
                 invoke_lines = [f'<invoke name="{name}">']
                 for k, v in args_json.items():
                     invoke_lines.append(f'<parameter name="{k}">{v}</parameter>')
                 invoke_lines.append('</invoke>')
                 invoke_parts.append('\n'.join(invoke_lines))
-            # Wrap in <function_calls> block
             fc_content = '<function_calls>\n' + '\n'.join(invoke_parts) + '\n</function_calls>'
             combined = regular_content + '\n\n' + fc_content if regular_content else fc_content
             result.append({"role": "assistant", "content": combined})
@@ -469,13 +464,11 @@ def normalize_messages(messages: list[dict], tools: list[dict] = None, tool_choi
             system_injected = True
             continue
-        # System messages with empty content get filtered out
         if role == "system" and not content.strip():
             continue
         result.append({"role": role, "content": content})
-    # If no system message existed but tools need to be injected
     if tool_system and not system_injected:
         result.insert(0, {"role": "system", "content": tool_system})
@@ -499,31 +492,36 @@ def _headers() -> dict:
     return h
-# ── Proxy-aware request with retry ──────────────────────────────
 async def _proxy_post(url: str, **kwargs) -> httpx.Response:
-    """POST with proxy retry logic. Creates new client on each retry to get fresh IP."""
     global http_client
-    for attempt in range(PROXY_MAX_RETRIES):
-        try:
-            resp = await http_client.post(url, **kwargs)
-            return resp
-        except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
-            print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")
-            if PROXY_URL:
                 try:
                     await http_client.aclose()
                 except:
                     pass
-                http_client = _make_client()
                 await asyncio.sleep(PROXY_RETRY_DELAY)
-            else:
-                await asyncio.sleep(2)
-            continue
-    return await http_client.post(url, **kwargs)
 # ── Raw call with retries ───────────────────────────────────────
@@ -536,12 +534,17 @@ async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
     for attempt in range(2):  # CSRF retry
         for rate_attempt in range(3):  # 429 retry
-            resp = await _proxy_post(
-                "https://chatgpt.org/api/chat",
-                json=payload,
-                headers=_headers(),
-                cookies=session.cookies,
-            )
             if resp.status_code == 419 and attempt == 0:
                 print("[Chat] 419 -> refreshing session...")
@@ -573,34 +576,39 @@ async def _stream_one_response(resp):
     Yields (text, finish_reason) tuples. finish_reason is None for text chunks."""
     finish_reason = None
-    async for raw_line in resp.aiter_lines():
-        line = raw_line.strip()
-        if not line or line.startswith(":"):
-            continue
-        if not line.startswith("data: "):
-            continue
-        payload_str = line[6:]
-        if payload_str.strip() == "[DONE]":
-            break
-        try:
-            chunk = json.loads(payload_str)
-        except json.JSONDecodeError:
-            continue
-        for choice in chunk.get("choices", []):
-            delta = choice.get("delta", {})
-            c = delta.get("content", "")
-            if c:
-                yield c, None
-            fr = choice.get("finish_reason")
-            if fr:
-                if fr in ("stop", "end_turn"):
-                    finish_reason = "stop"
-                elif fr in ("length", "max_tokens"):
-                    finish_reason = "length"
     yield "", finish_reason
@@ -619,12 +627,17 @@ async def _raw_call_streaming(messages: list[dict], model: str):
         for rate_attempt in range(3):  # 429 retry
             yield ": thinking...\n\n"
-            resp = await _proxy_post(
-                "https://chatgpt.org/api/chat",
-                json=payload,
-                headers=_headers(),
-                cookies=session.cookies,
-            )
             if resp.status_code == 419 and attempt == 0:
                 print("[Chat] 419 -> refreshing session...")
@@ -684,7 +697,7 @@ def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls:
         })
         chunks.append(f"data: {sse_start}\n\n")
-        # Argument chunks — split into small pieces for streaming feel
         args = tc["function"]["arguments"]
         chunk_size = max(1, len(args) // 3)
         for offset in range(0, len(args), chunk_size):
@@ -709,7 +722,7 @@ def _emit_tool_call_chunks(chunk_id: str, created: int, model: str, tool_calls:
             })
             chunks.append(f"data: {sse_arg}\n\n")
-    # If there's remaining text alongside tool calls, emit it too
     if remaining_text.strip():
         sse_text = json.dumps({
             "id": chunk_id,
@@ -755,23 +768,43 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
-        # Send keep-alive while we buffer
         yield ": thinking...\n\n"
         resp = None
-        async for result in _raw_call_streaming(conversation, model):
-            if isinstance(result, str):
-                yield result
-            else:
-                resp = result
         if resp is None:
-            raise HTTPException(500, "No response from upstream")
         finish_reason = "stop"
         chunk_content = ""
-        # Buffer the full response (don't stream in real-time so we can detect tool calls)
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
                 finish_reason = fr
@@ -780,25 +813,21 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
             if text:
                 chunk_content += text
                 total_content += text
-                # Send keep-alive pings while buffering
                 yield ": streaming...\n\n"
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
-        # ALWAYS check for tool calls in the accumulated text
         tool_calls, remaining_text = _parse_tool_calls(total_content)
         if tool_calls:
             print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
-            # Emit tool calls as proper OpenAI streaming chunks
             for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
                 yield sse_chunk
             return
-        # No tool calls found
         if finish_reason == "stop":
-            # Stream the buffered text content as regular content chunks
             chunk_sz = 50
             for offset in range(0, len(total_content), chunk_sz):
                 piece = total_content[offset:offset + chunk_sz]
@@ -815,7 +844,6 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
                 })
                 yield f"data: {sse_data}\n\n"
-            # Final stop chunk
             sse_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
@@ -831,7 +859,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
             yield "data: [DONE]\n\n"
             return
-        # Auto-continue for length-limited responses
         yield ": continuing...\n\n"
         if _has_incomplete_tool_call(chunk_content):
@@ -862,8 +890,7 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
 # ── Non-streaming with auto-continue ────────────────────────────
 async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
-    """Collect the full response, auto-continuing if cut off.
-    Always checks for tool calls. Returns dict with 'content' and/or 'tool_calls'."""
     conversation = list(messages)
     full_content = ""
@@ -886,9 +913,8 @@ async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
         tool_calls, remaining_text = _parse_tool_calls(full_content)
         if tool_calls:
-            # If there are incomplete tool calls and we got cut off, continue
             if _has_incomplete_tool_call(full_content) and finish_reason == "length":
-                pass  # fall through to auto-continue
             else:
                 return {
                     "tool_calls": tool_calls,
@@ -926,16 +952,14 @@ async def chat_completions(request: Request):
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
-    # Extract tools — support both new 'tools' and old 'functions' formats
     tools = body.get("tools") or body.get("functions") or None
     tool_choice = body.get("tool_choice", "auto")
-    # Convert old 'functions' format to new 'tools' format if needed
     if tools and "function" not in tools[0] and "name" in tools[0]:
-        # Old format: [{"name": "X", "parameters": {...}}]
         tools = [{"type": "function", "function": f} for f in tools]
-    # Log request for debugging
     print(f"[Request] model={model} stream={stream} tools={bool(tools)} tool_choice={tool_choice} msgs={len(messages_raw)}")
     if not messages_raw or not isinstance(messages_raw, list):
@@ -946,52 +970,59 @@ async def chat_completions(request: Request):
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
-    if stream:
-        return StreamingResponse(
-            _stream_with_auto_continue(messages, model),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no",
-            },
-        )
-    else:
-        result = await _collect_with_auto_continue(messages, model)
-        tool_calls = result.get("tool_calls")
-        content = result.get("content")
-        if tool_calls:
-            return JSONResponse({
-                "id": f"chatcmpl-{int(time.time())}",
-                "object": "chat.completion",
-                "created": int(time.time()),
-                "model": model,
-                "choices": [{
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": content,
-                        "tool_calls": tool_calls,
-                    },
-                    "finish_reason": "tool_calls",
-                }],
-                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
-            })
         else:
-            return JSONResponse({
-                "id": f"chatcmpl-{int(time.time())}",
-                "object": "chat.completion",
-                "created": int(time.time()),
-                "model": model,
-                "choices": [{
-                    "index": 0,
-                    "message": {"role": "assistant", "content": content or ""},
-                    "finish_reason": "stop",
-                }],
-                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
-            })
 # ── Models / Health ─────────────────────────────────────────────
@@ -1011,7 +1042,7 @@ async def list_models():
 async def root():
     return {
         "status": "ok",
-        "version": "7.0.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],
@@ -1030,7 +1061,10 @@ async def health():
 @app.get("/debug/refresh")
 async def force_refresh():
     session.last_refresh = 0
-    await session.refresh(http_client)
     return {
         "refreshed": True,
         "has_cookies": bool(session.cookies),

 Features:
 - Tool/function calling support (always detects tool call tags in output)
 - Auto-continues when upstream hits the ~1K token output limit
+- Rotating proxy with direct-connection fallback
 - SSE keep-alive comments during continuation gaps
 - Message normalization for Orchids.app compatibility
+- Robust error handling with proper JSON error responses
 """
 import asyncio
 import re
 import time
 import uuid
+import traceback
 from typing import Optional
 from urllib.parse import unquote
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
+app = FastAPI(title="Haiku API", version="8.0.0")
 # ── CORS ─────────────────────────────────────────────────────────
 app.add_middleware(
 # ── Proxy Config ─────────────────────────────────────────────────
 PROXY_URL = os.environ.get("PROXY_URL", "")
+PROXY_MAX_RETRIES = 4  # rotating proxy: try a few IPs
 PROXY_RETRY_DELAY = 1  # seconds between proxy retries
+CONNECT_TIMEOUT = 10.0  # short connect timeout
+READ_TIMEOUT = 120.0   # long read timeout (for streaming responses)
+def _make_client(use_proxy: bool = True) -> httpx.AsyncClient:
     """Create an httpx client, with or without proxy."""
     kwargs = dict(
         verify=False,
+        timeout=httpx.Timeout(READ_TIMEOUT, connect=CONNECT_TIMEOUT),
     )
+    if use_proxy and PROXY_URL:
         kwargs["proxy"] = PROXY_URL
     return httpx.AsyncClient(**kwargs)
             if self.cookies and (now - self.last_refresh) < self.refresh_interval:
                 return
+            # Try with proxy first, then fallback to direct
+            for use_proxy in [True, False]:
+                if use_proxy and not PROXY_URL:
+                    continue
+                working_client = client
+                for attempt in range(PROXY_MAX_RETRIES if use_proxy else 2):
+                    try:
+                        if attempt > 0:
+                            try:
+                                await working_client.aclose()
+                            except:
+                                pass
+                            working_client = _make_client(use_proxy=use_proxy)
+                        resp = await working_client.get(
+                            "https://chatgpt.org/claude/chat",
+                            follow_redirects=True,
+                            headers={
+                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36",
+                                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                            },
+                            timeout=20.0,
+                        )
+                        if resp.status_code != 200:
+                            print(f"[Session] GET returned {resp.status_code} (proxy={use_proxy}, attempt {attempt+1})")
+                            await asyncio.sleep(PROXY_RETRY_DELAY)
+                            continue
+                        new_cookies = httpx.Cookies()
+                        for name, value in resp.cookies.items():
+                            new_cookies.set(name, value, domain="chatgpt.org")
+                        for header in resp.headers.get_list("set-cookie"):
+                            parts = header.split(";")[0]
+                            if "=" in parts:
+                                k, v = parts.split("=", 1)
+                                new_cookies.set(k.strip(), v.strip(), domain="chatgpt.org")
+                        xsrf = new_cookies.get("XSRF-TOKEN", domain="chatgpt.org")
+                        if xsrf:
+                            xsrf = unquote(xsrf)
+                        csrf = None
+                        m = re.search(r'<meta\s+name="csrf-token"\s+content="([^"]+)"', resp.text)
+                        if m:
+                            csrf = m.group(1)
+                        self.cookies = new_cookies
+                        self.xsrf_token = xsrf
+                        self.csrf_token = csrf
+                        self.last_refresh = now
+                        mode = "proxy" if use_proxy else "direct"
+                        print(f"[Session] OK ({mode}) — CSRF:{bool(csrf)} XSRF:{bool(xsrf)} Cookies:{list(new_cookies.keys())}")
+                        return working_client
+                    except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
+                        print(f"[Session] Connection error (proxy={use_proxy}, attempt {attempt+1}): {type(e).__name__}")
+                        await asyncio.sleep(PROXY_RETRY_DELAY)
+                        continue
+                    except Exception as e:
+                        print(f"[Session] Error (proxy={use_proxy}, attempt {attempt+1}): {type(e).__name__}: {e}")
                         await asyncio.sleep(PROXY_RETRY_DELAY)
                         continue
+            print("[Session] WARNING: All refresh attempts failed (both proxy and direct)")
 session = SessionState()
 @app.on_event("startup")
 async def startup():
     global http_client
+    http_client = _make_client(use_proxy=bool(PROXY_URL))
+    result = await session.refresh(http_client)
+    if result is not None:
+        http_client = result
 @app.on_event("shutdown")
 async def shutdown():
 )
 # Regex for Format 2: Anthropic XML function_calls blocks
 _ANTHROPIC_FC_BLOCK_RE = re.compile(
     r'<function_calls>\s*(.*?)\s*</function_calls>',
     re.DOTALL
     If no tool calls found, returns ([], original_text).
     """
     tool_calls = []
+    consumed_spans = []
     # --- Format 1: Inline JSON tool calls ---
     for match in _TOOL_CALL_INLINE_RE.finditer(text):
             func_name = invoke_match.group(1)
             invoke_body = invoke_match.group(2)
             params = {}
             for param_match in _ANTHROPIC_PARAM_RE.finditer(invoke_body):
                 param_name = param_match.group(1)
                 param_value = param_match.group(2)
                 try:
                     params[param_name] = json.loads(param_value)
                 except (json.JSONDecodeError, ValueError):
 def _has_incomplete_tool_call(text: str) -> bool:
+    """Check if text has an opening tool call tag without a matching close."""
     # Inline format
     inline_opens = len(re.findall(r'<(?:function_call|tool_call)\s+name="[^"]+">', text))
     inline_closes = len(re.findall(r'</(?:function_call|tool_call)_?>', text))
         return True
     # Anthropic XML format
+    if text.count('<function_calls>') > text.count('</function_calls>'):
         return True
     invoke_opens = len(re.findall(r'<invoke\s+name="[^"]+">', text))
+    if invoke_opens > text.count('</invoke>'):
         return True
     return False
     tool_names = []
     for tool in tools:
         if "function" in tool:
             func = tool["function"]
         else:
+            func = tool
         name = func.get("name", "unknown")
         desc = func.get("description", "No description")
     tools_xml = '\n\n'.join(invoke_blocks)
     choice_instruction = ""
     if tool_choice == "required":
         choice_instruction = "\nIMPORTANT: You MUST call at least one tool."
     elif tool_choice == "none":
+        return ""
     elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
         fname = tool_choice.get("function", {}).get("name", "")
         choice_instruction = f"\nIMPORTANT: You MUST call the {fname} function."
     and inject tool definitions into system prompt if tools are provided."""
     result = []
     tool_system = None
     if tools and tool_choice != "none":
         tool_system = _build_tool_system_prompt(tools, tool_choice)
         role = msg.get("role", "user")
         content = msg.get("content", "")
         if isinstance(content, list):
             content = _flatten_content_array(content)
             content = ""
         content = str(content)
+        # Handle tool role messages
         if role == "tool":
             tool_name = msg.get("name", "unknown_tool")
             tool_call_id = msg.get("tool_call_id", "")
             })
             continue
+        # Handle assistant messages with tool_calls
         if role == "assistant" and msg.get("tool_calls"):
             parts = []
             regular_content = content if content and content.strip() else ""
                     args_json = json.loads(args)
                 except (json.JSONDecodeError, TypeError):
                     args_json = {}
                 invoke_lines = [f'<invoke name="{name}">']
                 for k, v in args_json.items():
                     invoke_lines.append(f'<parameter name="{k}">{v}</parameter>')
                 invoke_lines.append('</invoke>')
                 invoke_parts.append('\n'.join(invoke_lines))
             fc_content = '<function_calls>\n' + '\n'.join(invoke_parts) + '\n</function_calls>'
             combined = regular_content + '\n\n' + fc_content if regular_content else fc_content
             result.append({"role": "assistant", "content": combined})
             system_injected = True
             continue
         if role == "system" and not content.strip():
             continue
         result.append({"role": role, "content": content})
     if tool_system and not system_injected:
         result.insert(0, {"role": "system", "content": tool_system})
     return h
+# ── Proxy-aware request with retry + direct fallback ──────────────
 async def _proxy_post(url: str, **kwargs) -> httpx.Response:
     """POST to *url* via the proxy, falling back to a direct connection.

     When ``PROXY_URL`` is set, the request is attempted up to
     ``PROXY_MAX_RETRIES`` times on the shared module-level ``http_client``.
     After each connection-level failure the shared client is closed and
     replaced with ``_make_client(use_proxy=True)``. If every proxied attempt
     fails, or no proxy is configured, a single attempt is made on a
     throwaway client built with ``_make_client(use_proxy=False)``.

     Args:
         url: Target URL.
         **kwargs: Forwarded unchanged to ``httpx.AsyncClient.post``.

     Returns:
         The first response obtained, regardless of its status code.

     Raises:
         Exception: Anything raised by the direct attempt propagates to the
             caller. During proxied attempts only ``httpx.ConnectError``,
             ``httpx.ProxyError`` and ``httpx.TimeoutException`` are retried;
             any other exception propagates immediately.
     """
     global http_client

     if PROXY_URL:
         for attempt in range(PROXY_MAX_RETRIES):
             try:
                 return await http_client.post(url, **kwargs)
             except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
                 print(f"[Proxy] Connection error #{attempt+1}: {type(e).__name__}")

             # Closing the failed client is best-effort. Catch Exception rather
             # than using a bare except so task cancellation is not swallowed.
             try:
                 await http_client.aclose()
             except Exception:
                 pass

             # Always leave a usable shared client behind, even after the
             # final attempt, because other code keeps using http_client.
             http_client = _make_client(use_proxy=True)

             # No point delaying the direct fallback after the last attempt.
             if attempt + 1 < PROXY_MAX_RETRIES:
                 await asyncio.sleep(PROXY_RETRY_DELAY)

         print("[Proxy] Falling back to direct connection")

     # Direct connection: one-off client, closed as soon as the call finishes.
     async with _make_client(use_proxy=False) as direct_client:
         return await direct_client.post(url, **kwargs)
 # ── Raw call with retries ───────────────────────────────────────
     for attempt in range(2):  # CSRF retry
         for rate_attempt in range(3):  # 429 retry
+            try:
+                resp = await _proxy_post(
+                    "https://chatgpt.org/api/chat",
+                    json=payload,
+                    headers=_headers(),
+                    cookies=session.cookies,
+                )
+            except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
+                print(f"[Chat] Connection failed: {type(e).__name__}")
+                session.last_refresh = 0
+                raise HTTPException(502, f"Cannot reach upstream: {type(e).__name__}")
             if resp.status_code == 419 and attempt == 0:
                 print("[Chat] 419 -> refreshing session...")
     Yields (text, finish_reason) tuples. finish_reason is None for text chunks."""
     finish_reason = None
+    try:
+        async for raw_line in resp.aiter_lines():
+            line = raw_line.strip()
+            if not line or line.startswith(":"):
+                continue
+            if not line.startswith("data: "):
+                continue
+            payload_str = line[6:]
+            if payload_str.strip() == "[DONE]":
+                break
+            try:
+                chunk = json.loads(payload_str)
+            except json.JSONDecodeError:
+                continue
+            for choice in chunk.get("choices", []):
+                delta = choice.get("delta", {})
+                c = delta.get("content", "")
+                if c:
+                    yield c, None
+                fr = choice.get("finish_reason")
+                if fr:
+                    if fr in ("stop", "end_turn"):
+                        finish_reason = "stop"
+                    elif fr in ("length", "max_tokens"):
+                        finish_reason = "length"
+    except (httpx.ReadError, httpx.RemoteProtocolError) as e:
+        print(f"[Stream] Connection lost during streaming: {type(e).__name__}")
+    except Exception as e:
+        print(f"[Stream] Error during streaming: {type(e).__name__}: {e}")
     yield "", finish_reason
         for rate_attempt in range(3):  # 429 retry
             yield ": thinking...\n\n"
+            try:
+                resp = await _proxy_post(
+                    "https://chatgpt.org/api/chat",
+                    json=payload,
+                    headers=_headers(),
+                    cookies=session.cookies,
+                )
+            except (httpx.ConnectError, httpx.ProxyError, httpx.TimeoutException) as e:
+                print(f"[Chat] Connection failed: {type(e).__name__}")
+                session.last_refresh = 0
+                raise HTTPException(502, f"Cannot reach upstream: {type(e).__name__}")
             if resp.status_code == 419 and attempt == 0:
                 print("[Chat] 419 -> refreshing session...")
         })
         chunks.append(f"data: {sse_start}\n\n")
+        # Argument chunks
         args = tc["function"]["arguments"]
         chunk_size = max(1, len(args) // 3)
         for offset in range(0, len(args), chunk_size):
             })
             chunks.append(f"data: {sse_arg}\n\n")
+    # Remaining text alongside tool calls
     if remaining_text.strip():
         sse_text = json.dumps({
             "id": chunk_id,
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
         yield ": thinking...\n\n"
         resp = None
+        try:
+            async for result in _raw_call_streaming(conversation, model):
+                if isinstance(result, str):
+                    yield result
+                else:
+                    resp = result
+        except HTTPException as e:
+            # Send error as SSE then stop
+            error_data = json.dumps({
+                "id": chunk_id,
+                "object": "chat.completion.chunk",
+                "created": created,
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "delta": {"content": f"\n\n[Error: {e.detail}]"},
+                    "finish_reason": None,
+                }],
+            })
+            yield f"data: {error_data}\n\n"
+            yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
+            yield "data: [DONE]\n\n"
+            return
         if resp is None:
+            yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {'content': '[Error: No response from upstream]'}, 'finish_reason': None}]})}\n\n"
+            yield f"data: {json.dumps({'id': chunk_id, 'object': 'chat.completion.chunk', 'created': created, 'model': model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
+            yield "data: [DONE]\n\n"
+            return
         finish_reason = "stop"
         chunk_content = ""
+        # Buffer the full response
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
                 finish_reason = fr
             if text:
                 chunk_content += text
                 total_content += text
                 yield ": streaming...\n\n"
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
+        # ALWAYS check for tool calls
         tool_calls, remaining_text = _parse_tool_calls(total_content)
         if tool_calls:
             print(f"[Chat] Detected {len(tool_calls)} tool call(s)")
             for sse_chunk in _emit_tool_call_chunks(chunk_id, created, model, tool_calls, remaining_text):
                 yield sse_chunk
             return
+        # No tool calls
         if finish_reason == "stop":
             chunk_sz = 50
             for offset in range(0, len(total_content), chunk_sz):
                 piece = total_content[offset:offset + chunk_sz]
                 })
                 yield f"data: {sse_data}\n\n"
             sse_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
             yield "data: [DONE]\n\n"
             return
+        # Auto-continue
         yield ": continuing...\n\n"
         if _has_incomplete_tool_call(chunk_content):
 # ── Non-streaming with auto-continue ────────────────────────────
 async def _collect_with_auto_continue(messages: list[dict], model: str) -> dict:
+    """Collect the full response, auto-continuing if cut off."""
     conversation = list(messages)
     full_content = ""
         tool_calls, remaining_text = _parse_tool_calls(full_content)
         if tool_calls:
             if _has_incomplete_tool_call(full_content) and finish_reason == "length":
+                pass
             else:
                 return {
                     "tool_calls": tool_calls,
     messages_raw = body.get("messages", [])
     stream = body.get("stream", False)
+    # Extract tools
     tools = body.get("tools") or body.get("functions") or None
     tool_choice = body.get("tool_choice", "auto")
+    # Convert old 'functions' format
     if tools and "function" not in tools[0] and "name" in tools[0]:
         tools = [{"type": "function", "function": f} for f in tools]
     print(f"[Request] model={model} stream={stream} tools={bool(tools)} tool_choice={tool_choice} msgs={len(messages_raw)}")
     if not messages_raw or not isinstance(messages_raw, list):
     if not messages:
         raise HTTPException(400, "No valid messages after normalization")
+    try:
+        if stream:
+            return StreamingResponse(
+                _stream_with_auto_continue(messages, model),
+                media_type="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "Connection": "keep-alive",
+                    "X-Accel-Buffering": "no",
+                },
+            )
         else:
+            result = await _collect_with_auto_continue(messages, model)
+            tool_calls = result.get("tool_calls")
+            content = result.get("content")
+            if tool_calls:
+                return JSONResponse({
+                    "id": f"chatcmpl-{int(time.time())}",
+                    "object": "chat.completion",
+                    "created": int(time.time()),
+                    "model": model,
+                    "choices": [{
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": content,
+                            "tool_calls": tool_calls,
+                        },
+                        "finish_reason": "tool_calls",
+                    }],
+                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+                })
+            else:
+                return JSONResponse({
+                    "id": f"chatcmpl-{int(time.time())}",
+                    "object": "chat.completion",
+                    "created": int(time.time()),
+                    "model": model,
+                    "choices": [{
+                        "index": 0,
+                        "message": {"role": "assistant", "content": content or ""},
+                        "finish_reason": "stop",
+                    }],
+                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+                })
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"[Request] Unhandled error: {type(e).__name__}: {e}")
+        print(traceback.format_exc())
+        raise HTTPException(500, f"Internal error: {type(e).__name__}")
 # ── Models / Health ─────────────────────────────────────────────
 async def root():
     return {
         "status": "ok",
+        "version": "8.0.0",
         "proxy": bool(PROXY_URL),
         "tool_calling": True,
         "endpoints": ["/v1/chat/completions", "/v1/models"],
 @app.get("/debug/refresh")
 async def force_refresh():
     session.last_refresh = 0
+    result = await session.refresh(http_client)
+    if result is not None:
+        global http_client
+        http_client = result
     return {
         "refreshed": True,
         "has_cookies": bool(session.cookies),