Spaces:

overwrite69
/

haiku-api

Sleeping

App Files Community

overwrite69 committed 6 days ago

Commit

49412a5

verified ·

1 Parent(s): 9bc9aad

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +106 -31

app.py CHANGED Viewed

@@ -150,33 +150,45 @@ def _headers() -> dict:
     return h
-async def _raw_call(messages: list[dict], model: str) -> httpx.Response:
-    """Make a single POST to chatgpt.org/api/chat, returns raw streaming Response."""
     await session.refresh(http_client)
     payload = {"model": model, "messages": messages}
     for attempt in range(2):
-        resp = await http_client.post(
-            "https://chatgpt.org/api/chat",
-            json=payload,
-            headers=_headers(),
-            cookies=session.cookies,
-        )
-        if resp.status_code == 419 and attempt == 0:
-            print("[Chat] 419 → refreshing session...")
-            session.last_refresh = 0
-            await session.refresh(http_client)
-            continue
-        if resp.status_code == 429:
-            raise HTTPException(429, "Rate limited by upstream")
-        if resp.status_code != 200:
-            session.last_refresh = 0
-            raise HTTPException(resp.status_code, f"Upstream {resp.status_code}: {resp.text[:300]}")
-        return resp
     raise HTTPException(500, "Failed after retry")
@@ -228,6 +240,50 @@ async def _stream_one_response(resp):
 # ── Main streaming endpoint with auto-continue ──────────────────
 MAX_CONTINUATIONS = 20
 async def _stream_with_auto_continue(messages: list[dict], model: str):
     """
     Stream the response in real-time. If it gets cut off (length limit),
@@ -240,8 +296,20 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
-        # Make the upstream call
-        resp = await _raw_call(conversation, model)
         # Stream it in real-time
         finish_reason = "stop"
@@ -249,14 +317,12 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
-                # This is the final yield with finish_reason
                 finish_reason = fr
                 continue
             if text:
                 chunk_content += text
                 total_content += text
-                # Send the content chunk to the client immediately
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
@@ -273,7 +339,6 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
         if finish_reason == "stop":
-            # All done — send final chunk with finish_reason=stop
             sse_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
@@ -289,17 +354,15 @@ async def _stream_with_auto_continue(messages: list[dict], model: str):
             yield "data: [DONE]\n\n"
             return
-        # Response was cut off — need to auto-continue
-        # Send a keep-alive comment so the client doesn't timeout
         yield ": continuing...\n\n"
-        # Append to conversation for next round
         conversation.append({"role": "assistant", "content": chunk_content})
         conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
         print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
-    # Safety: hit max continuations
     sse_data = json.dumps({
         "id": chunk_id,
         "object": "chat.completion.chunk",
@@ -418,6 +481,18 @@ async def health():
     return {"status": "ok", "session_active": bool(session.cookies)}
 @app.get("/debug/session")
 async def debug_session():
     return {

     return h
+async def _raw_call(messages: list[dict], model: str, retry_on_429: bool = True) -> httpx.Response:
+    """Make a single POST to chatgpt.org/api/chat, returns raw streaming Response.
+    Retries with backoff on 429 rate limits."""
     await session.refresh(http_client)
     payload = {"model": model, "messages": messages}
+    # CSRF retry
     for attempt in range(2):
+        # 429 retry with backoff
+        for rate_attempt in range(3):  # up to 3 attempts on 429
+            resp = await http_client.post(
+                "https://chatgpt.org/api/chat",
+                json=payload,
+                headers=_headers(),
+                cookies=session.cookies,
+            )
+            if resp.status_code == 419 and attempt == 0:
+                print("[Chat] 419 → refreshing session...")
+                session.last_refresh = 0
+                await session.refresh(http_client)
+                break  # break inner loop, retry CSRF
+            if resp.status_code == 429:
+                wait_time = (rate_attempt + 1) * 10  # 10s, 20s, 30s
+                print(f"[Chat] 429 rate limited, waiting {wait_time}s (attempt {rate_attempt+1}/3)...")
+                session.last_refresh = 0
+                await session.refresh(http_client)
+                if retry_on_429 and rate_attempt < 2:
+                    await asyncio.sleep(wait_time)
+                    continue
+                raise HTTPException(429, f"Rate limited by upstream after {rate_attempt+1} retries")
+            if resp.status_code != 200:
+                session.last_refresh = 0
+                raise HTTPException(resp.status_code, f"Upstream {resp.status_code}: {resp.text[:300]}")
+            return resp
     raise HTTPException(500, "Failed after retry")
 # ── Main streaming endpoint with auto-continue ──────────────────
 MAX_CONTINUATIONS = 20
+async def _raw_call_streaming(messages: list[dict], model: str):
+    """Like _raw_call but yields SSE keep-alive comments during 429 retries.
+    For use in streaming mode so the client connection stays alive."""
+    await session.refresh(http_client)
+    payload = {"model": model, "messages": messages}
+    for attempt in range(2):  # CSRF retry
+        for rate_attempt in range(3):  # 429 retry
+            resp = await http_client.post(
+                "https://chatgpt.org/api/chat",
+                json=payload,
+                headers=_headers(),
+                cookies=session.cookies,
+            )
+            if resp.status_code == 419 and attempt == 0:
+                print("[Chat] 419 → refreshing session...")
+                session.last_refresh = 0
+                await session.refresh(http_client)
+                break
+            if resp.status_code == 429:
+                wait_time = (rate_attempt + 1) * 10
+                print(f"[Chat] 429 rate limited, waiting {wait_time}s (attempt {rate_attempt+1}/3)...")
+                session.last_refresh = 0
+                await session.refresh(http_client)
+                if rate_attempt < 2:
+                    # Send keep-alive pings while waiting
+                    for _ in range(wait_time):
+                        yield ": retrying...\n\n"
+                        await asyncio.sleep(1)
+                    continue
+                raise HTTPException(429, f"Rate limited after {rate_attempt+1} retries")
+            if resp.status_code != 200:
+                session.last_refresh = 0
+                raise HTTPException(resp.status_code, f"Upstream {resp.status_code}: {resp.text[:300]}")
+            yield resp
+            return
+    raise HTTPException(500, "Failed after retry")
 async def _stream_with_auto_continue(messages: list[dict], model: str):
     """
     Stream the response in real-time. If it gets cut off (length limit),
     total_content = ""
     for cont_num in range(MAX_CONTINUATIONS):
+        # Send keep-alive before making the call
+        yield ": thinking...\n\n"
+        # Make the upstream call (with keep-alive during 429 retries)
+        resp = None
+        async for result in _raw_call_streaming(conversation, model):
+            if isinstance(result, str):
+                # This is a keep-alive comment
+                yield result
+            else:
+                resp = result
+        if resp is None:
+            raise HTTPException(500, "No response from upstream")
         # Stream it in real-time
         finish_reason = "stop"
         async for text, fr in _stream_one_response(resp):
             if fr is not None:
                 finish_reason = fr
                 continue
             if text:
                 chunk_content += text
                 total_content += text
                 sse_data = json.dumps({
                     "id": chunk_id,
                     "object": "chat.completion.chunk",
         print(f"[Chat] Chunk #{cont_num+1}: {len(chunk_content)} chars, finish={finish_reason}")
         if finish_reason == "stop":
             sse_data = json.dumps({
                 "id": chunk_id,
                 "object": "chat.completion.chunk",
             yield "data: [DONE]\n\n"
             return
+        # Auto-continue — send keep-alive
         yield ": continuing...\n\n"
         conversation.append({"role": "assistant", "content": chunk_content})
         conversation.append({"role": "user", "content": "Continue exactly from where you left off. Do not repeat any text you already wrote."})
         print(f"[Chat] Auto-continue #{cont_num+1}, total so far: {len(total_content)} chars")
+    # Safety
     sse_data = json.dumps({
         "id": chunk_id,
         "object": "chat.completion.chunk",
     return {"status": "ok", "session_active": bool(session.cookies)}
+@app.get("/debug/refresh")
+async def force_refresh():
+    """Force refresh the session cookies."""
+    session.last_refresh = 0
+    await session.refresh(http_client)
+    return {
+        "refreshed": True,
+        "has_cookies": bool(session.cookies),
+        "has_csrf": bool(session.csrf_token),
+    }
 @app.get("/debug/session")
 async def debug_session():
     return {