""" OpenClaw SSE Proxy - Strips reasoning_content from TRT-LLM SSE chunks so OpenClaw renders text instead of "..." - Strips ... blocks from content (stateful per-request) - Injects repetition_penalty: 1.15 to prevent loops - Injects "detailed thinking off" into system prompt (Nemotron suppresses thinking via system prompt) Run on :5002 — point OpenClaw at http://localhost:5002 instead of :5001 """ import json import asyncio import httpx from fastapi import FastAPI, Request, Response from fastapi.responses import StreamingResponse import uvicorn UPSTREAM = "http://localhost:5001" app = FastAPI() def strip_chunk(data_str: str, think_state: list) -> str | None: """ Clean a single SSE data chunk: - Remove reasoning_content field - Strip ... block from content (stateful — think_state[0] tracks position) think_state[0]: "before" | "inside" | "after" Returns cleaned JSON string, or None to suppress the chunk entirely. """ try: obj = json.loads(data_str) for choice in obj.get("choices", []): delta = choice.get("delta", {}) delta.pop("reasoning_content", None) content = delta.get("content", "") if not content: continue state = think_state[0] if state == "before": if "" in content: think_state[0] = "inside" after_open = content.split("", 1)[1] if "" in after_open: think_state[0] = "after" delta["content"] = after_open.split("", 1)[1] else: delta.pop("content", None) # null not "" — avoids OpenClaw treating as finished # else: no think tag yet, pass through as-is elif state == "inside": if "" in content: think_state[0] = "after" delta["content"] = content.split("", 1)[1] else: delta.pop("content", None) # null not "" — avoids OpenClaw treating as finished # state == "after": pass through as-is return json.dumps(obj) except (json.JSONDecodeError, AttributeError): return data_str async def stream_response(upstream_url: str, method: str, headers: dict, body: bytes): think_state = ["before"] # mutable so nested function can update it chunk_n = 0 log_f = open(r"D:\AI\models\proxy_debug.log", "a", encoding="utf-8") log_f.write("=== REQUEST ===\n") log_f.flush() try: async with httpx.AsyncClient(timeout=300.0) as client: async with client.stream(method, upstream_url, headers=headers, content=body) as resp: log_f.write(f"HTTP {resp.status_code}\n") log_f.flush() raw_n = 0 async for line in resp.aiter_lines(): raw_n += 1 if raw_n <= 20: log_f.write(f"RAW[{raw_n}]: {repr(line[:300])}\n") log_f.flush() if line.startswith("data: "): payload = line[6:] if payload.strip() == "[DONE]": log_f.write("[DONE]\n") log_f.flush() yield "data: [DONE]\n\n" else: chunk_n += 1 cleaned = strip_chunk(payload, think_state) if cleaned is not None: yield f"data: {cleaned}\n\n" elif line == "": pass else: yield f"{line}\n\n" finally: log_f.write(f"total_chunks={chunk_n}\n\n") log_f.close() @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"]) async def proxy(path: str, request: Request): upstream_url = f"{UPSTREAM}/{path}" body = await request.body() # Forward headers, drop hop-by-hop skip = {"host", "content-length", "transfer-encoding", "connection"} headers = {k: v for k, v in request.headers.items() if k.lower() not in skip} # Check if client wants streaming; inject repetition_penalty to prevent loops is_stream = False if body: try: payload = json.loads(body) is_stream = payload.get("stream", False) payload.setdefault("repetition_penalty", 1.15) # Suppress Nemotron thinking via system prompt. messages = payload.get("messages", []) if messages and messages[0].get("role") == "system": sys_content = messages[0].get("content", "") if "detailed thinking off" not in sys_content: messages[0]["content"] = "detailed thinking off\n\n" + sys_content elif messages: messages.insert(0, {"role": "system", "content": "detailed thinking off\n\nYou are a helpful assistant."}) body = json.dumps(payload).encode() except json.JSONDecodeError: pass if is_stream: return StreamingResponse( stream_response(upstream_url, request.method, headers, body), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, ) else: async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.request( request.method, upstream_url, headers=headers, content=body ) # Strip reasoning_content and ... from non-streaming response try: obj = resp.json() for choice in obj.get("choices", []): msg = choice.get("message", {}) msg.pop("reasoning_content", None) content = msg.get("content", "") if content and "" in content and "" in content: msg["content"] = content.split("", 1)[1].lstrip("\n") clean = json.dumps(obj).encode() return Response(content=clean, status_code=resp.status_code, headers={"Content-Type": "application/json"}) except Exception: return Response(content=resp.content, status_code=resp.status_code, headers=dict(resp.headers)) if __name__ == "__main__": print("OpenClaw SSE Proxy :5002 -> :5001") print("Point OpenClaw at http://localhost:5002") uvicorn.run(app, host="localhost", port=5002, log_level="warning")