| """Tiny pass-through HTTP proxy that clamps max_tokens to <=128000. | |
| Why: aibuildai's bundled Claude SDK (claude-cli 2.1.44) hardcodes | |
| max_tokens=128001 in its first /v1/messages request — one over Anthropic's | |
| current cap for claude-sonnet-4-6. The SDK ignores | |
| CLAUDE_CODE_MAX_OUTPUT_TOKENS in this version and exposes no CLI flag, so | |
| we intercept on the network and clamp before forwarding to CLIProxyAPI. | |
| Usage: | |
| .venv/bin/python -m agents.cliproxyapi.clamp_proxy | |
| # then in the agent's env: | |
| # ANTHROPIC_BASE_URL=http://127.0.0.1:8318 | |
| # ANTHROPIC_API_KEY=<the proxy's api-key> | |
| Env vars: | |
| CLIPROXYAPI_HOST/PORT upstream CLIProxyAPI (default 127.0.0.1:8317) | |
| CLIPROXYAPI_CLAMP_PORT this server's port (default 8318) | |
| CLIPROXYAPI_MAX_TOKENS_CAP cap value (default 128000) | |
| """ | |

from __future__ import annotations

import json
import os

import requests
from flask import Flask, Response, request, stream_with_context

UPSTREAM_HOST = os.environ.get("CLIPROXYAPI_HOST", "127.0.0.1")
UPSTREAM_PORT = int(os.environ.get("CLIPROXYAPI_PORT", "8317"))
UPSTREAM = f"http://{UPSTREAM_HOST}:{UPSTREAM_PORT}"
LISTEN_PORT = int(os.environ.get("CLIPROXYAPI_CLAMP_PORT", "8318"))
MAX_TOKENS_CAP = int(os.environ.get("CLIPROXYAPI_MAX_TOKENS_CAP", "128000"))
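# Example override (host and cap values here are illustrative, not defaults):
# point the proxy at a remote CLIProxyAPI and lower the cap:
#   CLIPROXYAPI_HOST=10.0.0.5 CLIPROXYAPI_MAX_TOKENS_CAP=64000 \
#       .venv/bin/python -m agents.cliproxyapi.clamp_proxy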

app = Flask(__name__)
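
# Hop-by-hop headers (RFC 7230) must not be forwarded by a proxy.
# content-encoding and content-length are also dropped: requests transparently
# decompresses the response bodies it streams and recomputes the length of a
# re-sent clamped body; host is dropped so requests derives it from the
# upstream URL.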
_HOP_BY_HOP = {
    "connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
    "te", "trailers", "transfer-encoding", "upgrade",
    "content-encoding", "content-length", "host",
}


def _clamp_max_tokens(body: bytes) -> tuple[bytes, bool]:
    """If JSON body has max_tokens > cap, clamp it. Returns (body, clamped?)."""
    if not body:
        return body, False
    try:
        obj = json.loads(body)
    except (ValueError, TypeError):
        return body, False
    if not isinstance(obj, dict):
        return body, False
    mt = obj.get("max_tokens")
    if isinstance(mt, int) and mt > MAX_TOKENS_CAP:
        obj["max_tokens"] = MAX_TOKENS_CAP
        return json.dumps(obj).encode(), True
    return body, False
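
# Example behaviour with the default cap of 128000:
#   _clamp_max_tokens(b'{"max_tokens": 128001}') -> (b'{"max_tokens": 128000}', True)
#   _clamp_max_tokens(b'{"max_tokens": 4096}')   -> (b'{"max_tokens": 4096}', False)
# Empty, non-JSON, and non-object bodies pass through untouched.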


# Catch-all routes so every path (e.g. /v1/messages) is proxied verbatim.
@app.route("/", defaults={"path": ""},
           methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"])
@app.route("/<path:path>",
           methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"])
def forward(path: str):
    url = f"{UPSTREAM}/{path}"
    if request.query_string:
        url += "?" + request.query_string.decode()
    body = request.get_data() if request.method in ("POST", "PUT", "PATCH") else None
    clamped = False
    if body is not None:
        body, clamped = _clamp_max_tokens(body)
    headers = {k: v for k, v in request.headers
               if k.lower() not in _HOP_BY_HOP}
    if clamped:
        # Content-Length was already stripped by the hop-by-hop filter, so
        # requests recomputes it from the clamped body; just log the clamp.
        print(f"clamp-proxy: clamped max_tokens to {MAX_TOKENS_CAP} on /{path}",
              flush=True)
    upstream = requests.request(
        method=request.method,
        url=url,
        headers=headers,
        data=body,
        stream=True,
        timeout=900,
        allow_redirects=False,
    )
    resp_headers = [(k, v) for k, v in upstream.headers.items()
                    if k.lower() not in _HOP_BY_HOP]

    def gen():
        for chunk in upstream.iter_content(chunk_size=8192):
            if chunk:
                yield chunk

    return Response(stream_with_context(gen()),
                    status=upstream.status_code,
                    headers=resp_headers)
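
# Streaming note: stream=True plus the chunked generator means the proxy never
# buffers a whole response, so Anthropic-style SSE streams reach the client as
# they arrive; stream_with_context keeps Flask's request context alive for the
# lifetime of the generator.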


if __name__ == "__main__":
    print(f"clamp-proxy listening on :{LISTEN_PORT} → {UPSTREAM} "
          f"(max_tokens cap = {MAX_TOKENS_CAP})", flush=True)
    app.run(host="0.0.0.0", port=LISTEN_PORT, threaded=True, use_reloader=False)
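
# Manual smoke test (a sketch; the body and key are placeholders, not real
# values): with the proxy and CLIProxyAPI both running, an over-cap request
# should arrive upstream with max_tokens=128000:
#   curl -s http://127.0.0.1:8318/v1/messages \
#     -H 'content-type: application/json' \
#     -H 'anthropic-version: 2023-06-01' \
#     -H 'x-api-key: <the proxy's api-key>' \
#     -d '{"model":"claude-sonnet-4-6","max_tokens":128001,"messages":[{"role":"user","content":"ping"}]}'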