graphtestbed/agents/cliproxyapi/clamp_proxy.py
Zhu Jiajun (jz28583)
Add clamp_proxy + route AI-Build-AI through it (max_tokens fix)
61a7817
"""Tiny pass-through HTTP proxy that clamps max_tokens to <=128000.
Why: aibuildai's bundled Claude SDK (claude-cli 2.1.44) hardcodes
max_tokens=128001 in its first /v1/messages request — one over Anthropic's
current cap for claude-sonnet-4-6. The SDK ignores
CLAUDE_CODE_MAX_OUTPUT_TOKENS in this version and exposes no CLI flag, so
we intercept on the network and clamp before forwarding to CLIProxyAPI.
Usage:
.venv/bin/python -m agents.cliproxyapi.clamp_proxy
# then in the agent's env:
# ANTHROPIC_BASE_URL=http://127.0.0.1:8318
# ANTHROPIC_API_KEY=<the proxy's api-key>
Env vars:
CLIPROXYAPI_HOST/PORT upstream CLIProxyAPI (default 127.0.0.1:8317)
CLIPROXYAPI_CLAMP_PORT this server's port (default 8318)
CLIPROXYAPI_MAX_TOKENS_CAP cap value (default 128000)
"""
from __future__ import annotations
import json
import os
import requests
from flask import Flask, Response, request, stream_with_context
UPSTREAM_HOST = os.environ.get("CLIPROXYAPI_HOST", "127.0.0.1")
UPSTREAM_PORT = int(os.environ.get("CLIPROXYAPI_PORT", "8317"))
UPSTREAM = f"http://{UPSTREAM_HOST}:{UPSTREAM_PORT}"
LISTEN_PORT = int(os.environ.get("CLIPROXYAPI_CLAMP_PORT", "8318"))
MAX_TOKENS_CAP = int(os.environ.get("CLIPROXYAPI_MAX_TOKENS_CAP", "128000"))
app = Flask(__name__)
_HOP_BY_HOP = {
"connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
"te", "trailers", "transfer-encoding", "upgrade",
"content-encoding", "content-length", "host",
}
def _clamp_max_tokens(body: bytes) -> tuple[bytes, bool]:
"""If JSON body has max_tokens > cap, clamp it. Returns (body, clamped?)."""
if not body:
return body, False
try:
obj = json.loads(body)
except (ValueError, TypeError):
return body, False
if not isinstance(obj, dict):
return body, False
mt = obj.get("max_tokens")
if isinstance(mt, int) and mt > MAX_TOKENS_CAP:
obj["max_tokens"] = MAX_TOKENS_CAP
return json.dumps(obj).encode(), True
return body, False
@app.route("/", defaults={"path": ""}, methods=[
"GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"])
@app.route("/<path:path>", methods=[
"GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"])
def forward(path: str):
url = f"{UPSTREAM}/{path}"
if request.query_string:
url += "?" + request.query_string.decode()
body = request.get_data() if request.method in ("POST", "PUT", "PATCH") else None
clamped = False
if body is not None:
body, clamped = _clamp_max_tokens(body)
headers = {k: v for k, v in request.headers
if k.lower() not in _HOP_BY_HOP}
if clamped:
# Recompute Content-Length implicitly by letting requests handle it.
headers.pop("Content-Length", None)
headers.pop("content-length", None)
upstream = requests.request(
method=request.method,
url=url,
headers=headers,
data=body,
stream=True,
timeout=900,
allow_redirects=False,
)
resp_headers = [(k, v) for k, v in upstream.headers.items()
if k.lower() not in _HOP_BY_HOP]
def gen():
for chunk in upstream.iter_content(chunk_size=8192):
if chunk:
yield chunk
return Response(stream_with_context(gen()),
status=upstream.status_code,
headers=resp_headers)
if __name__ == "__main__":
print(f"clamp-proxy listening on :{LISTEN_PORT}{UPSTREAM} "
f"(max_tokens cap = {MAX_TOKENS_CAP})", flush=True)
app.run(host="0.0.0.0", port=LISTEN_PORT, threaded=True, use_reloader=False)