NanoBotAIAgent committed on
Commit
f348ff6
·
verified ·
1 Parent(s): f1242e0

Update proxy to match 27B space (chat UI, hop-by-hop headers, api-info)

Browse files
Files changed (1) hide show
  1. proxy.py +70 -27
proxy.py CHANGED
@@ -6,16 +6,35 @@ from contextlib import asynccontextmanager
6
 
7
  import httpx
8
  from fastapi import FastAPI, Request, Response
9
- from fastapi.responses import JSONResponse, StreamingResponse
10
- from sse_starlette.sse import EventSourceResponse
11
 
12
  LLAMA_HOST = os.getenv("LLAMA_HOST", "127.0.0.1")
13
  LLAMA_PORT = int(os.getenv("LLAMA_PORT", "8080"))
14
  LLAMA_URL = f"http://{LLAMA_HOST}:{LLAMA_PORT}"
15
 
16
-
17
- async def wait_for_llama(timeout: float = 300.0):
18
- """Wait for llama-server health endpoint to respond."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  start = time.time()
20
  async with httpx.AsyncClient() as client:
21
  while time.time() - start < timeout:
@@ -31,55 +50,79 @@ async def wait_for_llama(timeout: float = 300.0):
31
 
32
  @asynccontextmanager
33
  async def lifespan(app: FastAPI):
34
- ready = await wait_for_llama()
35
- if not ready:
36
- raise RuntimeError("llama-server did not become ready in time")
37
  yield
38
 
39
 
40
  app = FastAPI(lifespan=lifespan)
41
- client = httpx.AsyncClient(base_url=LLAMA_URL, timeout=600)
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
- @app.get("/")
45
  async def root():
46
- return {"status": "ok", "llama_server": LLAMA_URL}
 
 
 
 
 
47
 
48
 
49
  @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH"])
50
  async def proxy(request: Request, path: str):
51
- url = httpx.URL(path=path, query=request.url.query.encode("utf-8"))
52
- headers = dict(request.headers)
53
  headers.pop("host", None)
54
 
55
  body = await request.body()
56
 
57
- # Rewrite /v1/chat/completions payload
58
- if path == "v1/chat/completions" and request.method == "POST":
 
59
  try:
60
  payload = json.loads(body)
61
- # Accept whatever model string the client sends; llama.cpp ignores it anyway
62
  payload.pop("model", None)
 
63
  body = json.dumps(payload).encode()
64
  except Exception:
65
  pass
66
 
67
- rp_resp = await client.request(
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  method=request.method,
69
  url=url,
70
  headers=headers,
71
  content=body,
72
  )
73
 
74
- # Streaming responses from llama.cpp
75
- if "text/event-stream" in rp_resp.headers.get("content-type", ""):
76
- async def event_generator():
77
- async for chunk in rp_resp.aiter_text():
78
- yield chunk
79
- return StreamingResponse(event_generator(), status_code=rp_resp.status_code, headers=dict(rp_resp.headers))
80
-
81
  return Response(
82
- content=rp_resp.content,
83
- status_code=rp_resp.status_code,
84
- headers=dict(rp_resp.headers),
 
85
  )
 
6
 
7
  import httpx
8
  from fastapi import FastAPI, Request, Response
9
+ from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse
 
10
 
11
  LLAMA_HOST = os.getenv("LLAMA_HOST", "127.0.0.1")
12
  LLAMA_PORT = int(os.getenv("LLAMA_PORT", "8080"))
13
  LLAMA_URL = f"http://{LLAMA_HOST}:{LLAMA_PORT}"
14
 
15
+ # Headers that must NOT be copied verbatim. Stripping framing headers from
16
+ # both request and response avoids
17
+ # "Too little data for declared Content-Length" errors (we mutate the JSON
18
+ # body, which changes its length).
19
+ HOP_BY_HOP = {
20
+ "content-length",
21
+ "transfer-encoding",
22
+ "content-encoding",
23
+ "connection",
24
+ "keep-alive",
25
+ "proxy-authenticate",
26
+ "proxy-authorization",
27
+ "te",
28
+ "trailers",
29
+ "upgrade",
30
+ }
31
+
32
+
33
+ def clean_headers(headers):
34
+ return {k: v for k, v in headers.items() if k.lower() not in HOP_BY_HOP}
35
+
36
+
37
+ async def wait_for_llama(timeout: float = 600.0):
38
  start = time.time()
39
  async with httpx.AsyncClient() as client:
40
  while time.time() - start < timeout:
 
50
 
51
  @asynccontextmanager
52
  async def lifespan(app: FastAPI):
53
+ await wait_for_llama()
 
 
54
  yield
55
 
56
 
57
  app = FastAPI(lifespan=lifespan)
58
+ http_client = httpx.AsyncClient(base_url=LLAMA_URL, timeout=None)
59
+
60
+
61
+ CHAT_HTML_PATH = os.path.join(os.path.dirname(__file__), "chat.html")
62
+ try:
63
+ with open(CHAT_HTML_PATH, "r", encoding="utf-8") as _f:
64
+ CHAT_HTML = _f.read()
65
+ except Exception:
66
+ CHAT_HTML = "<h1>Chat UI not found</h1>"
67
+
68
+
69
+ @app.get("/health")
70
+ async def health():
71
+ return {"status": "ok"}
72
 
73
 
74
+ @app.get("/", response_class=HTMLResponse)
75
  async def root():
76
+ return HTMLResponse(CHAT_HTML)
77
+
78
+
79
+ @app.get("/api-info")
80
+ async def api_info():
81
+ return JSONResponse({"status": "ok", "llama_server": LLAMA_URL})
82
 
83
 
84
  @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH"])
85
  async def proxy(request: Request, path: str):
86
+ url = httpx.URL(path="/" + path, query=request.url.query.encode("utf-8"))
87
+ headers = clean_headers(dict(request.headers))
88
  headers.pop("host", None)
89
 
90
  body = await request.body()
91
 
92
+ # Detect streaming requests and strip the (ignored) model field
93
+ is_stream = False
94
+ if request.method == "POST" and path.startswith("v1/"):
95
  try:
96
  payload = json.loads(body)
 
97
  payload.pop("model", None)
98
+ is_stream = bool(payload.get("stream", False))
99
  body = json.dumps(payload).encode()
100
  except Exception:
101
  pass
102
 
103
+ if is_stream:
104
+ async def event_stream():
105
+ async with http_client.stream(
106
+ request.method, url, headers=headers, content=body
107
+ ) as upstream:
108
+ async for chunk in upstream.aiter_raw():
109
+ yield chunk
110
+
111
+ return StreamingResponse(
112
+ event_stream(),
113
+ media_type="text/event-stream",
114
+ )
115
+
116
+ upstream = await http_client.request(
117
  method=request.method,
118
  url=url,
119
  headers=headers,
120
  content=body,
121
  )
122
 
 
 
 
 
 
 
 
123
  return Response(
124
+ content=upstream.content,
125
+ status_code=upstream.status_code,
126
+ headers=clean_headers(dict(upstream.headers)),
127
+ media_type=upstream.headers.get("content-type"),
128
  )