Update proxy to match 27B space (chat UI, hop-by-hop headers, api-info)
Browse files
proxy.py
CHANGED
|
@@ -6,16 +6,35 @@ from contextlib import asynccontextmanager
|
|
| 6 |
|
| 7 |
import httpx
|
| 8 |
from fastapi import FastAPI, Request, Response
|
| 9 |
-
from fastapi.responses import
|
| 10 |
-
from sse_starlette.sse import EventSourceResponse
|
| 11 |
|
| 12 |
LLAMA_HOST = os.getenv("LLAMA_HOST", "127.0.0.1")
|
| 13 |
LLAMA_PORT = int(os.getenv("LLAMA_PORT", "8080"))
|
| 14 |
LLAMA_URL = f"http://{LLAMA_HOST}:{LLAMA_PORT}"
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
start = time.time()
|
| 20 |
async with httpx.AsyncClient() as client:
|
| 21 |
while time.time() - start < timeout:
|
|
@@ -31,55 +50,79 @@ async def wait_for_llama(timeout: float = 300.0):
|
|
| 31 |
|
| 32 |
@asynccontextmanager
|
| 33 |
async def lifespan(app: FastAPI):
|
| 34 |
-
|
| 35 |
-
if not ready:
|
| 36 |
-
raise RuntimeError("llama-server did not become ready in time")
|
| 37 |
yield
|
| 38 |
|
| 39 |
|
| 40 |
app = FastAPI(lifespan=lifespan)
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
-
@app.get("/")
|
| 45 |
async def root():
|
| 46 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH"])
|
| 50 |
async def proxy(request: Request, path: str):
|
| 51 |
-
url = httpx.URL(path=path, query=request.url.query.encode("utf-8"))
|
| 52 |
-
headers = dict(request.headers)
|
| 53 |
headers.pop("host", None)
|
| 54 |
|
| 55 |
body = await request.body()
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
|
|
|
| 59 |
try:
|
| 60 |
payload = json.loads(body)
|
| 61 |
-
# Accept whatever model string the client sends; llama.cpp ignores it anyway
|
| 62 |
payload.pop("model", None)
|
|
|
|
| 63 |
body = json.dumps(payload).encode()
|
| 64 |
except Exception:
|
| 65 |
pass
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
method=request.method,
|
| 69 |
url=url,
|
| 70 |
headers=headers,
|
| 71 |
content=body,
|
| 72 |
)
|
| 73 |
|
| 74 |
-
# Streaming responses from llama.cpp
|
| 75 |
-
if "text/event-stream" in rp_resp.headers.get("content-type", ""):
|
| 76 |
-
async def event_generator():
|
| 77 |
-
async for chunk in rp_resp.aiter_text():
|
| 78 |
-
yield chunk
|
| 79 |
-
return StreamingResponse(event_generator(), status_code=rp_resp.status_code, headers=dict(rp_resp.headers))
|
| 80 |
-
|
| 81 |
return Response(
|
| 82 |
-
content=
|
| 83 |
-
status_code=
|
| 84 |
-
headers=dict(
|
|
|
|
| 85 |
)
|
|
|
|
| 6 |
|
| 7 |
import httpx
|
| 8 |
from fastapi import FastAPI, Request, Response
|
| 9 |
+
from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse
|
|
|
|
| 10 |
|
| 11 |
# Address of the upstream llama-server. Host and port can each be overridden
# through the environment; the defaults point at 127.0.0.1:8080.
LLAMA_HOST = os.environ.get("LLAMA_HOST", "127.0.0.1")
LLAMA_PORT = int(os.environ.get("LLAMA_PORT", "8080"))
LLAMA_URL = "http://{}:{}".format(LLAMA_HOST, LLAMA_PORT)
|
| 14 |
|
| 15 |
+
# Headers that must NOT be copied verbatim between client and upstream.
# Besides the per-connection (hop-by-hop) fields, the framing headers
# content-length / transfer-encoding / content-encoding are stripped from both
# request and response: we mutate the JSON body, which changes its length, and
# a stale value leads to "Too little data for declared Content-Length" errors.
HOP_BY_HOP = frozenset({
    "content-length",
    "transfer-encoding",
    "content-encoding",
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailer",   # the actual header field name
    "trailers",  # historical spelling, kept for backward compatibility
    "upgrade",
})


def clean_headers(headers):
    """Return a new dict of *headers* without hop-by-hop/framing fields.

    Names are compared case-insensitively; the original key casing is kept
    for the entries that survive. In addition to the fixed ``HOP_BY_HOP``
    set, any field nominated by a ``Connection`` header (for example
    ``Connection: close, X-Foo``) is dropped as well, since such fields are
    only meaningful for a single connection.

    Args:
        headers: mapping of header name to header value (both strings).

    Returns:
        A plain ``dict`` holding the headers that are safe to forward.
    """
    blocked = set(HOP_BY_HOP)
    for name, value in headers.items():
        if name.lower() == "connection":
            # Each comma-separated token names another per-connection field.
            blocked.update(
                token.strip().lower()
                for token in value.split(",")
                if token.strip()
            )
    return {k: v for k, v in headers.items() if k.lower() not in blocked}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
async def wait_for_llama(timeout: float = 600.0):
|
| 38 |
start = time.time()
|
| 39 |
async with httpx.AsyncClient() as client:
|
| 40 |
while time.time() - start < timeout:
|
|
|
|
| 50 |
|
| 51 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: wait for llama-server, then clean up on exit.

    Startup awaits ``wait_for_llama()`` before the app begins serving.
    Shutdown closes the shared ``http_client`` so its pooled connections are
    released instead of being left open when the process stops.
    """
    # NOTE(review): the return value of wait_for_llama() is not inspected
    # here, so startup continues whatever it returns -- confirm this is
    # the intended behaviour.
    await wait_for_llama()
    try:
        yield
    finally:
        # http_client is created at module level after this function is
        # defined; it exists by the time the lifespan actually runs.
        await http_client.aclose()
|
| 55 |
|
| 56 |
|
| 57 |
app = FastAPI(lifespan=lifespan)

# Single shared client used for every proxied request. timeout=None turns
# off httpx's timeouts (presumably because generations can run for a long
# time -- confirm).
http_client = httpx.AsyncClient(timeout=None, base_url=LLAMA_URL)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Load the chat page once at import time. If chat.html (expected next to this
# module) is missing or unreadable, fall back to a stub page so the app can
# still start.
CHAT_HTML_PATH = os.path.join(os.path.dirname(__file__), "chat.html")
try:
    with open(CHAT_HTML_PATH, "r", encoding="utf-8") as _f:
        CHAT_HTML = _f.read()
except (OSError, UnicodeError):
    # Only I/O and decoding failures are expected here; anything else would
    # be a programming error and should surface rather than be hidden.
    CHAT_HTML = "<h1>Chat UI not found</h1>"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@app.get("/health")
async def health():
    """Health endpoint answered with a fixed ``{"status": "ok"}`` body."""
    status = {"status": "ok"}
    return status
|
| 72 |
|
| 73 |
|
| 74 |
+
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the chat page text held in ``CHAT_HTML`` as an HTML response."""
    page = HTMLResponse(content=CHAT_HTML)
    return page
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@app.get("/api-info")
async def api_info():
    """Return a JSON document with a status flag and the upstream base URL."""
    info = {"status": "ok", "llama_server": LLAMA_URL}
    return JSONResponse(content=info)
|
| 82 |
|
| 83 |
|
| 84 |
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH"])
async def proxy(request: Request, path: str):
    """Forward a request to llama-server and relay the upstream response.

    The incoming headers are passed through ``clean_headers`` and the
    ``host`` header is removed. For ``POST`` requests under ``v1/`` whose
    body is a JSON object, the ``model`` field is removed and the ``stream``
    flag decides between a streamed and a buffered relay. Any other body is
    forwarded unchanged.

    Args:
        request: the incoming request.
        path: the request path captured by the catch-all route (without the
            leading slash).

    Returns:
        A ``StreamingResponse`` when the client asked for streaming,
        otherwise a buffered ``Response``. Both carry the upstream status
        code and the upstream headers after ``clean_headers``.
    """
    url = httpx.URL(path="/" + path, query=request.url.query.encode("utf-8"))
    headers = clean_headers(dict(request.headers))
    headers.pop("host", None)

    body = await request.body()

    # Detect streaming requests and strip the (ignored) model field.
    is_stream = False
    if request.method == "POST" and path.startswith("v1/"):
        try:
            payload = json.loads(body)
        except (ValueError, RecursionError):
            # Not parseable JSON (JSONDecodeError and UnicodeDecodeError are
            # ValueError subclasses): forward the body untouched.
            payload = None
        # Only a JSON object can carry "model"/"stream"; other JSON values
        # (lists, strings, numbers) are forwarded as received.
        if isinstance(payload, dict):
            payload.pop("model", None)
            is_stream = bool(payload.get("stream", False))
            body = json.dumps(payload).encode()

    if is_stream:
        # Open the upstream response BEFORE building our own response so the
        # real status code and headers can be relayed; otherwise an upstream
        # error would reach the client as "200 text/event-stream".
        upstream_request = http_client.build_request(
            request.method, url, headers=headers, content=body
        )
        upstream = await http_client.send(upstream_request, stream=True)

        async def event_stream():
            try:
                # aiter_bytes() yields content-decoded bytes, which matches
                # the fact that clean_headers() drops content-encoding.
                async for chunk in upstream.aiter_bytes():
                    yield chunk
            finally:
                # Release the upstream connection on completion, error or
                # client disconnect.
                await upstream.aclose()

        return StreamingResponse(
            event_stream(),
            status_code=upstream.status_code,
            headers=clean_headers(dict(upstream.headers)),
            media_type=upstream.headers.get("content-type", "text/event-stream"),
        )

    upstream = await http_client.request(
        method=request.method,
        url=url,
        headers=headers,
        content=body,
    )

    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        headers=clean_headers(dict(upstream.headers)),
        media_type=upstream.headers.get("content-type"),
    )
|