import asyncio import json import os import time from contextlib import asynccontextmanager import httpx from fastapi import FastAPI, Request, Response from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse LLAMA_HOST = os.getenv("LLAMA_HOST", "127.0.0.1") LLAMA_PORT = int(os.getenv("LLAMA_PORT", "8080")) LLAMA_URL = f"http://{LLAMA_HOST}:{LLAMA_PORT}" # Headers that must NOT be copied verbatim. Stripping framing headers from # both request and response avoids # "Too little data for declared Content-Length" errors (we mutate the JSON # body, which changes its length). HOP_BY_HOP = { "content-length", "transfer-encoding", "content-encoding", "connection", "keep-alive", "proxy-authenticate", "proxy-authorization", "te", "trailers", "upgrade", } def clean_headers(headers): return {k: v for k, v in headers.items() if k.lower() not in HOP_BY_HOP} async def wait_for_llama(timeout: float = 600.0): start = time.time() async with httpx.AsyncClient() as client: while time.time() - start < timeout: try: r = await client.get(f"{LLAMA_URL}/health", timeout=2) if r.status_code == 200: return True except Exception: pass await asyncio.sleep(1) return False @asynccontextmanager async def lifespan(app: FastAPI): await wait_for_llama() yield app = FastAPI(lifespan=lifespan) http_client = httpx.AsyncClient(base_url=LLAMA_URL, timeout=None) CHAT_HTML_PATH = os.path.join(os.path.dirname(__file__), "chat.html") try: with open(CHAT_HTML_PATH, "r", encoding="utf-8") as _f: CHAT_HTML = _f.read() except Exception: CHAT_HTML = "