"""CORS-enabled reverse proxy in front of a local llama-cpp-python server.

When run as a script this module:

1. launches ``python3 -m llama_cpp.server`` on ``BACKEND_HOST:BACKEND_PORT``
   in a background thread, and
2. serves a FastAPI app on ``$PORT`` (default 7860) that forwards every
   request to that backend and streams the response back, with permissive
   CORS headers added by the middleware.

Environment variables read: ``MODEL_FILE`` (required), ``N_CTX`` (default
``"4096"``) and ``PORT`` (default ``7860``).
"""
import os
import subprocess
import threading
import time

import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse

app = FastAPI()

# Enable CORS for the local developer dashboard
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single source of truth for where the llama engine listens; used both by the
# proxy client and by the command line that launches the engine.
BACKEND_HOST = "127.0.0.1"
BACKEND_PORT = 8001
BACKEND_URL = f"http://{BACKEND_HOST}:{BACKEND_PORT}"
# timeout=None: no client-side timeout is applied to backend requests.
client = httpx.AsyncClient(base_url=BACKEND_URL, timeout=None)


async def _stream_and_close(upstream):
    """Yield the raw body of *upstream*, then always close the response.

    A response obtained with ``stream=True`` keeps its connection checked out
    until it is closed, so the close happens in ``finally`` to cover both
    normal completion and early termination of the generator.
    """
    try:
        async for chunk in upstream.aiter_raw():
            yield chunk
    finally:
        await upstream.aclose()


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
async def proxy(request: Request, path: str):
    """Forward the incoming request to the backend and stream back its reply.

    Args:
        request: The incoming request; its method, query string, headers and
            body are forwarded.
        path: The request path captured by the catch-all route.

    Returns:
        A ``StreamingResponse`` carrying the backend's status code, headers
        and raw body, or a 502 JSON response of the form ``{"error": ...}``
        when the backend request fails at the HTTP transport level.
    """
    url = f"{BACKEND_URL}/{path}"

    # Forward the request
    content = await request.body()
    headers = dict(request.headers)
    # Remove host header to avoid proxy detection issues
    headers.pop("host", None)

    # multi_items() keeps repeated query keys (?a=1&a=2); the plain mapping
    # view exposes only one value per key.
    rp_req = client.build_request(
        request.method,
        url,
        params=request.query_params.multi_items(),
        headers=headers,
        content=content,
    )

    try:
        # We use a streaming request to handle potentially large responses
        # although llama-cpp-python server has its own streaming logic
        rp_resp = await client.send(rp_req, stream=True)
    except httpx.HTTPError as e:
        # Report the failure with a gateway error status instead of a 200.
        return JSONResponse(
            status_code=502,
            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
        )

    return StreamingResponse(
        _stream_and_close(rp_resp),
        status_code=rp_resp.status_code,
        headers=dict(rp_resp.headers),
    )


def run_llama_server():
    """Launch the llama-cpp-python server and block until it exits.

    Reads ``MODEL_FILE`` (required; resolved relative to the current working
    directory) and ``N_CTX`` (default ``"4096"``) from the environment.

    Raises:
        RuntimeError: If ``MODEL_FILE`` is unset or empty.
    """
    model_file = os.environ.get("MODEL_FILE")
    if not model_file:
        # Without this check the server would be started with "./None".
        raise RuntimeError(
            "MODEL_FILE environment variable must be set to the model file name"
        )
    n_ctx = os.environ.get("N_CTX", "4096")

    print(f"--- Launching Pure Llama Engine on port {BACKEND_PORT} ---")

    # Using the standard CLI which we KNOW works for the API
    cmd = [
        "python3", "-m", "llama_cpp.server",
        "--model", f"./{model_file}",
        "--port", str(BACKEND_PORT),
        "--host", BACKEND_HOST,
        "--n_ctx", n_ctx,
    ]
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"--- Llama Engine exited with code {result.returncode} ---")


if __name__ == "__main__":
    # Start the llama server in a background thread
    threading.Thread(target=run_llama_server, daemon=True).start()

    # Wait a few seconds for the engine to warm up
    print("Waiting for engine to stabilize...")
    time.sleep(10)

    # Run the proxy on the port Hugging Face expects (7860)
    port = int(os.environ.get("PORT", 7860))
    print(f"--- Launching CORS Proxy on port {port} ---")
    uvicorn.run(app, host="0.0.0.0", port=port)