Spaces:
Sleeping
Sleeping
| import uvicorn | |
| from fastapi import FastAPI, Request | |
| from fastapi.responses import StreamingResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import httpx | |
| import os | |
| import subprocess | |
| import time | |
| import threading | |
app = FastAPI()

# Enable CORS for the local developer dashboard.
# Origins default to "*" so any dashboard can call the proxy; set the
# CORS_ALLOW_ORIGINS environment variable to a comma-separated list of origins
# to restrict access.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentialed requests (see
    # the FastAPI CORS documentation), so credentials are only enabled when
    # explicit origins have been configured.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Address of the llama server that the proxy forwards to.
BACKEND_URL = "http://127.0.0.1:8001"

# Shared async client reused by every proxied request. timeout=None disables
# httpx's timeouts.
# NOTE(review): presumably deliberate so long generations are not cut off — confirm.
client = httpx.AsyncClient(base_url=BACKEND_URL, timeout=None)
# Register the handler as a catch-all route; without a route decorator the
# application exposes no endpoints and the proxy function is never called.
@app.api_route(
    "/{path:path}",
    methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"],
)
async def proxy(request: Request, path: str):
    """Forward an incoming request to the backend and stream the reply back.

    Args:
        request: The incoming request; its method, query string, headers
            (minus ``host``) and body are forwarded.
        path: The request path, appended to ``BACKEND_URL``.

    Returns:
        A ``StreamingResponse`` relaying the backend's status code, headers
        and raw body, or a 502 JSON response with an ``error`` message when
        the backend cannot be reached.
    """
    from fastapi.responses import JSONResponse

    url = f"{BACKEND_URL}/{path}"

    # Forward the request
    content = await request.body()
    headers = dict(request.headers)
    # Remove host header to avoid proxy detection issues
    headers.pop("host", None)

    rp_req = client.build_request(
        request.method,
        url,
        # multi_items() keeps repeated keys (?a=1&a=2); the plain mapping view
        # only exposes one value per key.
        params=request.query_params.multi_items(),
        headers=headers,
        content=content,
    )

    try:
        # We use a streaming request to handle potentially large responses
        # although llama-cpp-python server has its own streaming logic
        rp_resp = await client.send(rp_req, stream=True)
    except httpx.RequestError as e:
        # Report the failure with a gateway error status instead of HTTP 200.
        return JSONResponse(
            status_code=502,
            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
        )

    async def relay_body():
        # Relay the raw (still-encoded) bytes and always release the upstream
        # connection, even when the client disconnects mid-stream.
        try:
            async for chunk in rp_resp.aiter_raw():
                yield chunk
        finally:
            await rp_resp.aclose()

    return StreamingResponse(
        relay_body(),
        status_code=rp_resp.status_code,
        headers=dict(rp_resp.headers),
    )
def run_llama_server():
    """Run ``llama_cpp.server`` on 127.0.0.1:8001 and block until it exits.

    Reads the model file name from the ``MODEL_FILE`` environment variable
    (resolved relative to the current directory) and the context size from
    ``N_CTX`` (default ``"4096"``).

    Raises:
        RuntimeError: If ``MODEL_FILE`` is unset or empty.
    """
    import sys

    model_file = os.environ.get("MODEL_FILE")
    if not model_file:
        # Fail with a clear message instead of launching with "--model ./None".
        raise RuntimeError(
            "MODEL_FILE environment variable is not set; "
            "cannot launch the llama server"
        )
    n_ctx = os.environ.get("N_CTX", "4096")

    print("--- Launching Pure Llama Engine on port 8001 ---")

    # Using the standard CLI which we KNOW works for the API.
    # sys.executable keeps the child on the same interpreter (and therefore
    # the same installed packages) as this process.
    cmd = [
        sys.executable or "python3", "-m", "llama_cpp.server",
        "--model", f"./{model_file}",
        "--port", "8001",
        "--host", "127.0.0.1",
        "--n_ctx", n_ctx,
    ]
    result = subprocess.run(cmd)
    if result.returncode != 0:
        # Surface a failed or crashed engine instead of exiting silently.
        print(f"--- Llama Engine exited with code {result.returncode} ---")
if __name__ == "__main__":
    import socket

    # Start the llama server in a background thread (daemon, so it never
    # blocks interpreter exit).
    engine_thread = threading.Thread(target=run_llama_server, daemon=True)
    engine_thread.start()

    # Poll the engine port instead of sleeping a fixed time: continue as soon
    # as it accepts connections, give up when the engine thread has exited or
    # ENGINE_STARTUP_TIMEOUT seconds (default 120) have passed.
    print("Waiting for engine to stabilize...")
    startup_timeout = float(os.environ.get("ENGINE_STARTUP_TIMEOUT", "120"))
    deadline = time.monotonic() + startup_timeout
    engine_ready = False
    while engine_thread.is_alive() and time.monotonic() < deadline:
        try:
            with socket.create_connection(("127.0.0.1", 8001), timeout=1):
                engine_ready = True
                break
        except OSError:
            time.sleep(0.5)
    if not engine_ready:
        print("Engine is not accepting connections on port 8001; starting proxy anyway.")

    # Run the proxy on the port Hugging Face expects (7860)
    port = int(os.environ.get("PORT", 7860))
    print(f"--- Launching CORS Proxy on port {port} ---")
    uvicorn.run(app, host="0.0.0.0", port=port)