# llama-3.2-api-backend / proxy_server.py
# zenotaiz's picture
# Fixed: Transitioned to native Python model downloader for stability
# a89ffcb verified
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import httpx
import os
import subprocess
import time
import threading
app = FastAPI()

# CORS is wide open (any origin, method and header, credentials allowed) so
# that the local developer dashboard can call this proxy from the browser.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)

# Address of the llama engine that every request is relayed to.
BACKEND_URL = "http://127.0.0.1:8001"

# One shared async client for all proxied requests; timeout=None turns off
# httpx's timeouts on this client.
client = httpx.AsyncClient(timeout=None, base_url=BACKEND_URL)
# Hop-by-hop headers describe a single connection and must not be relayed by
# an intermediary (RFC 9110, section 7.6.1).
_HOP_BY_HOP_HEADERS = frozenset({
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailer",
    "transfer-encoding",
    "upgrade",
})


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
async def proxy(request: Request, path: str):
    """Relay an incoming request to the backend and stream its response back.

    Args:
        request: The incoming request; its method, query string, headers and
            body are forwarded.
        path: The request path (without the leading slash) to call on the
            backend.

    Returns:
        A ``StreamingResponse`` carrying the backend's status code, headers
        and raw body, or a 502 JSON response with an ``error`` key when the
        request to the backend fails.
    """
    # Imported locally because only the failure path needs it.
    from fastapi.responses import JSONResponse

    url = f"{BACKEND_URL}/{path}"
    content = await request.body()

    # Forward end-to-end headers only; also drop Host so it is regenerated
    # for the backend address instead of echoing the public hostname.
    headers = {
        name: value
        for name, value in request.headers.items()
        if name.lower() not in _HOP_BY_HOP_HEADERS
    }
    headers.pop("host", None)

    try:
        rp_req = client.build_request(
            request.method,
            url,
            params=request.query_params,
            headers=headers,
            content=content,
        )
        # stream=True so large or incremental (token-by-token) responses are
        # relayed as they arrive instead of being buffered in memory.
        rp_resp = await client.send(rp_req, stream=True)
    except Exception as e:
        # Request boundary: report the failure as a gateway error instead of
        # a 200 response, keeping the original JSON error body.
        return JSONResponse(
            status_code=502,
            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
        )

    async def relay_body():
        """Yield the raw backend body, always releasing the connection."""
        try:
            async for chunk in rp_resp.aiter_raw():
                yield chunk
        finally:
            # Runs on normal completion and when the client disconnects
            # mid-stream, so the upstream connection is not leaked.
            await rp_resp.aclose()

    response_headers = {
        name: value
        for name, value in rp_resp.headers.items()
        if name.lower() not in _HOP_BY_HOP_HEADERS
    }
    return StreamingResponse(
        relay_body(),
        status_code=rp_resp.status_code,
        headers=response_headers,
    )
def run_llama_server():
    """Launch the llama-cpp-python API server and block until it exits.

    Configuration is read from the environment:

    * ``MODEL_FILE`` -- model file name, resolved relative to the current
      working directory (required).
    * ``N_CTX`` -- context size passed to ``--n_ctx`` (default ``"4096"``).

    Raises:
        RuntimeError: If ``MODEL_FILE`` is unset or empty.
    """
    model_file = os.environ.get("MODEL_FILE")
    if not model_file:
        # Without this check the server would be started with "./None".
        raise RuntimeError(
            "MODEL_FILE environment variable is not set; "
            "cannot launch the llama server"
        )
    n_ctx = os.environ.get("N_CTX", "4096")

    print("--- Launching Pure Llama Engine on port 8001 ---")
    # Argument list (no shell) invoking the llama_cpp.server module CLI.
    cmd = [
        "python3", "-m", "llama_cpp.server",
        "--model", f"./{model_file}",
        "--port", "8001",
        "--host", "127.0.0.1",
        "--n_ctx", n_ctx,
    ]
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        # Surface engine failures instead of letting them pass silently.
        print(f"--- Llama Engine exited with code {completed.returncode} ---")
if __name__ == "__main__":
    # The llama engine runs in a daemon thread so it does not keep the
    # process alive once the proxy stops.
    engine_thread = threading.Thread(target=run_llama_server, daemon=True)
    engine_thread.start()

    # Fixed grace period to let the engine come up before serving traffic.
    print("Waiting for engine to stabilize...")
    time.sleep(10)

    # PORT overrides the listening port; 7860 is the default the original
    # author notes Hugging Face expects.
    listen_port = int(os.getenv("PORT", 7860))
    print(f"--- Launching CORS Proxy on port {listen_port} ---")
    uvicorn.run(app, host="0.0.0.0", port=listen_port)