Spaces:

zenotaiz
/

llama-3.2-api-backend

Sleeping

App Files Files Community

llama-3.2-api-backend / proxy_server.py

zenotaiz

Fixed: Transitioned to native python model downloader for stability

a89ffcb verified about 1 month ago

raw

history blame contribute delete

2.48 kB

	import uvicorn
	from fastapi import FastAPI, Request
	from fastapi.responses import StreamingResponse
	from fastapi.middleware.cors import CORSMiddleware
	import httpx
	import os
	import subprocess
	import time
	import threading

	# Loopback address of the llama engine that this proxy fronts.
	BACKEND_URL = "http://127.0.0.1:8001"

	# One shared async client for every proxied call; timeout=None means httpx
	# applies no request timeout.
	client = httpx.AsyncClient(timeout=None, base_url=BACKEND_URL)

	app = FastAPI()

	# CORS options for the local developer dashboard: every origin, method and
	# header is accepted, and credentialed requests are allowed.
	# NOTE(review): FastAPI's CORS documentation advises against combining
	# wildcard origins with allow_credentials=True -- confirm the dashboard
	# really needs credentials before tightening this.
	_CORS_OPTIONS = {
	    "allow_origins": ["*"],
	    "allow_credentials": True,
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

	@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
	async def proxy(request: Request, path: str):
	    """Relay an incoming request to the llama backend and stream back the reply.

	    The request's method, query parameters, headers (minus ``Host``) and body
	    are forwarded to ``BACKEND_URL``. The backend's status code, headers and
	    raw body bytes are passed back to the caller.

	    Args:
	        request: The incoming FastAPI request.
	        path: Everything after the leading slash of the requested URL.

	    Returns:
	        A ``StreamingResponse`` mirroring the backend's reply, or a 502
	        ``JSONResponse`` with an ``error`` message if the backend could not
	        be reached.
	    """
	    # Local import: the file-level imports only bring in StreamingResponse.
	    from fastapi.responses import JSONResponse

	    url = f"{BACKEND_URL}/{path}"

	    # Forward the request
	    content = await request.body()
	    headers = dict(request.headers)
	    # Remove host header to avoid proxy detection issues
	    headers.pop("host", None)

	    try:
	        # We use a streaming request to handle potentially large responses
	        # although llama-cpp-python server has its own streaming logic
	        rp_req = client.build_request(
	            request.method,
	            url,
	            params=request.query_params,
	            headers=headers,
	            content=content,
	        )
	        rp_resp = await client.send(rp_req, stream=True)
	    except Exception as e:
	        # Request-boundary handler: report the failure as a gateway error
	        # instead of a 200 response that merely contains an error message.
	        return JSONResponse(
	            status_code=502,
	            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
	        )

	    async def _relay_body():
	        # A streamed httpx response must be closed explicitly; doing it in
	        # ``finally`` releases the backend connection even if the client
	        # disconnects midway through the stream.
	        try:
	            async for chunk in rp_resp.aiter_raw():
	                yield chunk
	        finally:
	            await rp_resp.aclose()

	    return StreamingResponse(
	        _relay_body(),
	        status_code=rp_resp.status_code,
	        headers=dict(rp_resp.headers),
	    )

	def run_llama_server():
	    """Launch the ``llama_cpp.server`` CLI on 127.0.0.1:8001 and wait for it.

	    Configuration comes from the environment:

	    * ``MODEL_FILE`` -- path of the model to load (required). A relative
	      name is resolved against the current directory; an absolute path is
	      passed through unchanged.
	    * ``N_CTX`` -- context size handed to ``--n_ctx`` (default ``"4096"``).

	    The call blocks until the server process exits. If ``MODEL_FILE`` is not
	    set, nothing is launched and a message is printed instead.

	    Returns:
	        None.
	    """
	    model_file = os.environ.get("MODEL_FILE")
	    if not model_file:
	        # Without this guard the engine would be started with "./None".
	        print("--- MODEL_FILE is not set; cannot launch Llama Engine ---")
	        return

	    n_ctx = os.environ.get("N_CTX", "4096")
	    print("--- Launching Pure Llama Engine on port 8001 ---")
	    # Using the standard CLI which we KNOW works for the API
	    cmd = [
	        "python3", "-m", "llama_cpp.server",
	        # os.path.join keeps "./name" for relative names and leaves an
	        # absolute MODEL_FILE intact.
	        "--model", os.path.join(".", model_file),
	        "--port", "8001",
	        "--host", "127.0.0.1",
	        "--n_ctx", n_ctx,
	    ]
	    completed = subprocess.run(cmd)
	    if completed.returncode != 0:
	        # Surface engine crashes in the logs rather than ignoring them.
	        print(f"--- Llama Engine exited with code {completed.returncode} ---")

	if __name__ == "__main__":
	    # The llama engine lives on a daemon thread, so it runs in the background
	    # and does not keep the interpreter alive on shutdown.
	    engine_thread = threading.Thread(target=run_llama_server, daemon=True)
	    engine_thread.start()

	    # Fixed 10-second grace period before the proxy starts accepting traffic.
	    print("Waiting for engine to stabilize...")
	    time.sleep(10)

	    # Serve the proxy on PORT, falling back to 7860 (the port Hugging Face
	    # expects) when the variable is unset.
	    listen_port = int(os.environ.get("PORT", 7860))
	    print(f"--- Launching CORS Proxy on port {listen_port} ---")
	    uvicorn.run(app, port=listen_port, host="0.0.0.0")