Spaces:
Sleeping
Sleeping
File size: 2,483 Bytes
a89ffcb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import httpx
import os
import subprocess
import time
import threading
# ASGI application that sits in front of the llama engine.
app = FastAPI()

# Address of the backend engine; run_llama_server() binds it to this
# host and port.
BACKEND_URL = "http://127.0.0.1:8001"

# One shared async client for every proxied call. timeout=None turns off
# httpx's timeouts (presumably because generations can run for a long time).
client = httpx.AsyncClient(timeout=None, base_url=BACKEND_URL)

# Enable CORS for the local developer dashboard: every origin, method and
# header is accepted, and credentialed requests are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
async def proxy(request: Request, path: str):
    """Forward an incoming request to the backend and stream the reply back.

    Args:
        request: The incoming request. Its method, query parameters, body and
            headers (minus ``host``) are forwarded to the backend.
        path: The path captured by the catch-all route; it is appended to
            ``BACKEND_URL``.

    Returns:
        A ``StreamingResponse`` carrying the backend's status code, headers
        and raw body, or a ``502`` JSON response of the form
        ``{"error": "..."}`` when the request to the backend fails.
    """
    # Imported locally so this handler does not depend on an edit to the
    # file-level import block.
    from fastapi.responses import JSONResponse

    url = f"{BACKEND_URL}/{path}"

    # Forward the request body and headers as received.
    content = await request.body()
    headers = dict(request.headers)
    # Remove host header to avoid proxy detection issues
    headers.pop("host", None)

    try:
        # We use a streaming request to handle potentially large responses
        # although llama-cpp-python server has its own streaming logic
        rp_req = client.build_request(
            request.method,
            url,
            params=request.query_params,
            headers=headers,
            content=content,
        )
        rp_resp = await client.send(rp_req, stream=True)
    except Exception as e:
        # Report the failure as a gateway error instead of a 200 response so
        # callers can tell the backend was unreachable.
        return JSONResponse(
            status_code=502,
            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
        )

    async def relay_body():
        """Yield the backend's raw bytes, then release the connection."""
        try:
            async for chunk in rp_resp.aiter_raw():
                yield chunk
        finally:
            # A response opened with stream=True must be closed explicitly,
            # otherwise its connection is never returned to the pool.
            await rp_resp.aclose()

    return StreamingResponse(
        relay_body(),
        status_code=rp_resp.status_code,
        headers=dict(rp_resp.headers),
    )
def run_llama_server():
    """Launch ``llama_cpp.server`` as a child process and wait for it to exit.

    Reads two environment variables:
        MODEL_FILE: model file name, passed as ``./<MODEL_FILE>`` (required).
        N_CTX: value for ``--n_ctx``; defaults to ``"4096"``.

    The server is bound to 127.0.0.1:8001. This call blocks until the child
    process terminates.

    Raises:
        RuntimeError: If MODEL_FILE is unset or empty. Without this check the
            server would be started with the bogus path ``./None``.
    """
    model_file = os.environ.get("MODEL_FILE")
    if not model_file:
        raise RuntimeError(
            "MODEL_FILE environment variable is not set; "
            "cannot launch llama_cpp.server"
        )
    n_ctx = os.environ.get("N_CTX", "4096")

    print("--- Launching Pure Llama Engine on port 8001 ---")
    # Using the standard CLI which we KNOW works for the API
    cmd = [
        "python3", "-m", "llama_cpp.server",
        "--model", f"./{model_file}",
        "--port", "8001",
        "--host", "127.0.0.1",
        "--n_ctx", n_ctx,
    ]
    completed = subprocess.run(cmd, check=False)
    if completed.returncode != 0:
        # Surface an engine crash instead of letting it pass silently.
        print(f"--- Llama engine exited with code {completed.returncode} ---")
if __name__ == "__main__":
    # The llama engine runs in a daemon thread, so it does not keep the
    # process alive once the proxy stops.
    engine_thread = threading.Thread(target=run_llama_server, daemon=True)
    engine_thread.start()

    # Fixed pause to give the engine time to come up before serving.
    print("Waiting for engine to stabilize...")
    time.sleep(10)

    # Serve the proxy on $PORT, falling back to 7860 (the port Hugging Face
    # expects).
    port = int(os.environ.get("PORT", 7860))
    print(f"--- Launching CORS Proxy on port {port} ---")
    uvicorn.run(app, host="0.0.0.0", port=port)
|