Spaces:

zenotaiz
/

llama-3.2-api-backend

Sleeping

File size: 2,483 Bytes

a89ffcb

import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import httpx
import os
import subprocess
import time
import threading

# Public-facing FastAPI application; its single route relays traffic to the
# llama engine.
app = FastAPI()

# Enable CORS for the local developer dashboard.
# NOTE(review): every origin, method and header is allowed while credentials
# are enabled -- confirm this permissive policy is intended beyond local use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Loopback address the proxied requests are sent to.
BACKEND_URL = "http://127.0.0.1:8001"

# One shared async client for all proxied requests. timeout=None disables
# httpx's timeouts, so a slow backend response is never cut off by the client.
client = httpx.AsyncClient(base_url=BACKEND_URL, timeout=None)

# Hop-by-hop headers describe a single transport connection and must not be
# relayed by an intermediary (RFC 9110 section 7.6.1, RFC 2616 section 13.5.1).
_HOP_BY_HOP_HEADERS = frozenset({
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "proxy-connection",
    "te",
    "trailer",
    "transfer-encoding",
    "upgrade",
})


def _end_to_end_headers(headers):
    """Return a plain dict of *headers* with hop-by-hop entries removed."""
    # dict(headers) is kept as the first step so duplicate header names are
    # resolved exactly as they were before this filter existed.
    return {
        name: value
        for name, value in dict(headers).items()
        if name.lower() not in _HOP_BY_HOP_HEADERS
    }


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
async def proxy(request: Request, path: str):
    """Relay an incoming request to the backend and stream its response back.

    Args:
        request: The incoming request; its method, query string, headers and
            body are forwarded.
        path: The request path (without the leading slash), appended to
            ``BACKEND_URL``.

    Returns:
        A ``StreamingResponse`` carrying the backend's status code, end-to-end
        headers and raw body bytes, or a 502 JSON response with an ``error``
        key when the backend request fails.
    """
    # Imported locally so this block does not depend on a change to the
    # file-level import list.
    from fastapi.responses import JSONResponse

    url = f"{BACKEND_URL}/{path}"

    # Forward the request
    content = await request.body()
    headers = _end_to_end_headers(request.headers)
    # Remove host header to avoid proxy detection issues
    headers.pop("host", None)

    rp_req = client.build_request(
        request.method,
        url,
        # multi_items() keeps every value of a repeated query key; the plain
        # mapping view only exposes one value per key.
        params=request.query_params.multi_items(),
        headers=headers,
        content=content,
    )

    try:
        # We use a streaming request to handle potentially large responses
        # although llama-cpp-python server has its own streaming logic
        rp_resp = await client.send(rp_req, stream=True)
    except httpx.RequestError as e:
        # Report the failure as a gateway error rather than a 200 so callers
        # can detect it from the status code.
        return JSONResponse(
            status_code=502,
            content={"error": f"Proxy failed to connect to backend: {str(e)}"},
        )

    async def relay_body():
        """Yield the backend's raw body, always closing the upstream response."""
        try:
            async for chunk in rp_resp.aiter_raw():
                yield chunk
        finally:
            # Also runs when the downstream client disconnects mid-stream, so
            # the upstream connection is returned to the pool.
            await rp_resp.aclose()

    return StreamingResponse(
        relay_body(),
        status_code=rp_resp.status_code,
        headers=_end_to_end_headers(rp_resp.headers),
    )

def run_llama_server():
    """Launch the llama-cpp-python API server and block until it exits.

    Configuration is read from the environment:

    * ``MODEL_FILE`` (required): path to the model file. A relative name is
      passed as ``./<name>``; an absolute path is passed through unchanged.
    * ``N_CTX`` (optional, default ``"4096"``): value for ``--n_ctx``.

    Raises:
        RuntimeError: If ``MODEL_FILE`` is unset or empty.
    """
    model_file = os.environ.get("MODEL_FILE")
    if not model_file:
        # Fail with a clear message instead of launching with "./None".
        raise RuntimeError(
            "MODEL_FILE environment variable is not set; "
            "cannot launch the llama server"
        )
    n_ctx = os.environ.get("N_CTX", "4096")

    # os.path.join keeps the "./<name>" form for relative names and leaves an
    # absolute path intact.
    model_path = os.path.join(".", model_file)

    print("--- Launching Pure Llama Engine on port 8001 ---")
    # Using the standard CLI which we KNOW works for the API
    cmd = [
        "python3", "-m", "llama_cpp.server",
        "--model", model_path,
        "--port", "8001",
        "--host", "127.0.0.1",
        "--n_ctx", n_ctx,
    ]
    completed = subprocess.run(cmd)
    # Report the exit so a crashed engine is visible in the logs.
    print(f"--- Llama Engine exited with code {completed.returncode} ---")

if __name__ == "__main__":
    # The llama engine is launched from a daemon thread, so that thread does
    # not keep the interpreter alive on exit.
    engine_thread = threading.Thread(target=run_llama_server, daemon=True)
    engine_thread.start()

    # Fixed grace period before the proxy starts accepting traffic; this is a
    # plain delay, not a readiness check.
    print("Waiting for engine to stabilize...")
    time.sleep(10)

    # Serve the proxy on $PORT, falling back to 7860 (the port Hugging Face
    # expects).
    listen_port = int(os.getenv("PORT", 7860))
    print(f"--- Launching CORS Proxy on port {listen_port} ---")
    uvicorn.run(app, host="0.0.0.0", port=listen_port)