# ollamaapi-lfm / proxy.py
# Author: Manus Agent
# CPU optimization: removed parallel requests and OpenAI v1 routes, added
# context limits (num_ctx: 2048). Commit: 5fde9c6
import os
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import httpx
app = FastAPI()
security = HTTPBearer()
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL = os.environ.get("MODEL", "!TU MUSISZ EDYTOWAC!")
OLLAMA_BASE = "http://127.0.0.1:11434"
# --- Optymalizacja dla CPU ---
# Ograniczamy kontekst (num_ctx), aby model nie musia艂 przelicza膰 ogromnej historii.
# 1024-2048 to bezpieczny zakres dla darmowych serwer贸w HF.
CONTEXT_LIMIT = 2048
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")
def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
if credentials.credentials != API_KEY:
raise HTTPException(status_code=401, detail="Invalid API key")
return credentials.credentials
@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
body = await request.json()
# Wymuszamy parametry optymalizacyjne
body["stream"] = True
body["model"] = MODEL
# Dodajemy opcje optymalizacji kontekstu, je艣li nie zosta艂y podane
if "options" not in body:
body["options"] = {}
# num_ctx: 2048 ogranicza "pami臋膰" modelu, co drastycznie przyspiesza kolejne odpowiedzi na CPU
if "num_ctx" not in body["options"]:
body["options"]["num_ctx"] = CONTEXT_LIMIT
async def generate():
async with httpx.AsyncClient(timeout=600.0) as client:
async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
async for line in resp.aiter_lines():
if not line: continue
yield line + "\n"
return StreamingResponse(generate(), media_type="application/x-ndjson")
@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
body = await request.json()
body["stream"] = True
body["model"] = MODEL
if "options" not in body:
body["options"] = {}
if "num_ctx" not in body["options"]:
body["options"]["num_ctx"] = CONTEXT_LIMIT
async def generate():
async with httpx.AsyncClient(timeout=600.0) as client:
async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
async for line in resp.aiter_lines():
if not line: continue
yield line + "\n"
return StreamingResponse(generate(), media_type="application/x-ndjson")
@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
return {
"models": [{
"name": MODEL,
"model": MODEL,
"modified_at": "2024-01-01T00:00:00Z",
"size": 0,
"details": {"format": "gguf", "family": "llama"}
}]
}
@app.get("/health")
async def health():
async with httpx.AsyncClient(timeout=5.0) as client:
try:
r = await client.get(f"{OLLAMA_BASE}/api/version")
ollama_ok = r.status_code == 200
except Exception:
ollama_ok = False
return {"status": "ok" if ollama_ok else "starting", "model": MODEL}
@app.get("/")
async def root():
return {"message": "Ollama Proxy is running", "model": MODEL}