# ollamaapi-lfm / proxy.py
# Author: Manus Agent
# CPU optimization: removed parallel requests and OpenAI v1 routes, added
# context limits (num_ctx: 2048). Commit: 5fde9c6
import os
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import httpx
app = FastAPI()
security = HTTPBearer()
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL = os.environ.get("MODEL", "!TU MUSISZ EDYTOWAC!")
OLLAMA_BASE = "http://127.0.0.1:11434"
# --- Optymalizacja dla CPU ---
# Ograniczamy kontekst (num_ctx), aby model nie musia艂 przelicza膰 ogromnej historii.
# 1024-2048 to bezpieczny zakres dla darmowych serwer贸w HF.
CONTEXT_LIMIT = 2048
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")
def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
if credentials.credentials != API_KEY:
raise HTTPException(status_code=401, detail="Invalid API key")
return credentials.credentials
@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
body = await request.json()
# Wymuszamy parametry optymalizacyjne
body["stream"] = True
body["model"] = MODEL
# Dodajemy opcje optymalizacji kontekstu, je艣li nie zosta艂y podane
if "options" not in body:
body["options"] = {}
# num_ctx: 2048 ogranicza "pami臋膰" modelu, co drastycznie przyspiesza kolejne odpowiedzi na CPU
if "num_ctx" not in body["options"]:
body["options"]["num_ctx"] = CONTEXT_LIMIT
async def generate():
async with httpx.AsyncClient(timeout=600.0) as client:
async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
async for line in resp.aiter_lines():
if not line: continue
yield line + "\n"
return StreamingResponse(generate(), media_type="application/x-ndjson")
@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
body = await request.json()
body["stream"] = True
body["model"] = MODEL
if "options" not in body:
body["options"] = {}
if "num_ctx" not in body["options"]:
body["options"]["num_ctx"] = CONTEXT_LIMIT
async def generate():
async with httpx.AsyncClient(timeout=600.0) as client:
async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
async for line in resp.aiter_lines():
if not line: continue
yield line + "\n"
return StreamingResponse(generate(), media_type="application/x-ndjson")
@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
return {
"models": [{
"name": MODEL,
"model": MODEL,
"modified_at": "2024-01-01T00:00:00Z",
"size": 0,
"details": {"format": "gguf", "family": "llama"}
}]
}
@app.get("/health")
async def health():
async with httpx.AsyncClient(timeout=5.0) as client:
try:
r = await client.get(f"{OLLAMA_BASE}/api/version")
ollama_ok = r.status_code == 200
except Exception:
ollama_ok = False
return {"status": "ok" if ollama_ok else "starting", "model": MODEL}
@app.get("/")
async def root():
return {"message": "Ollama Proxy is running", "model": MODEL}