"""Authenticated FastAPI proxy in front of a local Ollama server.

Streams ``/api/chat`` and ``/api/generate`` responses as NDJSON, forcing a
fixed model and a capped context window so the proxy stays responsive on
CPU-only hosts (e.g. free Hugging Face Spaces).
"""

import os
import secrets

import httpx
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer

app = FastAPI()
security = HTTPBearer()

API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL = os.environ.get("MODEL", "!TU MUSISZ EDYTOWAC!")
OLLAMA_BASE = "http://127.0.0.1:11434"

# --- CPU optimization ---
# Cap the context window (num_ctx) so the model does not have to reprocess a
# huge history on every request; 1024-2048 is a safe range for free HF servers.
CONTEXT_LIMIT = 2048

if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")


def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
    """Validate the Bearer token against ``API_KEY``.

    Raises:
        HTTPException: 401 when the presented token does not match.
    """
    # compare_digest: constant-time comparison, avoids leaking key
    # length/prefix information via response timing.
    if not secrets.compare_digest(credentials.credentials, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials


def _prepare_body(body: dict) -> dict:
    """Force streaming, the configured model, and the capped context size."""
    body["stream"] = True
    body["model"] = MODEL
    # num_ctx caps the model's "memory", which drastically speeds up
    # follow-up responses on CPU; only set it if the caller did not.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)
    return body


def _stream_from_ollama(path: str, body: dict) -> StreamingResponse:
    """POST *body* to Ollama at *path* and relay its NDJSON line stream."""

    async def generate():
        # Long timeout: CPU inference on large prompts can be very slow.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}{path}", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")


@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
    """Stream a chat completion from the backing Ollama server (NDJSON)."""
    body = _prepare_body(await request.json())
    return _stream_from_ollama("/api/chat", body)


@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
    """Stream a raw completion from the backing Ollama server (NDJSON)."""
    body = _prepare_body(await request.json())
    return _stream_from_ollama("/api/generate", body)


@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
    """Advertise the single configured model in Ollama's ``/api/tags`` shape."""
    return {
        "models": [
            {
                "name": MODEL,
                "model": MODEL,
                "modified_at": "2024-01-01T00:00:00Z",
                "size": 0,
                "details": {"format": "gguf", "family": "llama"},
            }
        ]
    }


@app.get("/health")
async def health():
    """Report whether the backing Ollama server answers ``/api/version``."""
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            r = await client.get(f"{OLLAMA_BASE}/api/version")
            ollama_ok = r.status_code == 200
        except Exception:
            # Ollama may still be booting; report "starting" instead of failing.
            ollama_ok = False
    return {"status": "ok" if ollama_ok else "starting", "model": MODEL}


@app.get("/")
async def root():
    """Liveness endpoint."""
    return {"message": "Ollama Proxy is running", "model": MODEL}