Spaces:
Sleeping
Sleeping
Manus Agent
Optymalizacja dla CPU: usunięcie parallel, OpenAI v1 oraz dodanie limitów kontekstu (num_ctx: 2048)
import os
import secrets

import httpx
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
# FastAPI application and the HTTP Bearer auth scheme used by every route.
app = FastAPI()
security = HTTPBearer()

# Both values are expected to be set in HF Space Settings -> Variables;
# the Polish placeholder string means "YOU MUST EDIT THIS!".
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL = os.environ.get("MODEL", "!TU MUSISZ EDYTOWAC!")
OLLAMA_BASE = "http://127.0.0.1:11434"

# --- CPU optimization ---
# Cap the context window (num_ctx) so the model does not have to re-process
# an enormous history; 1024-2048 is a safe range for free HF servers.
CONTEXT_LIMIT = 2048

# Fail fast at startup when either placeholder was left unedited.
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")
def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the presented Bearer token against the configured API_KEY.

    Returns the token string on success so routes can depend on it.

    Raises:
        HTTPException: 401 when the token does not match.
    """
    # secrets.compare_digest is constant-time, so a remote caller cannot
    # recover the key byte-by-byte from response-latency differences
    # (a plain `!=` short-circuits at the first mismatching character).
    if not secrets.compare_digest(credentials.credentials, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials
# NOTE(review): the route decorator was missing in the recovered source even
# though the handler proxies POST /api/chat; restored here — confirm the path.
@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
    """Proxy a chat request to the local Ollama server, streaming NDJSON back."""
    body = await request.json()
    # Force streaming and pin the served model regardless of the client's choice.
    body["stream"] = True
    body["model"] = MODEL
    # Cap the model's "memory" unless the caller set num_ctx explicitly;
    # num_ctx=2048 drastically speeds up follow-up responses on CPU.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU-only inference can take minutes per response.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline aiter_lines() strips (NDJSON framing).
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")
# NOTE(review): decorator missing in the recovered source; the handler proxies
# POST /api/generate, so the registration is restored here — confirm the path.
@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
    """Proxy a completion request to the local Ollama server as an NDJSON stream."""
    body = await request.json()
    # Force streaming and pin the served model regardless of the client's choice.
    body["stream"] = True
    body["model"] = MODEL
    # Cap the context window unless the caller set num_ctx explicitly
    # (same CPU optimization as the /api/chat route).
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU-only inference can take minutes per response.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline aiter_lines() strips (NDJSON framing).
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")
# NOTE(review): decorator missing in the recovered source; /api/tags is the
# Ollama model-listing endpoint this response shape mimics — confirm the path.
@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
    """Advertise the single configured model in Ollama's /api/tags format.

    Size and timestamp are static placeholders; only the model name matters
    to clients discovering what this proxy serves.
    """
    return {
        "models": [{
            "name": MODEL,
            "model": MODEL,
            "modified_at": "2024-01-01T00:00:00Z",
            "size": 0,
            "details": {"format": "gguf", "family": "llama"}
        }]
    }
# NOTE(review): decorator missing in the recovered source; registered at
# /health by convention — confirm the original path (e.g. /healthz).
@app.get("/health")
async def health():
    """Report whether the backing Ollama server is reachable.

    Returns status "ok" when Ollama answers /api/version, "starting" otherwise
    (the Space may still be booting the model server); never raises.
    """
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            r = await client.get(f"{OLLAMA_BASE}/api/version")
            ollama_ok = r.status_code == 200
        except Exception:
            # Connection refused/timeout simply means Ollama is not up yet.
            ollama_ok = False
    return {"status": "ok" if ollama_ok else "starting", "model": MODEL}
# NOTE(review): decorator missing in the recovered source; the handler is a
# landing/info endpoint, restored at "/" — confirm the original path.
@app.get("/")
async def root():
    """Landing endpoint confirming the proxy is up and which model it serves."""
    return {"message": "Ollama Proxy is running", "model": MODEL}