File size: 3,441 Bytes
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
f2f6faa
 
 
5fde9c6
 
 
 
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
 
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
f2f6faa
 
 
 
 
 
 
 
 
 
 
 
5fde9c6
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import httpx

app = FastAPI()
security = HTTPBearer()

# Required configuration, read from environment variables.
# The Polish placeholder "!TU MUSISZ EDYTOWAC!" means "YOU MUST EDIT THIS!"
# and is used below to detect unconfigured deployments.
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL   = os.environ.get("MODEL",   "!TU MUSISZ EDYTOWAC!")
# Address of the local Ollama server this app proxies to.
OLLAMA_BASE = "http://127.0.0.1:11434"

# --- CPU optimization ---
# Cap the context window (num_ctx) so the model does not have to re-process
# a huge history on every request. 1024-2048 is a safe range for free
# (CPU-only) Hugging Face Spaces.
CONTEXT_LIMIT = 2048 

# Fail fast at startup if either required variable was left at its placeholder.
# (Error message is in Polish: "Set the API_KEY and MODEL variables in
# HF Space Settings -> Variables".)
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")

def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the request's Bearer token against the configured API_KEY.

    Returns the token on success; raises HTTPException(401) on mismatch.
    """
    token = credentials.credentials
    if token == API_KEY:
        return token
    raise HTTPException(status_code=401, detail="Invalid API key")

@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
    """Proxy a chat request to the local Ollama server as streaming NDJSON.

    The client body is forwarded with three adjustments: streaming is forced
    on, the model is pinned to the configured MODEL, and a context-size cap
    is injected when the caller did not provide one.

    Raises HTTPException(401) via the verify_key dependency on a bad token.
    """
    body = await request.json()

    # Force streaming and pin the model regardless of what the client sent.
    body["stream"] = True
    body["model"] = MODEL

    # num_ctx=CONTEXT_LIMIT caps the model's "memory", which drastically
    # speeds up follow-up responses on CPU-only hosts. Chained setdefault
    # replaces the manual "if key not in dict" checks.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU inference on free-tier hardware is slow.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline stripped by aiter_lines so the
                    # client receives well-formed NDJSON.
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")

@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
    """Proxy a completion request to Ollama's /api/generate as streaming NDJSON.

    Mirrors ollama_chat: forces streaming, pins the configured MODEL, and
    injects the CPU-friendly num_ctx default when the caller omitted it.

    Raises HTTPException(401) via the verify_key dependency on a bad token.
    """
    body = await request.json()

    # Force streaming and pin the model regardless of what the client sent.
    body["stream"] = True
    body["model"] = MODEL

    # Chained setdefault replaces the manual "if key not in dict" checks;
    # same context cap as the chat endpoint for consistency.
    body.setdefault("options", {}).setdefault("num_ctx", CONTEXT_LIMIT)

    async def generate():
        # Generous timeout: CPU inference on free-tier hardware is slow.
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    # Re-append the newline stripped by aiter_lines.
                    yield line + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")

@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
    """Emulate Ollama's /api/tags listing with the single configured model.

    Returns a static entry (fixed timestamp, zero size) so clients that
    enumerate models see exactly one: the pinned MODEL.
    """
    model_entry = {
        "name": MODEL,
        "model": MODEL,
        "modified_at": "2024-01-01T00:00:00Z",
        "size": 0,
        "details": {"format": "gguf", "family": "llama"},
    }
    return {"models": [model_entry]}

@app.get("/health")
async def health():
    """Report proxy health by probing the local Ollama /api/version endpoint.

    Never raises for backend failures: an unreachable Ollama yields
    status "starting" instead of an error response.
    """
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            resp = await client.get(f"{OLLAMA_BASE}/api/version")
        except Exception:
            backend_up = False
        else:
            backend_up = resp.status_code == 200
    status = "ok" if backend_up else "starting"
    return {"status": status, "model": MODEL}

@app.get("/")
async def root():
    """Landing endpoint confirming the proxy is alive and which model it serves."""
    payload = {"message": "Ollama Proxy is running", "model": MODEL}
    return payload