File size: 3,948 Bytes
3154e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import os
import secrets
import time
import uuid

import httpx
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials

app = FastAPI()
security = HTTPBearer()

# Sentinel left in the env-var defaults until the operator configures the Space.
_PLACEHOLDER = "!TU MUSISZ EDYTOWAC!"

API_KEY = os.environ.get("API_KEY", _PLACEHOLDER)  # e.g. "moj-tajny-klucz"
MODEL   = os.environ.get("MODEL",   _PLACEHOLDER)  # e.g. "deepseek-r1:14b" or "hf.co/unsloth/GLM-4.7-Flash-GGUF:UD-TQ1_0"
OLLAMA_BASE = "http://127.0.0.1:11434"

# Fail fast at import time if the required configuration was never set.
if API_KEY == _PLACEHOLDER or MODEL == _PLACEHOLDER:
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")


def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the Bearer token against the configured API_KEY.

    Uses ``secrets.compare_digest`` so the comparison runs in constant
    time, avoiding a timing side channel that a plain ``!=`` on secret
    material would expose.

    Raises:
        HTTPException: 401 when the presented token does not match.

    Returns:
        The validated token string (useful for downstream dependencies).
    """
    if not secrets.compare_digest(credentials.credentials, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials


@app.get("/v1/models")
async def list_models(key: str = Depends(verify_key)):
    """Return the single configured model in OpenAI /v1/models list format."""
    model_entry = {
        "id": MODEL,
        "object": "model",
        "created": int(time.time()),
        "owned_by": "ollama",
    }
    return {"object": "list", "data": [model_entry]}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request, key: str = Depends(verify_key)):
    """Proxy an OpenAI-style chat completion request to the local Ollama server.

    Always streams the answer back as SSE (``text/event-stream``) in
    OpenAI ``chat.completion.chunk`` format. Ollama "thinking" output is
    forwarded as the non-standard ``reasoning_content`` delta field.

    Fixes over the previous revision:
    - an upstream (non-200) Ollama error is now surfaced as an SSE error
      payload instead of an empty stream;
    - the first content chunk carries ``"role": "assistant"`` as the
      OpenAI streaming format specifies (some clients require it).
    """
    body = await request.json()

    messages    = body.get("messages", [])
    temperature = body.get("temperature", 0.6)  # default sampling temperature (0.0-2.0)
    top_p       = body.get("top_p", 0.95)       # default nucleus sampling (0.0-1.0)

    options = {"temperature": temperature, "top_p": top_p}
    if "max_tokens" in body:
        # OpenAI's max_tokens maps to Ollama's num_predict option.
        options["num_predict"] = body["max_tokens"]

    ollama_payload = {
        "model": MODEL,
        "messages": messages,
        "stream": True,
        "options": options,
    }

    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created       = int(time.time())

    def _chunk(delta, finish_reason):
        # Build one OpenAI-format streaming chunk envelope.
        return {
            "id":      completion_id,
            "object":  "chat.completion.chunk",
            "created": created,
            "model":   MODEL,
            "choices": [{
                "index":         0,
                "delta":         delta,
                "finish_reason": finish_reason,
            }]
        }

    async def generate():
        sent_role = False
        async with httpx.AsyncClient(timeout=300.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=ollama_payload) as resp:
                if resp.status_code != 200:
                    # Surface the upstream failure instead of an empty stream.
                    detail = (await resp.aread()).decode("utf-8", "replace")
                    err = {"error": {"message": detail,
                                     "type": "upstream_error",
                                     "code": resp.status_code}}
                    yield f"data: {json.dumps(err)}\n\n"
                    yield "data: [DONE]\n\n"
                    return

                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    try:
                        chunk = json.loads(line)
                    except Exception:
                        # Skip malformed/partial JSON lines rather than abort the stream.
                        continue

                    msg  = chunk.get("message", {})
                    done = chunk.get("done", False)

                    delta = {}
                    if not done:
                        if not sent_role:
                            # OpenAI streaming spec: first delta announces the role.
                            delta["role"] = "assistant"
                            sent_role = True
                        if msg.get("thinking") is not None:
                            delta["reasoning_content"] = msg["thinking"]
                        if msg.get("content") is not None:
                            delta["content"] = msg["content"]

                    yield f"data: {json.dumps(_chunk(delta, 'stop' if done else None))}\n\n"

                    if done:
                        break

        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")


@app.get("/health")
async def health():
    """Liveness probe: reports whether the local Ollama server responds."""
    ollama_ok = False
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            resp = await client.get(f"{OLLAMA_BASE}/api/version")
        except Exception:
            # Best-effort probe: any transport failure means "not up yet".
            pass
        else:
            ollama_ok = resp.status_code == 200
    return {"status": "ok" if ollama_ok else "starting", "model": MODEL}