cometapii committed
Commit 2fb3ee9 · verified · 1 Parent(s): 3f2ba14

Upload 3 files

Files changed (3)
  1. Dockerfile +34 -0
  2. app.py +171 -0
  3. entrypoint.sh +5 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ FROM ubuntu:22.04
+
+ RUN apt-get update && apt-get install -y \
+     python3 \
+     python3-pip \
+     build-essential \
+     cmake \
+     curl \
+     ca-certificates \
+     && rm -rf /var/lib/apt/lists/*
+
+ # llama-cpp-python compiles llama.cpp from C++, a long build (~8 min)
+ RUN pip3 install --no-cache-dir \
+     llama-cpp-python \
+     huggingface-hub \
+     fastapi \
+     uvicorn[standard]
+
+ RUN useradd -m -u 1000 user
+ USER user
+
+ ENV HOME=/home/user \
+     PATH="/home/user/.local/bin:$PATH" \
+     HF_HOME=/home/user/.cache/huggingface
+
+ WORKDIR /home/user/app
+
+ COPY --chown=user app.py .
+ COPY --chown=user entrypoint.sh .
+ RUN chmod +x entrypoint.sh
+
+ EXPOSE 7860
+
+ CMD ["./entrypoint.sh"]
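
For a local smoke test of this image, a minimal build-and-run sketch (the image tag glm-flash-api is illustrative; on Hugging Face Spaces the platform builds and starts the container itself):

    # The llama-cpp-python compile step dominates the build time (~8 min)
    docker build -t glm-flash-api .

    # Map the container's port 7860, the default Spaces port
    docker run --rm -p 7860:7860 glm-flash-api
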
app.py ADDED
@@ -0,0 +1,171 @@
+ from fastapi import FastAPI, Request, HTTPException, Depends
+ from fastapi.responses import StreamingResponse
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+ from llama_cpp import Llama
+ import json
+ import time
+ import uuid
+
+ app = FastAPI()
+ security = HTTPBearer()
+
+ API_KEY = "connectkey"  # hardcoded demo key; use a real secret in production
+ MODEL_ID = "glm-4.7-flash"
+
+ # IQ1_S = 9.25 GB, a single file that fits on CPU Upgrade (16 GB RAM)
+ print("==> Loading GLM-4.7-Flash IQ1_S (9.25 GB) from HF...")
+ llm = Llama.from_pretrained(
+     repo_id="unsloth/GLM-4.7-Flash-GGUF",
+     filename="GLM-4.7-Flash-IQ1_S.gguf",
+     n_ctx=8192,
+     n_threads=4,
+     n_batch=512,
+     verbose=False,
+ )
+ print("==> Model loaded!")
+
+
+ def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
+     if credentials.credentials != API_KEY:
+         raise HTTPException(status_code=401, detail="Invalid API key")
+     return credentials.credentials
+
+
+ @app.get("/v1/models")
+ async def list_models(key: str = Depends(verify_key)):
+     return {
+         "object": "list",
+         "data": [{
+             "id": MODEL_ID,
+             "object": "model",
+             "created": int(time.time()),
+             "owned_by": "unsloth",
+         }]
+     }
+
+
+ @app.post("/v1/chat/completions")
+ async def chat_completions(request: Request, key: str = Depends(verify_key)):
+     body = await request.json()
+
+     messages = body.get("messages", [])
+     stream = body.get("stream", False)
+     max_tokens = body.get("max_tokens", 1024)
+     temperature = body.get("temperature", 1.0)
+     top_p = body.get("top_p", 0.95)
+     min_p = body.get("min_p", 0.01)
+     stop = body.get("stop", None)
+
+     completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+     created = int(time.time())
+
+     if stream:
+         def generate():
+             for chunk in llm.create_chat_completion(
+                 messages=messages,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 min_p=min_p,
+                 stop=stop,
+                 stream=True,
+             ):
+                 delta = chunk["choices"][0].get("delta", {})
+                 finish_reason = chunk["choices"][0].get("finish_reason")
+                 data = {
+                     "id": completion_id,
+                     "object": "chat.completion.chunk",
+                     "created": created,
+                     "model": MODEL_ID,
+                     "choices": [{
+                         "index": 0,
+                         "delta": delta,
+                         "finish_reason": finish_reason,
+                     }]
+                 }
+                 yield f"data: {json.dumps(data)}\n\n"
+             yield "data: [DONE]\n\n"
+
+         return StreamingResponse(generate(), media_type="text/event-stream")
+
+     else:
+         result = llm.create_chat_completion(
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             min_p=min_p,
+             stop=stop,
+             stream=False,
+         )
+         return {
+             "id": completion_id,
+             "object": "chat.completion",
+             "created": created,
+             "model": MODEL_ID,
+             "choices": result["choices"],
+             "usage": result.get("usage", {}),
+         }
+
+
+ @app.post("/v1/completions")
+ async def completions(request: Request, key: str = Depends(verify_key)):
+     body = await request.json()
+
+     prompt = body.get("prompt", "")
+     stream = body.get("stream", False)
+     max_tokens = body.get("max_tokens", 512)
+     temperature = body.get("temperature", 1.0)
+     top_p = body.get("top_p", 0.95)
+     min_p = body.get("min_p", 0.01)
+     stop = body.get("stop", None)
+
+     completion_id = f"cmpl-{uuid.uuid4().hex}"
+     created = int(time.time())
+
+     if stream:
+         def generate():
+             for chunk in llm.create_completion(
+                 prompt=prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 min_p=min_p,
+                 stop=stop,
+                 stream=True,
+             ):
+                 data = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": created,
+                     "model": MODEL_ID,
+                     "choices": chunk["choices"],
+                 }
+                 yield f"data: {json.dumps(data)}\n\n"
+             yield "data: [DONE]\n\n"
+
+         return StreamingResponse(generate(), media_type="text/event-stream")
+
+     else:
+         result = llm.create_completion(
+             prompt=prompt,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             min_p=min_p,
+             stop=stop,
+             stream=False,
+         )
+         return {
+             "id": completion_id,
+             "object": "text_completion",
+             "created": created,
+             "model": MODEL_ID,
+             "choices": result["choices"],
+             "usage": result.get("usage", {}),
+         }
+
+
+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "model": MODEL_ID}
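
With the server running, both generation endpoints speak the OpenAI-style wire format. A usage sketch with curl, assuming a local run on port 7860 and the hardcoded key from app.py:

    # Non-streaming chat completion
    curl -s http://localhost:7860/v1/chat/completions \
      -H "Authorization: Bearer connectkey" \
      -H "Content-Type: application/json" \
      -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'

    # Streaming variant: set "stream": true and read the SSE "data:" lines (-N disables buffering)
    curl -sN http://localhost:7860/v1/chat/completions \
      -H "Authorization: Bearer connectkey" \
      -H "Content-Type: application/json" \
      -d '{"messages": [{"role": "user", "content": "Hello"}], "stream": true}'

The /v1/completions endpoint works the same way, taking a "prompt" string instead of "messages".
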
entrypoint.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+ set -e
+
+ echo "==> Starting GLM-4.7-Flash GGUF API..."
+ exec uvicorn app:app --host 0.0.0.0 --port 7860 --timeout-keep-alive 300
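
Once "Model loaded!" appears in the container logs, a quick liveness check (note that /health is the only unauthenticated route):

    curl -s http://localhost:7860/health
    # expected: {"status":"ok","model":"glm-4.7-flash"}

    curl -s http://localhost:7860/v1/models -H "Authorization: Bearer connectkey"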