mrmadblack committed on
Commit
e76849b
·
verified ·
1 Parent(s): 069a96e

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +83 -110
server.py CHANGED
@@ -9,6 +9,7 @@ import os
9
  import json
10
  import time
11
  import hashlib
 
12
 
13
  app = FastAPI()
14
 
@@ -26,12 +27,13 @@ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
26
 
27
 
28
  # ---------------------------
29
- # REQUEST SCHEMAS
30
  # ---------------------------
31
 
32
  class ChatRequest(BaseModel):
33
  model: str
34
  messages: list
 
35
 
36
 
37
  class GenerateRequest(BaseModel):
@@ -75,7 +77,7 @@ os.makedirs("models", exist_ok=True)
75
 
76
  if not os.path.exists(MODEL_PATH):
77
 
78
- print("Downloading model from HuggingFace")
79
 
80
  downloaded = hf_hub_download(
81
  repo_id=MODEL_REPO,
@@ -91,15 +93,29 @@ if not os.path.exists(MODEL_PATH):
91
  # START LLAMA SERVER
92
  # ---------------------------
93
 
94
- print("Starting llama-server...")
95
 
96
- subprocess.Popen([
97
- LLAMA_SERVER,
98
- "-m", MODEL_PATH,
99
- "--host", "0.0.0.0",
100
- "--port", "8080",
101
- "-c", "2048"
102
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  # ---------------------------
@@ -112,120 +128,65 @@ def root():
112
 
113
 
114
  # ---------------------------
115
- # OLLAMA MODEL LIST
116
  # ---------------------------
117
 
118
- @app.post("/api/chat")
119
- def chat(req: ChatRequest):
120
-
121
- prompt = build_prompt(req.messages)
122
-
123
- stream = getattr(req, "stream", False)
124
-
125
- r = requests.post(
126
- "http://localhost:8080/completion",
127
- json={
128
- "prompt": prompt,
129
- "stream": stream,
130
- "n_predict": 512
131
- },
132
- stream=stream
133
- )
134
 
135
- # NON STREAM MODE (normal JSON)
136
- if not stream:
137
 
138
- data = r.json()
139
- text = data.get("content", "")
140
 
141
- return JSONResponse({
142
- "model": req.model,
143
- "message": {
144
- "role": "assistant",
145
- "content": text
146
- },
147
- "done": True
148
- })
149
-
150
- # STREAM MODE (NDJSON like Ollama)
151
- def stream_generator():
152
-
153
- for line in r.iter_lines():
154
-
155
- if not line:
156
- continue
157
-
158
- line = line.decode("utf-8").strip()
159
-
160
- if line.startswith("data:"):
161
- line = line[5:].strip()
162
-
163
- try:
164
- data = json.loads(line)
165
- except:
166
- continue
167
-
168
- token = data.get("content", "")
169
-
170
- yield json.dumps({
171
- "model": req.model,
172
- "message": {
173
- "role": "assistant",
174
- "content": token
175
- },
176
- "done": False
177
- }) + "\n"
178
-
179
- yield json.dumps({
180
- "model": req.model,
181
- "done": True
182
- }) + "\n"
183
 
184
- return StreamingResponse(
185
- stream_generator(),
186
- media_type="application/x-ndjson"
187
- )
188
 
189
  # ---------------------------
190
- # GENERATE (NON STREAM)
191
  # ---------------------------
192
 
193
  @app.post("/api/generate")
194
  def generate(req: GenerateRequest):
195
 
196
- start = time.time()
197
-
198
- response = requests.post(
199
  "http://localhost:8080/completion",
200
  json={
201
  "prompt": req.prompt,
202
- "n_predict": 200
203
  }
204
  )
205
 
206
- data = response.json()
207
 
208
  text = data.get("content", "").strip()
209
 
210
- duration = int((time.time() - start) * 1e9)
211
-
212
  return {
213
  "model": req.model,
214
- "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
215
  "response": text,
216
- "done": True,
217
- "done_reason": "stop",
218
- "total_duration": duration,
219
- "load_duration": 0,
220
- "prompt_eval_count": len(req.prompt.split()),
221
- "prompt_eval_duration": 0,
222
- "eval_count": len(text.split()),
223
- "eval_duration": duration
224
  }
225
 
226
 
227
  # ---------------------------
228
- # CHAT STREAM (OLLAMA STYLE)
229
  # ---------------------------
230
 
231
  @app.post("/api/chat")
@@ -237,16 +198,30 @@ def chat(req: ChatRequest):
237
  "http://localhost:8080/completion",
238
  json={
239
  "prompt": prompt,
240
- "stream": True,
241
- "n_predict": 1024,
242
  "temperature": 0.7,
243
  "top_p": 0.9,
244
  "stop": ["User:", "</s>"]
245
  },
246
- stream=True
247
  )
248
 
249
- def stream():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  for line in r.iter_lines():
252
 
@@ -255,30 +230,24 @@ def chat(req: ChatRequest):
255
 
256
  line = line.decode("utf-8").strip()
257
 
258
- if not line:
259
- continue
260
-
261
  if line.startswith("data:"):
262
  line = line[5:].strip()
263
-
264
  try:
265
  data = json.loads(line)
266
- except Exception:
267
  continue
268
 
269
  token = data.get("content", "")
270
 
271
- chunk = {
272
  "model": req.model,
273
- "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
274
  "message": {
275
  "role": "assistant",
276
  "content": token
277
  },
278
  "done": False
279
- }
280
-
281
- yield json.dumps(chunk) + "\n"
282
 
283
  yield json.dumps({
284
  "model": req.model,
@@ -286,7 +255,11 @@ def chat(req: ChatRequest):
286
  "done_reason": "stop"
287
  }) + "\n"
288
 
289
- return StreamingResponse(stream(), media_type="application/x-ndjson")
 
 
 
 
290
 
291
 
292
  # ---------------------------
 
9
  import json
10
  import time
11
  import hashlib
12
+ import threading
13
 
14
  app = FastAPI()
15
 
 
27
 
28
 
29
  # ---------------------------
30
+ # REQUEST MODELS
31
  # ---------------------------
32
 
33
class ChatRequest(BaseModel):
    # Request body for the Ollama-style /api/chat endpoint.
    model: str           # model name; echoed back verbatim in responses
    messages: list       # chat history; presumably [{"role": ..., "content": ...}] — confirm against build_prompt
    stream: bool = True  # True: NDJSON streamed chunks (Ollama default); False: single JSON reply
37
 
38
 
39
  class GenerateRequest(BaseModel):
 
77
 
78
  if not os.path.exists(MODEL_PATH):
79
 
80
+ print("Downloading model from HuggingFace...")
81
 
82
  downloaded = hf_hub_download(
83
  repo_id=MODEL_REPO,
 
93
  # START LLAMA SERVER
94
  # ---------------------------
95
 
96
def start_llama():
    """Launch llama-server as a background process and wait for readiness.

    Spawns the llama.cpp server binary on port 8080, then polls its
    /health endpoint for up to ~30 seconds. Logs success or failure;
    never raises (it runs in a daemon thread at import time).
    """
    print("Starting llama-server...")

    subprocess.Popen([
        LLAMA_SERVER,
        "-m", MODEL_PATH,
        "--host", "0.0.0.0",
        "--port", "8080",
        "-c", "2048"
    ])

    # Poll the health endpoint; a connection error just means the server
    # is not up yet, so wait a second and retry (bounded at ~30 attempts).
    for _ in range(30):
        try:
            r = requests.get("http://localhost:8080/health", timeout=2)
            if r.ok:
                print("llama-server ready")
                return
        except requests.RequestException:
            # Server not accepting connections yet — keep waiting.
            pass
        time.sleep(1)

    # Surface the failure instead of giving up silently.
    print("llama-server did not become ready within 30s")


# Start in a daemon thread so app import/startup is not blocked by the wait.
threading.Thread(target=start_llama, daemon=True).start()
119
 
120
 
121
  # ---------------------------
 
128
 
129
 
130
  # ---------------------------
131
+ # MODEL LIST (Ollama style)
132
  # ---------------------------
133
 
134
# Cache for the model file's SHA-256 so we hash the (potentially multi-GB)
# file once, not on every /api/tags request.
_model_digest = None


@app.get("/api/tags")
def tags():
    """Ollama-compatible model listing (GET /api/tags).

    Returns a single-entry model list describing the local GGUF file.
    The SHA-256 digest is computed incrementally in 1 MiB chunks
    (instead of reading the whole file into memory) and cached after
    the first request.
    """
    global _model_digest

    size = os.path.getsize(MODEL_PATH)

    if _model_digest is None:
        h = hashlib.sha256()
        with open(MODEL_PATH, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                h.update(chunk)
        _model_digest = h.hexdigest()

    return {
        "models": [
            {
                "name": MODEL_NAME,
                "model": MODEL_NAME,
                # Use gmtime: the trailing "Z" asserts UTC, so the
                # timestamp must actually be UTC, not local time.
                "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "size": size,
                "digest": _model_digest,
                "details": {
                    # NOTE(review): metadata below is hard-coded — confirm
                    # it matches the actual model file being served.
                    "format": "gguf",
                    "family": "llama",
                    "families": ["llama"],
                    "parameter_size": "1.1B",
                    "quantization_level": "Q4_K_M"
                }
            }
        ]
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
 
 
 
 
161
 
162
  # ---------------------------
163
+ # GENERATE (non-stream)
164
  # ---------------------------
165
 
166
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Ollama-compatible non-streaming generate endpoint (POST /api/generate).

    Forwards the prompt to the local llama-server completion API and
    returns the generated text in Ollama's /api/generate response shape.

    Raises requests.HTTPError (-> FastAPI 500) if the backend returns an
    error status, instead of failing obscurely while decoding the body.
    """
    r = requests.post(
        "http://localhost:8080/completion",
        json={
            "prompt": req.prompt,
            "n_predict": 256
        },
        # Generation can be slow, but never hang a worker forever.
        timeout=300,
    )
    # Fail loudly on a backend error rather than mis-parsing its body.
    r.raise_for_status()

    data = r.json()
    text = data.get("content", "").strip()

    return {
        "model": req.model,
        "response": text,
        "done": True
    }
186
 
187
 
188
  # ---------------------------
189
+ # CHAT (Ollama streaming)
190
  # ---------------------------
191
 
192
  @app.post("/api/chat")
 
198
  "http://localhost:8080/completion",
199
  json={
200
  "prompt": prompt,
201
+ "stream": req.stream,
202
+ "n_predict": 256,
203
  "temperature": 0.7,
204
  "top_p": 0.9,
205
  "stop": ["User:", "</s>"]
206
  },
207
+ stream=req.stream
208
  )
209
 
210
+ if not req.stream:
211
+
212
+ data = r.json()
213
+ text = data.get("content", "")
214
+
215
+ return JSONResponse({
216
+ "model": req.model,
217
+ "message": {
218
+ "role": "assistant",
219
+ "content": text
220
+ },
221
+ "done": True
222
+ })
223
+
224
+ def stream_generator():
225
 
226
  for line in r.iter_lines():
227
 
 
230
 
231
  line = line.decode("utf-8").strip()
232
 
 
 
 
233
  if line.startswith("data:"):
234
  line = line[5:].strip()
235
+
236
  try:
237
  data = json.loads(line)
238
+ except:
239
  continue
240
 
241
  token = data.get("content", "")
242
 
243
+ yield json.dumps({
244
  "model": req.model,
 
245
  "message": {
246
  "role": "assistant",
247
  "content": token
248
  },
249
  "done": False
250
+ }) + "\n"
 
 
251
 
252
  yield json.dumps({
253
  "model": req.model,
 
255
  "done_reason": "stop"
256
  }) + "\n"
257
 
258
+ return StreamingResponse(
259
+ stream_generator(),
260
+ media_type="application/x-ndjson",
261
+ headers={"Cache-Control": "no-cache"}
262
+ )
263
 
264
 
265
  # ---------------------------