mrmadblack committed on
Commit
583a3d1
·
verified ·
1 Parent(s): 57efecf

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +74 -98
server.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from huggingface_hub import hf_hub_download
 
4
  import subprocess
5
  import uvicorn
6
  import os
@@ -9,8 +10,7 @@ import json
9
  app = FastAPI()
10
 
11
  MODELS = {
12
- "tinyllama": "models/tinyllama.gguf",
13
- "qwen": "models/qwen1.5b.gguf"
14
  }
15
 
16
  class ChatRequest(BaseModel):
@@ -23,7 +23,7 @@ class GenerateRequest(BaseModel):
23
 
24
 
25
  # ---------------------------
26
- # Utility Logging
27
  # ---------------------------
28
 
29
  def log(title, data):
@@ -34,177 +34,153 @@ def log(title, data):
34
 
35
 
36
  # ---------------------------
37
- # Prompt builder
38
  # ---------------------------
39
 
40
  def build_prompt(messages):
41
 
42
- log("CHAT HISTORY", json.dumps(messages, indent=2))
43
-
44
  prompt = ""
45
 
46
  for m in messages:
47
  role = m.get("role", "user")
48
  content = m.get("content", "")
49
 
 
 
 
50
  prompt += f"{role}: {content}\n"
51
 
52
  prompt += "assistant:"
53
 
54
- log("FINAL PROMPT", prompt)
55
 
56
  return prompt
57
 
58
 
59
  # ---------------------------
60
- # Run llama.cpp
61
  # ---------------------------
62
 
63
- def run_model(model_path, prompt):
 
 
 
 
 
 
 
64
 
65
- log("MODEL PATH", model_path)
66
 
67
- if not os.path.exists(model_path):
68
- log("ERROR", f"Model file missing: {model_path}")
69
- return "Model file not found"
70
 
71
- command = [
72
- "./llama.cpp/build/bin/llama-cli",
73
- "-m", model_path,
74
- "-p", prompt,
75
- "-n", "200",
76
- "--no-display-prompt"
77
- ]
78
 
79
- log("EXEC COMMAND", command)
80
 
81
- result = subprocess.run(
82
- command,
83
- capture_output=True,
84
- text=True
85
- )
86
 
87
- log("LLAMA STDOUT", result.stdout)
88
- log("LLAMA STDERR", result.stderr)
 
89
 
90
- output = result.stdout.strip()
91
 
92
- if "assistant:" in output:
93
- output = output.split("assistant:")[-1].strip()
 
94
 
95
- log("FINAL OUTPUT", output)
96
 
97
- return output
 
 
 
 
 
 
98
 
99
 
100
  # ---------------------------
101
- # Root endpoint
102
  # ---------------------------
103
 
104
  @app.get("/")
105
  def root():
106
- log("SERVER STATUS", "Server running")
107
  return {"status": "running"}
108
 
109
 
110
  # ---------------------------
111
- # Model list (Ollama compatible)
112
  # ---------------------------
113
 
114
  @app.get("/api/tags")
115
  def list_models():
116
 
117
- models = []
118
-
119
- for name in MODELS.keys():
120
- models.append({
121
- "name": name,
122
- "model": name
123
- })
124
-
125
- log("MODEL LIST REQUEST", models)
126
-
127
- return {"models": models}
128
-
129
-
130
- # ---------------------------
131
- # Generate endpoint
132
- # ---------------------------
133
-
134
- @app.post("/api/generate")
135
- def generate(req: GenerateRequest):
136
-
137
- log("GENERATE REQUEST", req.dict())
138
-
139
- if req.model not in MODELS:
140
- return {"error": "model not found"}
141
-
142
- model_path = MODELS[req.model]
143
-
144
- response = run_model(model_path, req.prompt)
145
-
146
  return {
147
- "model": req.model,
148
- "response": response,
149
- "done": True
150
  }
151
 
152
 
153
  # ---------------------------
154
- # Chat endpoint
155
  # ---------------------------
156
 
157
  @app.post("/api/chat")
158
  def chat(req: ChatRequest):
159
 
160
- log("CHAT REQUEST", req.dict())
161
-
162
- if req.model not in MODELS:
163
- return {"error": "model not found"}
164
-
165
- model_path = MODELS[req.model]
166
-
167
  prompt = build_prompt(req.messages)
168
 
169
- response = run_model(model_path, prompt)
 
 
 
 
 
 
 
 
170
 
171
  return {
172
  "model": req.model,
173
  "message": {
174
  "role": "assistant",
175
- "content": response
176
  },
177
  "done": True
178
  }
179
 
180
- os.makedirs("models", exist_ok=True)
181
-
182
- MODEL_FILES = {
183
- "tinyllama": (
184
- "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
185
- "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
186
- )
187
- }
188
-
189
- for name, (repo, file) in MODEL_FILES.items():
190
 
191
- path = f"models/{name}.gguf"
 
 
192
 
193
- if not os.path.exists(path):
 
194
 
195
- print(f"Downloading model {name} from {repo}")
 
 
 
 
 
 
196
 
197
- downloaded = hf_hub_download(
198
- repo_id=repo,
199
- filename=file
200
- )
201
 
202
- os.system(f"cp {downloaded} {path}")
 
 
 
 
203
 
204
- print(f"Model ready: {path}")
205
 
206
  # ---------------------------
207
- # Start server
208
  # ---------------------------
209
 
210
  if __name__ == "__main__":
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from huggingface_hub import hf_hub_download
4
+ import requests
5
  import subprocess
6
  import uvicorn
7
  import os
 
10
  app = FastAPI()
11
 
12
  MODELS = {
13
+ "tinyllama": "models/tinyllama.gguf"
 
14
  }
15
 
16
  class ChatRequest(BaseModel):
 
23
 
24
 
25
  # ---------------------------
26
+ # logging
27
  # ---------------------------
28
 
29
  def log(title, data):
 
34
 
35
 
36
  # ---------------------------
37
+ # prompt builder
38
  # ---------------------------
39
 
40
  def build_prompt(messages):
41
 
 
 
42
  prompt = ""
43
 
44
  for m in messages:
45
  role = m.get("role", "user")
46
  content = m.get("content", "")
47
 
48
+ if content.strip() == "":
49
+ continue
50
+
51
  prompt += f"{role}: {content}\n"
52
 
53
  prompt += "assistant:"
54
 
55
+ log("PROMPT", prompt)
56
 
57
  return prompt
58
 
59
 
60
  # ---------------------------
61
+ # download model
62
  # ---------------------------
63
 
64
+ os.makedirs("models", exist_ok=True)
65
+
66
+ MODEL_FILES = {
67
+ "tinyllama": (
68
+ "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
69
+ "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
70
+ )
71
+ }
72
 
73
+ for name, (repo, file) in MODEL_FILES.items():
74
 
75
+ path = f"models/{name}.gguf"
 
 
76
 
77
+ if not os.path.exists(path):
 
 
 
 
 
 
78
 
79
+ print(f"Downloading model {name}")
80
 
81
+ downloaded = hf_hub_download(
82
+ repo_id=repo,
83
+ filename=file
84
+ )
 
85
 
86
+ os.system(f"cp {downloaded} {path}")
87
+
88
+ print(f"Model ready: {path}")
89
 
 
90
 
91
+ # ---------------------------
92
+ # start llama-server
93
+ # ---------------------------
94
 
95
+ print("Starting llama-server...")
96
 
97
+ subprocess.Popen([
98
+ "./llama.cpp/build/bin/llama-server",
99
+ "-m", "models/tinyllama.gguf",
100
+ "--host", "0.0.0.0",
101
+ "--port", "8080",
102
+ "-c", "2048"
103
+ ])
104
 
105
 
106
  # ---------------------------
107
+ # root
108
  # ---------------------------
109
 
110
  @app.get("/")
111
  def root():
 
112
  return {"status": "running"}
113
 
114
 
115
  # ---------------------------
116
+ # model list
117
  # ---------------------------
118
 
119
  @app.get("/api/tags")
120
  def list_models():
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  return {
123
+ "models": [
124
+ {"name": "tinyllama"}
125
+ ]
126
  }
127
 
128
 
129
  # ---------------------------
130
+ # chat endpoint
131
  # ---------------------------
132
 
133
  @app.post("/api/chat")
134
  def chat(req: ChatRequest):
135
 
 
 
 
 
 
 
 
136
  prompt = build_prompt(req.messages)
137
 
138
+ response = requests.post(
139
+ "http://localhost:8080/completion",
140
+ json={
141
+ "prompt": prompt,
142
+ "n_predict": 200
143
+ }
144
+ )
145
+
146
+ data = response.json()
147
 
148
  return {
149
  "model": req.model,
150
  "message": {
151
  "role": "assistant",
152
+ "content": data["content"]
153
  },
154
  "done": True
155
  }
156
 
 
 
 
 
 
 
 
 
 
 
157
 
158
+ # ---------------------------
159
+ # generate endpoint
160
+ # ---------------------------
161
 
162
+ @app.post("/api/generate")
163
+ def generate(req: GenerateRequest):
164
 
165
+ response = requests.post(
166
+ "http://localhost:8080/completion",
167
+ json={
168
+ "prompt": req.prompt,
169
+ "n_predict": 200
170
+ }
171
+ )
172
 
173
+ data = response.json()
 
 
 
174
 
175
+ return {
176
+ "model": req.model,
177
+ "response": data["content"],
178
+ "done": True
179
+ }
180
 
 
181
 
182
  # ---------------------------
183
+ # start API
184
  # ---------------------------
185
 
186
  if __name__ == "__main__":