Update server.py

server.py CHANGED
@@ -9,6 +9,7 @@ import os
 import json
 import time
 import hashlib
+import threading
 
 app = FastAPI()
 
@@ -26,12 +27,13 @@ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
 
 
 # ---------------------------
-# REQUEST
+# REQUEST MODELS
 # ---------------------------
 
 class ChatRequest(BaseModel):
     model: str
     messages: list
+    stream: bool = True
 
 
 class GenerateRequest(BaseModel):
@@ -75,7 +77,7 @@ os.makedirs("models", exist_ok=True)
 
 if not os.path.exists(MODEL_PATH):
 
-    print("Downloading model from HuggingFace")
+    print("Downloading model from HuggingFace...")
 
     downloaded = hf_hub_download(
         repo_id=MODEL_REPO,
@@ -91,15 +93,29 @@ if not os.path.exists(MODEL_PATH):
 # START LLAMA SERVER
 # ---------------------------
 
-
+def start_llama():
 
-
-
-
-
-
-
-
+    print("Starting llama-server...")
+
+    subprocess.Popen([
+        LLAMA_SERVER,
+        "-m", MODEL_PATH,
+        "--host", "0.0.0.0",
+        "--port", "8080",
+        "-c", "2048"
+    ])
+
+    # wait for server to start
+    for _ in range(30):
+        try:
+            requests.get("http://localhost:8080/health")
+            print("llama-server ready")
+            return
+        except:
+            time.sleep(1)
+
+
+threading.Thread(target=start_llama, daemon=True).start()
 
 
 # ---------------------------
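llama-server now starts from a daemon thread, so FastAPI can begin serving while the model loads; readiness is detected by polling llama.cpp's /health route on port 8080, as configured above. The same probe can be run by hand from inside the Space. A minimal sketch in Python (the 30-attempt budget mirrors start_llama(); the request timeout is an assumption):

import requests
import time

# Poll the embedded llama-server until it answers, mirroring the loop in start_llama().
for _ in range(30):
    try:
        if requests.get("http://localhost:8080/health", timeout=1).ok:
            print("llama-server is up")
            break
    except requests.RequestException:
        time.sleep(1)
else:
    print("llama-server did not come up within 30 seconds")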
@@ -112,120 +128,65 @@ def root():
 
 
 # ---------------------------
-#
+# MODEL LIST (Ollama style)
 # ---------------------------
 
-@app.
-def
-
-    prompt = build_prompt(req.messages)
-
-    stream = getattr(req, "stream", False)
-
-    r = requests.post(
-        "http://localhost:8080/completion",
-        json={
-            "prompt": prompt,
-            "stream": stream,
-            "n_predict": 512
-        },
-        stream=stream
-    )
+@app.get("/api/tags")
+def tags():
 
-
-    if not stream:
+    size = os.path.getsize(MODEL_PATH)
 
-
-
+    with open(MODEL_PATH, "rb") as f:
+        digest = hashlib.sha256(f.read()).hexdigest()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if line.startswith("data:"):
-                line = line[5:].strip()
-
-            try:
-                data = json.loads(line)
-            except:
-                continue
-
-            token = data.get("content", "")
-
-            yield json.dumps({
-                "model": req.model,
-                "message": {
-                    "role": "assistant",
-                    "content": token
-                },
-                "done": False
-            }) + "\n"
-
-        yield json.dumps({
-            "model": req.model,
-            "done": True
-        }) + "\n"
+    return {
+        "models": [
+            {
+                "name": MODEL_NAME,
+                "model": MODEL_NAME,
+                "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
+                "size": size,
+                "digest": digest,
+                "details": {
+                    "format": "gguf",
+                    "family": "llama",
+                    "families": ["llama"],
+                    "parameter_size": "1.1B",
+                    "quantization_level": "Q4_K_M"
+                }
+            }
+        ]
+    }
 
-    return StreamingResponse(
-        stream_generator(),
-        media_type="application/x-ndjson"
-    )
 
 # ---------------------------
-# GENERATE (
+# GENERATE (non-stream)
 # ---------------------------
 
 @app.post("/api/generate")
 def generate(req: GenerateRequest):
 
-
-
-    response = requests.post(
+    r = requests.post(
         "http://localhost:8080/completion",
         json={
             "prompt": req.prompt,
-            "n_predict":
+            "n_predict": 256
         }
     )
 
-    data =
+    data = r.json()
 
     text = data.get("content", "").strip()
 
-    duration = int((time.time() - start) * 1e9)
-
     return {
         "model": req.model,
-        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
         "response": text,
-        "done": True
-        "done_reason": "stop",
-        "total_duration": duration,
-        "load_duration": 0,
-        "prompt_eval_count": len(req.prompt.split()),
-        "prompt_eval_duration": 0,
-        "eval_count": len(text.split()),
-        "eval_duration": duration
+        "done": True
     }
 
 
 # ---------------------------
-# CHAT
+# CHAT (Ollama streaming)
 # ---------------------------
 
 @app.post("/api/chat")
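The /api/tags and /api/generate handlers above mirror Ollama's REST surface, so Ollama-style clients can talk to the wrapper directly. A minimal client sketch follows; BASE and the model name are assumptions, since the port the FastAPI app is served on and the value of MODEL_NAME do not appear in this diff:

import requests

BASE = "http://localhost:7860"  # assumed wrapper address; adjust to wherever the app is exposed

# Ollama-style model listing.
print(requests.get(f"{BASE}/api/tags").json())

# One-shot, non-streaming completion.
resp = requests.post(f"{BASE}/api/generate", json={
    "model": "tinyllama",  # illustrative; the server reports its own MODEL_NAME
    "prompt": "Write one sentence about llamas."
}).json()
print(resp["response"], resp["done"])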
@@ -237,16 +198,30 @@ def chat(req: ChatRequest):
         "http://localhost:8080/completion",
         json={
             "prompt": prompt,
-            "stream":
-            "n_predict":
+            "stream": req.stream,
+            "n_predict": 256,
             "temperature": 0.7,
             "top_p": 0.9,
             "stop": ["User:", "</s>"]
         },
-        stream=
+        stream=req.stream
     )
 
-
+    if not req.stream:
+
+        data = r.json()
+        text = data.get("content", "")
+
+        return JSONResponse({
+            "model": req.model,
+            "message": {
+                "role": "assistant",
+                "content": text
+            },
+            "done": True
+        })
+
+    def stream_generator():
 
         for line in r.iter_lines():
 
@@ -255,30 +230,24 @@ def chat(req: ChatRequest):
 
             line = line.decode("utf-8").strip()
 
-            if not line:
-                continue
-
             if line.startswith("data:"):
                 line = line[5:].strip()
-
+
             try:
                 data = json.loads(line)
-            except
+            except:
                 continue
 
             token = data.get("content", "")
 
-
+            yield json.dumps({
                 "model": req.model,
-                "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                 "message": {
                     "role": "assistant",
                     "content": token
                 },
                 "done": False
-            }
-
-            yield json.dumps(chunk) + "\n"
+            }) + "\n"
 
         yield json.dumps({
             "model": req.model,
@@ -286,7 +255,11 @@ def chat(req: ChatRequest):
             "done_reason": "stop"
         }) + "\n"
 
-    return StreamingResponse(
+    return StreamingResponse(
+        stream_generator(),
+        media_type="application/x-ndjson",
+        headers={"Cache-Control": "no-cache"}
+    )
 
 
 # ---------------------------
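/api/chat now follows Ollama's NDJSON streaming protocol: one JSON object per line carrying a partial assistant message, then a final object with "done": true and a "done_reason". A client-side sketch for consuming the stream (BASE and the model name are assumptions, as above):

import json
import requests

BASE = "http://localhost:7860"  # assumed wrapper address

with requests.post(f"{BASE}/api/chat", json={
    "model": "tinyllama",  # illustrative
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True
}, stream=True) as r:
    for line in r.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk.get("done"):
            break
        print(chunk["message"]["content"], end="", flush=True)
print()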