mrmadblack commited on
Commit
3c98a54
·
verified ·
1 Parent(s): 583a3d1

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +118 -83
server.py CHANGED
@@ -1,102 +1,69 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from huggingface_hub import hf_hub_download
4
- import requests
5
  import subprocess
 
6
  import uvicorn
7
  import os
8
- import json
 
9
 
10
  app = FastAPI()
11
 
12
- MODELS = {
13
- "tinyllama": "models/tinyllama.gguf"
14
- }
15
-
16
- class ChatRequest(BaseModel):
17
- model: str
18
- messages: list
19
-
20
- class GenerateRequest(BaseModel):
21
- model: str
22
- prompt: str
23
-
24
-
25
  # ---------------------------
26
- # logging
27
  # ---------------------------
28
 
29
- def log(title, data):
30
- print("\n==============================")
31
- print(title)
32
- print(data)
33
- print("==============================\n")
34
 
35
 
36
  # ---------------------------
37
- # prompt builder
38
  # ---------------------------
39
 
40
- def build_prompt(messages):
41
-
42
- prompt = ""
43
-
44
- for m in messages:
45
- role = m.get("role", "user")
46
- content = m.get("content", "")
47
-
48
- if content.strip() == "":
49
- continue
50
-
51
- prompt += f"{role}: {content}\n"
52
-
53
- prompt += "assistant:"
54
 
55
- log("PROMPT", prompt)
56
 
57
- return prompt
 
 
58
 
59
 
60
  # ---------------------------
61
- # download model
62
  # ---------------------------
63
 
64
  os.makedirs("models", exist_ok=True)
65
 
66
- MODEL_FILES = {
67
- "tinyllama": (
68
- "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
69
- "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
70
- )
71
- }
72
-
73
- for name, (repo, file) in MODEL_FILES.items():
74
-
75
- path = f"models/{name}.gguf"
76
 
77
- if not os.path.exists(path):
78
 
79
- print(f"Downloading model {name}")
80
-
81
- downloaded = hf_hub_download(
82
- repo_id=repo,
83
- filename=file
84
- )
85
 
86
- os.system(f"cp {downloaded} {path}")
87
 
88
- print(f"Model ready: {path}")
89
 
90
 
91
  # ---------------------------
92
- # start llama-server
93
  # ---------------------------
94
 
95
  print("Starting llama-server...")
96
 
97
  subprocess.Popen([
98
  "./llama.cpp/build/bin/llama-server",
99
- "-m", "models/tinyllama.gguf",
100
  "--host", "0.0.0.0",
101
  "--port", "8080",
102
  "-c", "2048"
@@ -104,7 +71,30 @@ subprocess.Popen([
104
 
105
 
106
  # ---------------------------
107
- # root
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # ---------------------------
109
 
110
  @app.get("/")
@@ -113,74 +103,119 @@ def root():
113
 
114
 
115
  # ---------------------------
116
- # model list
117
  # ---------------------------
118
 
119
  @app.get("/api/tags")
120
- def list_models():
 
 
 
 
121
 
122
  return {
123
  "models": [
124
- {"name": "tinyllama"}
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  ]
126
  }
127
 
128
 
129
  # ---------------------------
130
- # chat endpoint
131
  # ---------------------------
132
 
133
- @app.post("/api/chat")
134
- def chat(req: ChatRequest):
135
 
136
- prompt = build_prompt(req.messages)
137
 
138
  response = requests.post(
139
  "http://localhost:8080/completion",
140
  json={
141
- "prompt": prompt,
142
  "n_predict": 200
143
  }
144
  )
145
 
146
- data = response.json()
 
 
147
 
148
  return {
149
  "model": req.model,
150
- "message": {
151
- "role": "assistant",
152
- "content": data["content"]
153
- },
154
- "done": True
 
 
 
 
 
155
  }
156
 
157
 
158
  # ---------------------------
159
- # generate endpoint
160
  # ---------------------------
161
 
162
- @app.post("/api/generate")
163
- def generate(req: GenerateRequest):
 
 
 
 
164
 
165
  response = requests.post(
166
  "http://localhost:8080/completion",
167
  json={
168
- "prompt": req.prompt,
169
  "n_predict": 200
170
  }
171
  )
172
 
173
- data = response.json()
 
 
174
 
175
  return {
176
  "model": req.model,
177
- "response": data["content"],
178
- "done": True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  }
180
 
181
 
182
  # ---------------------------
183
- # start API
184
  # ---------------------------
185
 
186
  if __name__ == "__main__":
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from huggingface_hub import hf_hub_download
 
4
  import subprocess
5
+ import requests
6
  import uvicorn
7
  import os
8
+ import time
9
+ import hashlib
10
 
11
  app = FastAPI()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
# ---------------------------
# MODEL CONFIG
# ---------------------------

# Name exposed through the Ollama-style API and local path of the weights.
MODEL_NAME = "tinyllama"
MODEL_PATH = "models/tinyllama.gguf"

# HuggingFace Hub repo and file the weights are fetched from.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
22
 
23
 
24
# ---------------------------
# REQUEST MODELS
# ---------------------------

class ChatRequest(BaseModel):
    """Body of POST /api/chat: target model name plus an Ollama-style
    list of message dicts ({"role": ..., "content": ...})."""
    model: str
    messages: list
 
 
 
 
 
 
 
 
 
 
 
31
 
 
32
 
33
class GenerateRequest(BaseModel):
    """Body of POST /api/generate: target model name and a raw prompt string."""
    model: str
    prompt: str
36
 
37
 
38
# ---------------------------
# DOWNLOAD MODEL
# ---------------------------

import shutil  # for a checked, shell-free copy out of the HF cache

os.makedirs("models", exist_ok=True)

# Fetch the weights once; subsequent container starts reuse the local copy.
if not os.path.exists(MODEL_PATH):

    print("Downloading model from HuggingFace")

    # hf_hub_download returns the path of the file inside the HF cache.
    downloaded = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE
    )

    # shutil.copy instead of os.system("cp ..."): no shell string to break on
    # spaces, portable, and it raises on failure instead of silently
    # returning a nonzero exit code that nothing checked.
    shutil.copy(downloaded, MODEL_PATH)

    print("Model downloaded")
56
 
57
 
58
  # ---------------------------
59
+ # START LLAMA SERVER
60
  # ---------------------------
61
 
62
  print("Starting llama-server...")
63
 
64
  subprocess.Popen([
65
  "./llama.cpp/build/bin/llama-server",
66
+ "-m", MODEL_PATH,
67
  "--host", "0.0.0.0",
68
  "--port", "8080",
69
  "-c", "2048"
 
71
 
72
 
73
# ---------------------------
# PROMPT BUILDER
# ---------------------------

def build_prompt(messages):
    """Flatten an Ollama-style message list into a plain-text prompt.

    Each message dict is rendered as ``role: content`` on its own line;
    messages with empty or missing content are skipped. The prompt ends
    with ``assistant:`` so the model continues as the assistant.

    Args:
        messages: list of dicts with optional "role" and "content" keys.

    Returns:
        The assembled prompt string.
    """
    prompt = ""

    for m in messages:
        # Default a missing role to "user" (previously m.get("role") with no
        # default rendered "None: ..." lines for role-less messages).
        role = m.get("role") or "user"
        # Tolerate content=None as well as a missing key.
        content = (m.get("content") or "").strip()

        if not content:
            continue

        prompt += f"{role}: {content}\n"

    prompt += "assistant:"
    return prompt
94
+
95
+
96
+ # ---------------------------
97
+ # ROOT
98
  # ---------------------------
99
 
100
  @app.get("/")
 
103
 
104
 
105
# ---------------------------
# MODEL LIST (OLLAMA FORMAT)
# ---------------------------

# Cache of {path: (size, sha256-hex)} so the GGUF file (hundreds of MB)
# is hashed once, not on every /api/tags request.
_MODEL_STAT_CACHE = {}


def _model_stat(path):
    """Return (size_bytes, sha256_hexdigest) for *path*, hashing lazily.

    Streams the file in 1 MiB chunks instead of reading it whole into
    memory, and closes the handle via the context manager (the previous
    ``open(path, "rb").read()`` loaded the entire file per request and
    leaked the file object).
    """
    if path not in _MODEL_STAT_CACHE:
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                h.update(chunk)
        _MODEL_STAT_CACHE[path] = (os.path.getsize(path), h.hexdigest())
    return _MODEL_STAT_CACHE[path]


@app.get("/api/tags")
def tags():
    """Ollama-compatible model listing for the single bundled model."""
    size, digest = _model_stat(MODEL_PATH)

    return {
        "models": [
            {
                "name": MODEL_NAME,
                "model": MODEL_NAME,
                # time.gmtime(): the trailing "Z" claims UTC, so format UTC
                # (bare strftime used local time).
                "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "size": size,
                "digest": digest,
                "details": {
                    "format": "gguf",
                    "family": "llama",
                    "families": ["llama"],
                    "parameter_size": "1.1B",
                    "quantization_level": "Q4_K_M"
                }
            }
        ]
    }
134
 
135
 
136
# ---------------------------
# GENERATE ENDPOINT
# ---------------------------

@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Ollama-compatible /api/generate: one-shot completion of a raw prompt.

    Proxies the prompt to the local llama-server /completion endpoint and
    wraps the result in Ollama's non-streaming response shape.
    """
    start = time.time()

    response = requests.post(
        "http://localhost:8080/completion",
        json={
            "prompt": req.prompt,
            "n_predict": 200
        },
        # Bound the wait so a wedged llama-server can't hang the API forever.
        timeout=300
    )
    # Surface HTTP failures explicitly instead of a confusing KeyError
    # from response.json()["content"] below.
    response.raise_for_status()

    text = response.json()["content"].strip()

    # Ollama reports durations in nanoseconds.
    duration = int((time.time() - start) * 1e9)

    return {
        "model": req.model,
        # time.gmtime(): the trailing "Z" claims UTC, so format UTC.
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "response": text,
        "done": True,
        "done_reason": "stop",
        "total_duration": duration,
        "load_duration": 0,
        "prompt_eval_count": 0,
        "prompt_eval_duration": 0,
        # Crude token estimate: whitespace-separated word count.
        "eval_count": len(text.split()),
        "eval_duration": duration
    }
170
 
171
 
172
# ---------------------------
# CHAT ENDPOINT
# ---------------------------

@app.post("/api/chat")
def chat(req: ChatRequest):
    """Ollama-compatible /api/chat: flatten messages and complete.

    Builds a plain-text prompt from the message list, proxies it to the
    local llama-server /completion endpoint, and wraps the result in
    Ollama's non-streaming chat response shape.
    """
    start = time.time()

    prompt = build_prompt(req.messages)

    response = requests.post(
        "http://localhost:8080/completion",
        json={
            "prompt": prompt,
            "n_predict": 200
        },
        # Bound the wait so a wedged llama-server can't hang the API forever.
        timeout=300
    )
    # Surface HTTP failures explicitly instead of a confusing KeyError
    # from response.json()["content"] below.
    response.raise_for_status()

    text = response.json()["content"].strip()

    # Ollama reports durations in nanoseconds.
    duration = int((time.time() - start) * 1e9)

    return {
        "model": req.model,
        # time.gmtime(): the trailing "Z" claims UTC, so format UTC.
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "message": {
            "role": "assistant",
            "content": text,
            "thinking": "",
            "tool_calls": [],
            "images": []
        },
        "done": True,
        "done_reason": "stop",
        "total_duration": duration,
        "load_duration": 0,
        "prompt_eval_count": 0,
        "prompt_eval_duration": 0,
        # Crude token estimate: whitespace-separated word count.
        "eval_count": len(text.split()),
        "eval_duration": duration,
        "logprobs": []
    }
215
 
216
 
217
  # ---------------------------
218
+ # START API
219
  # ---------------------------
220
 
221
  if __name__ == "__main__":