Update server.py
server.py CHANGED

Old version (lines removed by this commit are marked with "-"):

@@ -1,6 +1,7 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
 import subprocess
 import uvicorn
 import os
@@ -9,8 +10,7 @@ import json
 app = FastAPI()
 
 MODELS = {
-    "tinyllama": "models/tinyllama.gguf"
-    "qwen": "models/qwen1.5b.gguf"
 }
 
 class ChatRequest(BaseModel):
@@ -23,7 +23,7 @@ class GenerateRequest(BaseModel):
 
 
 # ---------------------------
-#
 # ---------------------------
 
 def log(title, data):
@@ -34,177 +34,153 @@ def log(title, data):
 
 
 # ---------------------------
-#
 # ---------------------------
 
 def build_prompt(messages):
 
-    log("CHAT HISTORY", json.dumps(messages, indent=2))
-
     prompt = ""
 
     for m in messages:
         role = m.get("role", "user")
         content = m.get("content", "")
 
         prompt += f"{role}: {content}\n"
 
     prompt += "assistant:"
 
-    log("
 
     return prompt
 
 
 # ---------------------------
-#
 # ---------------------------
 
-
 
-
 
-
-        log("ERROR", f"Model file missing: {model_path}")
-        return "Model file not found"
 
-
-        "./llama.cpp/build/bin/llama-cli",
-        "-m", model_path,
-        "-p", prompt,
-        "-n", "200",
-        "--no-display-prompt"
-    ]
 
-
 
-
-
-
-
-    )
 
-
-
 
-    output = result.stdout.strip()
 
-
-
 
-
 
-
 
 
 # ---------------------------
-#
 # ---------------------------
 
 @app.get("/")
 def root():
-    log("SERVER STATUS", "Server running")
     return {"status": "running"}
 
 
 # ---------------------------
-#
 # ---------------------------
 
 @app.get("/api/tags")
 def list_models():
 
-    models = []
-
-    for name in MODELS.keys():
-        models.append({
-            "name": name,
-            "model": name
-        })
-
-    log("MODEL LIST REQUEST", models)
-
-    return {"models": models}
-
-
-# ---------------------------
-# Generate endpoint
-# ---------------------------
-
-@app.post("/api/generate")
-def generate(req: GenerateRequest):
-
-    log("GENERATE REQUEST", req.dict())
-
-    if req.model not in MODELS:
-        return {"error": "model not found"}
-
-    model_path = MODELS[req.model]
-
-    response = run_model(model_path, req.prompt)
-
     return {
-        "
-
-
     }
 
 
 # ---------------------------
-#
 # ---------------------------
 
 @app.post("/api/chat")
 def chat(req: ChatRequest):
 
-    log("CHAT REQUEST", req.dict())
-
-    if req.model not in MODELS:
-        return {"error": "model not found"}
-
-    model_path = MODELS[req.model]
-
     prompt = build_prompt(req.messages)
 
-    response =
 
     return {
         "model": req.model,
         "message": {
             "role": "assistant",
-            "content":
         },
         "done": True
     }
 
-os.makedirs("models", exist_ok=True)
-
-MODEL_FILES = {
-    "tinyllama": (
-        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-        "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
-    )
-}
-
-for name, (repo, file) in MODEL_FILES.items():
 
-
 
-
 
-
 
-
-        repo_id=repo,
-        filename=file
-    )
 
-
 
-    print(f"Model ready: {path}")
 
 # ---------------------------
-#
 # ---------------------------
 
 if __name__ == "__main__":

New version of server.py (lines added by this commit are marked with "+"; unchanged regions are collapsed):

 from fastapi import FastAPI
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
+import requests
 import subprocess
 import uvicorn
 import os
 
 app = FastAPI()
 
 MODELS = {
+    "tinyllama": "models/tinyllama.gguf"
 }
 
 class ChatRequest(BaseModel):
 
 
 # ---------------------------
+# logging
 # ---------------------------
 
 def log(title, data):
 
 
 # ---------------------------
+# prompt builder
 # ---------------------------
 
 def build_prompt(messages):
 
     prompt = ""
 
     for m in messages:
         role = m.get("role", "user")
         content = m.get("content", "")
 
+        if content.strip() == "":
+            continue
+
         prompt += f"{role}: {content}\n"
 
     prompt += "assistant:"
 
+    log("PROMPT", prompt)
 
     return prompt
 
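For reference, a worked example of what the new build_prompt returns; the message list below is hypothetical, made up only to illustrate the new empty-content check:

    messages = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": ""},   # dropped by the empty-content check
        {"role": "user", "content": "What is 2+2?"}
    ]

    # build_prompt(messages) == "user: Hello\nuser: What is 2+2?\nassistant:"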
 
 # ---------------------------
+# download model
 # ---------------------------
 
+os.makedirs("models", exist_ok=True)
+
+MODEL_FILES = {
+    "tinyllama": (
+        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+        "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+    )
+}
 
+for name, (repo, file) in MODEL_FILES.items():
 
+    path = f"models/{name}.gguf"
 
+    if not os.path.exists(path):
 
+        print(f"Downloading model {name}")
 
+        downloaded = hf_hub_download(
+            repo_id=repo,
+            filename=file
+        )
 
+        os.system(f"cp {downloaded} {path}")
+
+    print(f"Model ready: {path}")
 
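A side note on the copy step above: os.system(f"cp {downloaded} {path}") works on the Linux image a Space runs on, but the same copy can be done without spawning a shell; a minimal sketch of that variant, reusing the downloaded and path variables from the loop:

    import shutil

    # Same effect as the cp call, with no shell involved.
    shutil.copyfile(downloaded, path)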
 
+# ---------------------------
+# start llama-server
+# ---------------------------
 
+print("Starting llama-server...")
 
+subprocess.Popen([
+    "./llama.cpp/build/bin/llama-server",
+    "-m", "models/tinyllama.gguf",
+    "--host", "0.0.0.0",
+    "--port", "8080",
+    "-c", "2048"
+])
 
 
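subprocess.Popen returns immediately, so the FastAPI routes can start answering before llama-server has finished loading the GGUF file, and an early /api/chat call may then fail. One way to guard against that is to poll the server before handling traffic; a minimal sketch, assuming the llama-server build exposes its usual /health route (the helper name below is made up):

    import time
    import requests

    def wait_for_llama(url="http://localhost:8080/health", timeout=120):
        # Poll llama-server until it answers, or give up after `timeout` seconds.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                if requests.get(url, timeout=2).status_code == 200:
                    return True
            except requests.exceptions.RequestException:
                pass
            time.sleep(1)
        return False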
 # ---------------------------
+# root
 # ---------------------------
 
 @app.get("/")
 def root():
     return {"status": "running"}
 
 
 # ---------------------------
+# model list
 # ---------------------------
 
 @app.get("/api/tags")
 def list_models():
 
     return {
+        "models": [
+            {"name": "tinyllama"}
+        ]
     }
 
 
 # ---------------------------
+# chat endpoint
 # ---------------------------
 
 @app.post("/api/chat")
 def chat(req: ChatRequest):
 
     prompt = build_prompt(req.messages)
 
+    response = requests.post(
+        "http://localhost:8080/completion",
+        json={
+            "prompt": prompt,
+            "n_predict": 200
+        }
+    )
+
+    data = response.json()
 
     return {
         "model": req.model,
         "message": {
             "role": "assistant",
+            "content": data["content"]
         },
         "done": True
     }
 
 
+# ---------------------------
+# generate endpoint
+# ---------------------------
 
+@app.post("/api/generate")
+def generate(req: GenerateRequest):
 
+    response = requests.post(
+        "http://localhost:8080/completion",
+        json={
+            "prompt": req.prompt,
+            "n_predict": 200
+        }
+    )
 
+    data = response.json()
 
+    return {
+        "model": req.model,
+        "response": data["content"],
+        "done": True
+    }
 
 
 # ---------------------------
+# start API
 # ---------------------------
 
 if __name__ == "__main__":
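For reference, the new endpoints follow an Ollama-style shape; a minimal client sketch against the running Space. The uvicorn.run(...) line is not shown in this diff, so the port below (7860, the usual Spaces default) is an assumption:

    import requests

    BASE = "http://localhost:7860"  # assumed port; adjust to the actual uvicorn.run(...) call

    # Chat-style request
    r = requests.post(f"{BASE}/api/chat", json={
        "model": "tinyllama",
        "messages": [{"role": "user", "content": "Hello"}]
    })
    print(r.json()["message"]["content"])

    # Single-prompt request
    r = requests.post(f"{BASE}/api/generate", json={
        "model": "tinyllama",
        "prompt": "Write one sentence about GPUs."
    })
    print(r.json()["response"])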