Spaces:

zerovic
/

dolphin-2.6-phi-2-q4

Sleeping

zerovic committed on Mar 31

Commit

38d28a3

verified ·

1 Parent(s): 955a0cc

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,46 +3,28 @@ from llama_cpp import Llama
 app = FastAPI()
-# Load model ONCE
 llm = Llama(
     model_path="/app/models/model.gguf",
     n_ctx=2048,
     n_threads=6
 )
-# Health check
 @app.get("/")
 def root():
     return {"status": "ok"}
-# Main endpoint (FIXED)
 @app.post("/run/predict")
 async def predict(request: Request):
     body = await request.json()
     messages = body.get("messages", [])
-    # Convert messages to a prompt
-    prompt = ""
-    for msg in messages:
-        role = msg.get("role")
-        content = msg.get("content")
-        prompt += f"{role}: {content}\n"
-    prompt += "assistant:"
-    output = llm(
-        prompt,
         max_tokens=body.get("max_tokens", 50),
         temperature=body.get("temperature", 0.7)
     )
-    return {
-        "choices": [
-            {
-                "message": {
-                    "content": output["choices"][0]["text"]
-                }
-            }
-        ]
-    }

 app = FastAPI()
+# Load model once
 llm = Llama(
     model_path="/app/models/model.gguf",
     n_ctx=2048,
     n_threads=6
 )
 @app.get("/")
 def root():
     return {"status": "ok"}
 @app.post("/run/predict")
 async def predict(request: Request):
     body = await request.json()
     messages = body.get("messages", [])
+    # ✅ Use native chat API (IMPORTANT FIX)
+    output = llm.create_chat_completion(
+        messages=messages,
         max_tokens=body.get("max_tokens", 50),
         temperature=body.get("temperature", 0.7)
     )
+    return output