zerovic committed on
Commit
38d28a3
·
verified ·
1 Parent(s): 955a0cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -23
app.py CHANGED
@@ -3,46 +3,28 @@ from llama_cpp import Llama
3
 
4
  app = FastAPI()
5
 
6
- # Load model ONCE
7
  llm = Llama(
8
  model_path="/app/models/model.gguf",
9
  n_ctx=2048,
10
  n_threads=6
11
  )
12
 
13
- # Health check
14
  @app.get("/")
15
  def root():
16
  return {"status": "ok"}
17
 
18
- # Main endpoint (FIXED)
19
  @app.post("/run/predict")
20
  async def predict(request: Request):
21
  body = await request.json()
22
 
23
  messages = body.get("messages", [])
24
 
25
- # Convert messages to a prompt
26
- prompt = ""
27
- for msg in messages:
28
- role = msg.get("role")
29
- content = msg.get("content")
30
- prompt += f"{role}: {content}\n"
31
-
32
- prompt += "assistant:"
33
-
34
- output = llm(
35
- prompt,
36
  max_tokens=body.get("max_tokens", 50),
37
  temperature=body.get("temperature", 0.7)
38
  )
39
 
40
- return {
41
- "choices": [
42
- {
43
- "message": {
44
- "content": output["choices"][0]["text"]
45
- }
46
- }
47
- ]
48
- }
 
3
 
4
  app = FastAPI()
5
 
6
+ # Load model once
7
  llm = Llama(
8
  model_path="/app/models/model.gguf",
9
  n_ctx=2048,
10
  n_threads=6
11
  )
12
 
 
13
  @app.get("/")
14
  def root():
15
  return {"status": "ok"}
16
 
 
17
  @app.post("/run/predict")
18
  async def predict(request: Request):
19
  body = await request.json()
20
 
21
  messages = body.get("messages", [])
22
 
23
+ # Use native chat API (IMPORTANT FIX)
24
+ output = llm.create_chat_completion(
25
+ messages=messages,
 
 
 
 
 
 
 
 
26
  max_tokens=body.get("max_tokens", 50),
27
  temperature=body.get("temperature", 0.7)
28
  )
29
 
30
+ return output