Adedoyinjames committed on
Commit
dcc1a4f
·
verified ·
1 Parent(s): 12c2fc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -35
app.py CHANGED
@@ -3,12 +3,12 @@ from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from typing import List, Optional
5
  import uvicorn
6
- from ctransformers import AutoModelForCausalLM
7
- import torch # For CPU checks
8
 
9
- # Model config (Q5_K_M quantization: fast on CPU, ~300MB, high quality)
10
  MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
11
- MODEL_FILE = "qwen1_5-0_5b-chat-q5_k_m.gguf" # Download via HF CLI if needed
12
  CONTEXT_LENGTH = 32768
13
  MAX_TOKENS = 512
14
  TEMPERATURE = 0.7
@@ -34,14 +34,14 @@ class ChatResponse(BaseModel):
34
 
35
  def load_model():
36
  global model
37
- print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–20s)")
38
- model = AutoModelForCausalLM.from_pretrained(
39
- MODEL_REPO,
40
  model_file=MODEL_FILE,
41
- model_type="qwen", # Qwen architecture
42
- context_length=CONTEXT_LENGTH,
43
- gpu_layers=0, # Force CPU (set to >0 if GPU available)
44
- threads=0 # Auto-detect CPU threads for max speed
45
  )
46
  print("Model loaded! Ready for fast CPU inference.")
47
 
@@ -49,34 +49,22 @@ def load_model():
49
  load_model()
50
 
51
  def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
52
- # Build prompt in Qwen chat format (multi-turn support)
53
- prompt = ""
54
- for msg in messages:
55
- if msg.role == "user":
56
- prompt += f"<|im_start|>user\n{msg.content}<|im_end|>\n"
57
- elif msg.role == "assistant":
58
- prompt += f"<|im_start|>assistant\n{msg.content}<|im_end|>\n"
59
- prompt += f"<|im_start|>assistant\n" # Start assistant response
60
-
61
- # Generate (streams internally but we collect full output)
62
- response = model(
63
- prompt,
64
- max_new_tokens=max_tokens,
65
  temperature=temperature,
66
  top_p=top_p,
67
- do_sample=True,
68
- stream=False # Full response for API simplicity
69
  )
70
 
71
- # Extract assistant response (strip prompt)
72
- full_output = response
73
- assistant_start = prompt.rfind("<|im_start|>assistant\n")
74
- if assistant_start != -1:
75
- response = full_output[assistant_start + len("<|im_start|>assistant\n"):].strip()
76
- # Clean up any trailing <|im_end|>
77
- response = response.split("<|im_end|>")[0].strip()
78
-
79
- return response
80
 
81
  @app.post("/chat/", response_model=ChatResponse)
82
  async def chat_endpoint(request: ChatRequest):
 
3
  from pydantic import BaseModel
4
  from typing import List, Optional
5
  import uvicorn
6
+ from llama_cpp import Llama
7
+ import os
8
 
9
+ # Model config (Official Qwen GGUF repo; Q5_K_M: fast on CPU, ~300MB, high quality)
10
  MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
11
+ MODEL_FILE = "Qwen1.5-0.5B-Chat-Q5_K_M.gguf" # Correct file name with dots & uppercase
12
  CONTEXT_LENGTH = 32768
13
  MAX_TOKENS = 512
14
  TEMPERATURE = 0.7
 
34
 
35
  def load_model():
36
  global model
37
+ print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–15s)")
38
+ model = Llama.from_pretrained(
39
+ repo_id=MODEL_REPO,
40
  model_file=MODEL_FILE,
41
+ n_ctx=CONTEXT_LENGTH,
42
+ n_threads=0, # Auto-detect all CPU threads for max speed
43
+ verbose=False, # Reduce logs
44
+ chat_format="chatml" # Qwen uses ChatML template; auto-applies to messages
45
  )
46
  print("Model loaded! Ready for fast CPU inference.")
47
 
 
49
  load_model()
50
 
51
  def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
52
+ # Prepare messages list (llama-cpp auto-applies Qwen chat template)
53
+ chat_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
54
+
55
+ # Generate using built-in chat completion (handles template, sampling, etc.)
56
+ response = model.create_chat_completion(
57
+ messages=chat_messages,
58
+ max_tokens=max_tokens,
 
 
 
 
 
 
59
  temperature=temperature,
60
  top_p=top_p,
61
+ stream=False,
62
+ echo=False # Don't repeat input
63
  )
64
 
65
+ # Extract assistant response
66
+ bot_reply = response["choices"][0]["message"]["content"]
67
+ return bot_reply
 
 
 
 
 
 
68
 
69
  @app.post("/chat/", response_model=ChatResponse)
70
  async def chat_endpoint(request: ChatRequest):