Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,12 +3,12 @@ from fastapi import FastAPI, HTTPException
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import List, Optional
|
| 5 |
import uvicorn
|
| 6 |
-
from
|
| 7 |
-
import
|
| 8 |
|
| 9 |
-
# Model config (Q5_K_M
|
| 10 |
MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
|
| 11 |
-
MODEL_FILE = "
|
| 12 |
CONTEXT_LENGTH = 32768
|
| 13 |
MAX_TOKENS = 512
|
| 14 |
TEMPERATURE = 0.7
|
|
@@ -34,14 +34,14 @@ class ChatResponse(BaseModel):
|
|
| 34 |
|
| 35 |
def load_model():
|
| 36 |
global model
|
| 37 |
-
print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–
|
| 38 |
-
model =
|
| 39 |
-
MODEL_REPO,
|
| 40 |
model_file=MODEL_FILE,
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
)
|
| 46 |
print("Model loaded! Ready for fast CPU inference.")
|
| 47 |
|
|
@@ -49,34 +49,22 @@ def load_model():
|
|
| 49 |
load_model()
|
| 50 |
|
| 51 |
def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
|
| 52 |
-
#
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
prompt += f"<|im_start|>assistant\n" # Start assistant response
|
| 60 |
-
|
| 61 |
-
# Generate (streams internally but we collect full output)
|
| 62 |
-
response = model(
|
| 63 |
-
prompt,
|
| 64 |
-
max_new_tokens=max_tokens,
|
| 65 |
temperature=temperature,
|
| 66 |
top_p=top_p,
|
| 67 |
-
|
| 68 |
-
|
| 69 |
)
|
| 70 |
|
| 71 |
-
# Extract assistant response
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
if assistant_start != -1:
|
| 75 |
-
response = full_output[assistant_start + len("<|im_start|>assistant\n"):].strip()
|
| 76 |
-
# Clean up any trailing <|im_end|>
|
| 77 |
-
response = response.split("<|im_end|>")[0].strip()
|
| 78 |
-
|
| 79 |
-
return response
|
| 80 |
|
| 81 |
@app.post("/chat/", response_model=ChatResponse)
|
| 82 |
async def chat_endpoint(request: ChatRequest):
|
|
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import List, Optional
|
| 5 |
import uvicorn
|
| 6 |
+
from llama_cpp import Llama
|
| 7 |
+
import os
|
| 8 |
|
| 9 |
+
# Model config: official Qwen GGUF repo, Q5_K_M quantization for CPU inference.
MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
# GGUF files in this repo are named in lowercase with underscores in place of
# dots (e.g. "qwen1_5-0_5b-chat-q5_k_m.gguf"); the name must match a file that
# actually exists in MODEL_REPO or the download fails.
MODEL_FILE = "qwen1_5-0_5b-chat-q5_k_m.gguf"
CONTEXT_LENGTH = 32768  # context window in tokens; passed to the model as n_ctx
MAX_TOKENS = 512  # presumably the default generation cap for requests -- confirm against ChatRequest
TEMPERATURE = 0.7  # presumably the default sampling temperature for requests -- confirm against ChatRequest
|
|
|
|
| 34 |
|
| 35 |
def load_model():
    """Load the quantized GGUF chat model into the module-level ``model``.

    Fetches ``MODEL_FILE`` from the ``MODEL_REPO`` Hugging Face repository via
    ``Llama.from_pretrained`` and binds the resulting ``Llama`` instance to
    the global ``model`` so request handlers can reuse it.
    """
    global model
    print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–15s)")
    model = Llama.from_pretrained(
        repo_id=MODEL_REPO,
        # llama-cpp-python names this parameter `filename`; `model_file` is
        # the ctransformers spelling and is not accepted here.
        filename=MODEL_FILE,
        n_ctx=CONTEXT_LENGTH,
        # 0 is falsy, so llama-cpp-python falls back to its own default thread
        # count (not necessarily every core). Pass an explicit positive value
        # to pin the number of threads.
        n_threads=0,
        verbose=False,  # Reduce logs
        chat_format="chatml",  # Qwen uses the ChatML template; applied to chat messages
    )
    print("Model loaded! Ready for fast CPU inference.")
|
| 47 |
|
|
|
|
| 49 |
# Load the model once at import time; generate_response reads the global `model`.
load_model()
|
| 50 |
|
| 51 |
def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
    """Run one non-streaming chat completion and return the assistant's reply.

    Args:
        messages: Conversation history; each item's ``role`` and ``content``
            are forwarded to the model.
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The ``content`` of the first choice's assistant message.
    """
    # Convert request models to the plain role/content dicts the chat API expects;
    # the chat template configured at load time is applied by the library.
    chat_messages = [{"role": msg.role, "content": msg.content} for msg in messages]

    # NOTE: `echo` is a parameter of the plain completion API only;
    # create_chat_completion does not accept it, so it must not be passed here.
    response = model.create_chat_completion(
        messages=chat_messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,
    )

    # Extract assistant response
    return response["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
@app.post("/chat/", response_model=ChatResponse)
|
| 70 |
async def chat_endpoint(request: ChatRequest):
|