Spaces:

Toilatop1sever
/

AI_Coder

Sleeping

App Files Files Community

Toilatop1sever committed on May 27

Commit

31decae

verified ·

1 Parent(s): 71067ad

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -13

app.py CHANGED Viewed

@@ -18,10 +18,10 @@ app.add_middleware(
 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
-MAX_HISTORY = 6
 MAX_CTX     = 4096
-MAX_TOKENS  = 4096
-THREADS     = 2
 llm: Optional[Llama] = None
@@ -39,12 +39,15 @@ async def startup_event():
         model_path   = MODEL_FILE,
         n_ctx        = MAX_CTX,
         n_threads    = THREADS,
-        n_batch      = 8192,
-        n_ubatch     = 512,
         n_gpu_layers = 0,
         verbose      = False,
         use_mmap     = True,
         use_mlock    = False,
     )
     print("Model ready!")
@@ -61,12 +64,16 @@ class ChatRequest(BaseModel):
     top_p: float = 0.9
 def build_messages(req: ChatRequest) -> list:
-    system = req.system_prompt or "/no_think bạn là trợ lý AI, trả lời bằng tiếng Việt."
     msgs = [{"role": "system", "content": system}]
-    for msg in req.history[-(MAX_HISTORY * 2):]:
         if msg.role in ("user", "assistant") and msg.content.strip():
             if msgs[-1]["role"] != msg.role:
                 msgs.append({"role": msg.role, "content": msg.content.strip()})
     if msgs[-1]["role"] == "user":
         msgs.pop()
     msgs.append({"role": "user", "content": req.prompt.strip()})
@@ -75,7 +82,7 @@ def build_messages(req: ChatRequest) -> list:
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if llm is None:
-        raise HTTPException(503, "Model chưa sẵn sàng, thử lại sau!")
     if not req.prompt.strip():
         raise HTTPException(400, "Prompt trống")
     if len(req.prompt) > 4000:
@@ -111,11 +118,7 @@ async def chat(req: ChatRequest):
 @app.get("/")
 async def root():
-    return {
-        "status"  : "ok" if llm else "loading",
-        "model"   : MODEL_FILE,
-        "message" : "Model ready!" if llm else "Model đang tải...",
-    }
 @app.get("/health")
 async def health():

 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
+MAX_HISTORY = 4          # Giảm từ 6 xuống 4 để bớt prefill
 MAX_CTX     = 4096
+MAX_TOKENS  = 2048       # Giảm nếu không cần sinh dài
+THREADS     = 2          # Giữ nguyên vì 2 vCPU
 llm: Optional[Llama] = None
         model_path   = MODEL_FILE,
         n_ctx        = MAX_CTX,
         n_threads    = THREADS,
+        n_threads_batch = THREADS,   # Thêm nếu llama_cpp hỗ trợ
+        n_batch      = 256,           # Giảm mạnh từ 8192 -> 256
+        n_ubatch     = 128,           # Giảm từ 512 -> 128
         n_gpu_layers = 0,
         verbose      = False,
         use_mmap     = True,
         use_mlock    = False,
+        flash_attn   = True,          # Bật flash attention nếu phiên bản hỗ trợ
+        logits_all   = False,         # Không cần logits, tiết kiệm
     )
     print("Model ready!")
     top_p: float = 0.9
 def build_messages(req: ChatRequest) -> list:
+    # Prompt system ngắn gọn hơn một chút
+    system = req.system_prompt or "Bạn là trợ lý AI, trả lời bằng tiếng Việt ."
     msgs = [{"role": "system", "content": system}]
+    # Giữ lại tối đa 4 tin nhắn gần nhất (2 lượt)
+    recent_history = req.history[-(MAX_HISTORY * 2):]
+    for msg in recent_history:
         if msg.role in ("user", "assistant") and msg.content.strip():
             if msgs[-1]["role"] != msg.role:
                 msgs.append({"role": msg.role, "content": msg.content.strip()})
+    # Loại bỏ user cuối nếu trùng, rồi thêm prompt mới
     if msgs[-1]["role"] == "user":
         msgs.pop()
     msgs.append({"role": "user", "content": req.prompt.strip()})
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if llm is None:
+        raise HTTPException(503, "Model chưa sẵn sàng")
     if not req.prompt.strip():
         raise HTTPException(400, "Prompt trống")
     if len(req.prompt) > 4000:
 @app.get("/")
 async def root():
+    return {"status": "ok" if llm else "loading", "model": MODEL_FILE}
 @app.get("/health")
 async def health():