Spaces:

Toilatop1sever
/

AI_Coder

Sleeping

App Files Community

Toilatop1sever committed on May 27

Commit

0a73fa9

verified ·

1 Parent(s): 31decae

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -25

app.py CHANGED Viewed

@@ -5,7 +5,9 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from typing import List, Optional
-import os, json, uvicorn
 app = FastAPI()
 app.add_middleware(
@@ -18,38 +20,42 @@ app.add_middleware(
 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
-MAX_HISTORY = 4          # Giảm từ 6 xuống 4 để bớt prefill
-MAX_CTX     = 4096
-MAX_TOKENS  = 2048       # Giảm nếu không cần sinh dài
-THREADS     = 2          # Giữ nguyên vì 2 vCPU
 llm: Optional[Llama] = None
 @app.on_event("startup")
 async def startup_event():
     global llm
     if os.path.exists(MODEL_FILE) and os.path.getsize(MODEL_FILE) < 1_000_000:
         os.remove(MODEL_FILE)
     if not os.path.exists(MODEL_FILE):
         print(f"Downloading {MODEL_FILE}...")
         hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
         print("Download done!")
-    print("Loading model...")
     llm = Llama(
-        model_path   = MODEL_FILE,
-        n_ctx        = MAX_CTX,
-        n_threads    = THREADS,
-        n_threads_batch = THREADS,   # Thêm nếu llama_cpp hỗ trợ
-        n_batch      = 256,           # Giảm mạnh từ 8192 -> 256
-        n_ubatch     = 128,           # Giảm từ 512 -> 128
-        n_gpu_layers = 0,
-        verbose      = False,
-        use_mmap     = True,
-        use_mlock    = False,
-        flash_attn   = True,          # Bật flash attention nếu phiên bản hỗ trợ
-        logits_all   = False,         # Không cần logits, tiết kiệm
     )
-    print("Model ready!")
 class Message(BaseModel):
     role: str
@@ -64,16 +70,18 @@ class ChatRequest(BaseModel):
     top_p: float = 0.9
 def build_messages(req: ChatRequest) -> list:
-    # Prompt system ngắn gọn hơn một chút
-    system = req.system_prompt or "Bạn là trợ lý AI, trả lời bằng tiếng Việt ."
     msgs = [{"role": "system", "content": system}]
-    # Giữ lại tối đa 8 tin nhắn gần nhất (4 lượt, mỗi lượt gồm user + assistant)
     recent_history = req.history[-(MAX_HISTORY * 2):]
     for msg in recent_history:
         if msg.role in ("user", "assistant") and msg.content.strip():
             if msgs[-1]["role"] != msg.role:
                 msgs.append({"role": msg.role, "content": msg.content.strip()})
-    # Loại bỏ user cuối nếu trùng, rồi thêm prompt mới
     if msgs[-1]["role"] == "user":
         msgs.pop()
     msgs.append({"role": "user", "content": req.prompt.strip()})
@@ -82,7 +90,7 @@ def build_messages(req: ChatRequest) -> list:
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if llm is None:
-        raise HTTPException(503, "Model chưa sẵn sàng")
     if not req.prompt.strip():
         raise HTTPException(400, "Prompt trống")
     if len(req.prompt) > 4000:
@@ -118,7 +126,11 @@ async def chat(req: ChatRequest):
 @app.get("/")
 async def root():
-    return {"status": "ok" if llm else "loading", "model": MODEL_FILE}
 @app.get("/health")
 async def health():

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from typing import List, Optional
+import os
+import json
+import uvicorn
 app = FastAPI()
 app.add_middleware(
 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
+MAX_HISTORY = 4          # Giảm để tiết kiệm token xử lý
+MAX_CTX     = 4096       # Khung ngữ cảnh tối đa
+MAX_TOKENS  = 2048       # Giới hạn token sinh ra để giải phóng tài nguyên sớm
+THREADS     = 2          # Số luồng CPU, khớp với vCPU bạn có
 llm: Optional[Llama] = None
 @app.on_event("startup")
 async def startup_event():
     global llm
+    # 1. Tải model (xóa file cũ nếu lỗi)
     if os.path.exists(MODEL_FILE) and os.path.getsize(MODEL_FILE) < 1_000_000:
         os.remove(MODEL_FILE)
     if not os.path.exists(MODEL_FILE):
         print(f"Downloading {MODEL_FILE}...")
         hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
         print("Download done!")
+    # 2. Khởi tạo model với các tham số tối ưu cho CPU và RAM
+    print("Loading model with RAM-optimized settings...")
     llm = Llama(
+        model_path       = MODEL_FILE,
+        n_ctx            = MAX_CTX,
+        n_threads        = THREADS,
+        n_threads_batch  = THREADS,
+        n_batch          = MAX_CTX,      # Quan trọng: bằng với n_ctx để tối ưu cache
+        n_ubatch         = 512,          # Kích thước micro-batch, cân bằng với n_batch
+        n_gpu_layers     = 0,
+        verbose          = False,
+        use_mmap         = False,        # Tắt mmap, load toàn bộ model vào RAM
+        use_mlock        = True,         # Khóa model trong RAM, tránh swap ra ổ cứng
+        flash_attn       = True,         # Bật Flash Attention, giảm tải bộ nhớ
+        cache_type_k     = "q8_0",       # Lượng tử hóa KV cache (Key) xuống 8-bit
+        cache_type_v     = "q8_0",       # Lượng tử hóa KV cache (Value) xuống 8-bit
     )
+    print("Model ready! All weights and caches are locked in RAM.")
 class Message(BaseModel):
     role: str
     top_p: float = 0.9
 def build_messages(req: ChatRequest) -> list:
+    # System prompt ngắn gọn, tiết kiệm token
+    system = req.system_prompt or "Bạn là trợ lý AI, trả lời bằng tiếng Việt ngắn gọn."
     msgs = [{"role": "system", "content": system}]
+    # Chỉ giữ lại lịch sử tối đa MAX_HISTORY lượt gần nhất (mỗi lượt gồm user + assistant, tức MAX_HISTORY * 2 tin nhắn)
     recent_history = req.history[-(MAX_HISTORY * 2):]
     for msg in recent_history:
         if msg.role in ("user", "assistant") and msg.content.strip():
             if msgs[-1]["role"] != msg.role:
                 msgs.append({"role": msg.role, "content": msg.content.strip()})
+    # Tránh trùng lặp role user cuối
     if msgs[-1]["role"] == "user":
         msgs.pop()
     msgs.append({"role": "user", "content": req.prompt.strip()})
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if llm is None:
+        raise HTTPException(503, "Model chưa sẵn sàng, thử lại sau!")
     if not req.prompt.strip():
         raise HTTPException(400, "Prompt trống")
     if len(req.prompt) > 4000:
 @app.get("/")
 async def root():
+    return {
+        "status"   : "ok" if llm else "loading",
+        "model"    : MODEL_FILE,
+        "message"  : "Model ready (RAM-optimized)!" if llm else "Model đang tải...",
+    }
 @app.get("/health")
 async def health():