Spaces:

Toilatop1sever
/

AI_Coder

Sleeping

App Files Files Community

Toilatop1sever committed on May 28

Commit

86e793d

verified ·

1 Parent(s): 58615c9

Update app.py

Browse files

Files changed (1) hide show

app.py +302 -104

app.py CHANGED Viewed

@@ -5,11 +5,18 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from typing import List, Optional
 import os
 import json
 import uvicorn
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -17,166 +24,357 @@ app.add_middleware(
     allow_headers=["*"],
 )
 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
-# ── Triết lý tối ưu ───────────────────────────────────────────────────────
-# RAM 18GB dư dả → nhét hết vào RAM, dùng prefix cache để CPU
-# không phải recompute system prompt mỗi request
-# n_batch = 4096 (sweet spot) — đủ để prefill nhanh mà không gây RAM spike
-# ─────────────────────────────────────────────────────────────────────────
 MAX_HISTORY = 6
-MAX_CTX     = 8192
-MAX_TOKENS  = 2048
-THREADS     = 2
-# System prompt cố định — sẽ được cache sẵn vào KV cache lúc startup
-# CPU chỉ tính 1 lần duy nhất, mọi request sau dùng lại cache này
-DEFAULT_SYSTEM = "Bạn là trợ lý AI, trả lời bằng tiếng Việt ngắn gọn."
 llm: Optional[Llama] = None
 @app.on_event("startup")
 async def startup_event():
     global llm
-    if os.path.exists(MODEL_FILE) and os.path.getsize(MODEL_FILE) < 1_000_000:
         os.remove(MODEL_FILE)
     if not os.path.exists(MODEL_FILE):
         print(f"Downloading {MODEL_FILE}...")
-        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
-        print("Download done!")
-    print("Loading model — RAM-heavy, CPU-light mode...")
     llm = Llama(
-        model_path      = MODEL_FILE,
-        # ── Context & batch ───────────────────────────────────────────────
-        n_ctx           = MAX_CTX,
-        n_batch         = 512 ,   # Nhỏ vừa tay CPU: 2 vCPU không bị nghẹt khi prefill
-        n_ubatch        = 512   ,    # Giữ nhỏ: ổn định hơn khi decode
-        # ── CPU ───────────────────────────────────────────────────────────
-        n_threads       = THREADS,
-        n_threads_batch = THREADS,
-        n_gpu_layers    = 0,
-        # ── RAM: load toàn bộ, khóa lại, không swap ──────────────────────
-        use_mmap        = False,
-        use_mlock       = True,
-        # ── KV Cache quantize — ăn RAM ít hơn, CPU vẫn nhẹ ───────────────
-        cache_type_k    = "q4_0",
-        cache_type_v    = "q4_0",
-        # ── Prefix cache: CPU tính system prompt 1 lần rồi thôi ──────────
-        last_n_tokens_size = 64,  # Cửa sổ detect prefix trùng
-        flash_attn      = True,
-        verbose         = False,
-    )
-    # ── Warm up prefix cache với system prompt ────────────────────────────
-    # Gọi 1 lần lúc startup để KV cache của system prompt được lưu sẵn
-    # Mọi request sau có cùng system prompt → CPU bỏ qua phần này hoàn toàn
-    print("Warming up prefix cache...")
-    warmup_msgs = [
-        {"role": "system", "content": DEFAULT_SYSTEM},
-        {"role": "user",   "content": "hi"},
-    ]
-    _ = llm.create_chat_completion(
-        messages   = warmup_msgs,
-        max_tokens = 1,
-        stream     = False,
-    )
-    print("Prefix cache warmed up! Model ready.")
-class Message(BaseModel):
-    role: str
-    content: str
-class ChatRequest(BaseModel):
-    prompt: str
-    history: List[Message] = []
-    system_prompt: Optional[str] = None  # Để None → tận dụng prefix cache
-    max_tokens: int = MAX_TOKENS
-    temperature: float = 0.7
-    top_p: float = 0.9
-def build_messages(req: ChatRequest) -> list:
-    # Dùng DEFAULT_SYSTEM nếu không truyền system_prompt
-    # → prefix cache luôn hit, CPU không recompute
-    system = req.system_prompt or DEFAULT_SYSTEM
-    msgs = [{"role": "system", "content": system}]
-    recent = req.history[-(MAX_HISTORY * 2):]
-    for msg in recent:
-        if msg.role in ("user", "assistant") and msg.content.strip():
-            if msgs[-1]["role"] != msg.role:
-                msgs.append({"role": msg.role, "content": msg.content.strip()})
-    if msgs[-1]["role"] == "user":
-        msgs.pop()
-    msgs.append({"role": "user", "content": req.prompt.strip()})
-    return msgs
 @app.post("/chat")
 async def chat(req: ChatRequest):
     if llm is None:
-        raise HTTPException(503, "Model chưa sẵn sàng, thử lại sau!")
-    if not req.prompt.strip():
-        raise HTTPException(400, "Prompt trống")
-    if len(req.prompt) > 8000:
-        raise HTTPException(400, "Prompt quá dài")
     messages = build_messages(req)
-    def generate():
         full = ""
-        try:
-            for chunk in llm.create_chat_completion(
-                messages    = messages,
-                max_tokens  = req.max_tokens,
-                temperature = req.temperature,
-                top_p       = req.top_p,
-                stream      = True,
-            ):
-                delta = chunk["choices"][0]["delta"].get("content", "")
-                if delta:
-                    full += delta
-                    yield f"data: {json.dumps({'delta': delta}, ensure_ascii=False)}\n\n"
-        except Exception as e:
-            yield f"data: {json.dumps({'delta': f'[Lỗi: {str(e)}]'})}\n\n"
-        finally:
-            print(f">> Done ({len(full)} chars): {full[:80]}")
-            yield "data: [DONE]\n\n"
     return StreamingResponse(
-        generate(),
-        media_type = "text/event-stream",
-        headers    = {"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
     )
 @app.get("/")
 async def root():
     return {
-        "status"  : "ok" if llm else "loading",
-        "model"   : MODEL_FILE,
-        "message" : "Model ready (prefix cache active)!" if llm else "Model đang tải...",
     }
 @app.get("/health")
 async def health():
-    return {"status": "healthy", "model_loaded": llm is not None}
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from typing import List, Optional
+import asyncio
 import os
 import json
 import uvicorn
+import gc
+# =============================================================================
+# FASTAPI
+# =============================================================================
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# =============================================================================
+# MODEL CONFIG
+# =============================================================================
+# Hugging Face repository and GGUF file name; startup_event() downloads the
+# file into the working directory when it is not already present.
 MODEL_REPO = "unsloth/Qwen3-4B-GGUF"
 MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
+# build_messages() considers at most MAX_HISTORY * 2 trailing history messages.
 MAX_HISTORY = 6
+# Context size handed to Llama(n_ctx=...).
+MAX_CTX = 8192
+# Default for ChatRequest.max_tokens and the upper clamp applied in chat().
+MAX_TOKENS = 4096
+# Keep these parameters unchanged, as requested
+THREADS = 2
+N_BATCH = 512
+N_UBATCH = 512
+# System prompt used when a request does not supply its own.
+DEFAULT_SYSTEM = (
+    "Bạn là trợ lý AI, trả lời bằng tiếng Việt ngắn gọn."
+)
+# Stop sequences passed to create_chat_completion(stop=...) in chat().
+STOP_TOKENS = [
+    "<|im_end|>",
+    "<|endoftext|>",
+]
+# =============================================================================
+# GLOBALS
+# =============================================================================
+# Shared model instance; stays None until startup_event() has assigned it.
 llm: Optional[Llama] = None
+# CPU inference -> serialize requests to avoid lag/token collapse
+inference_lock = asyncio.Semaphore(1)
+# =============================================================================
+# REQUEST MODELS
+# =============================================================================
+class Message(BaseModel):
+    """One prior conversation turn supplied by the client."""
+
+    # build_messages() keeps only "user" / "assistant" (compared
+    # case-insensitively after stripping).
+    role: str
+    # Text of the turn; turns that are empty after cleanup are dropped.
+    content: str
+class ChatRequest(BaseModel):
+    """JSON body accepted by POST /chat."""
+
+    # New user message; build_messages() rejects it when it is empty after
+    # cleanup or longer than 8000 characters.
+    prompt: str
+    # Earlier turns; only the trailing MAX_HISTORY * 2 entries are considered.
+    history: List[Message] = []
+    # Optional override; DEFAULT_SYSTEM is used when this is None or empty.
+    system_prompt: Optional[str] = None
+    # Sampling controls; chat() clamps them to [1, MAX_TOKENS], [0.0, 2.0]
+    # and [0.1, 1.0] respectively before calling the model.
+    max_tokens: int = MAX_TOKENS
+    temperature: float = 0.7
+    top_p: float = 0.9
+# =============================================================================
+# HELPERS
+# =============================================================================
+def cleanup_text(text: str) -> str:
+    """Return *text* with NUL characters removed and outer whitespace trimmed.
+
+    NULs are removed *before* stripping: a NUL is not whitespace, so stripping
+    first would leave behind any whitespace that sat between a leading or
+    trailing NUL and the real content.
+    """
+    return text.replace("\x00", "").strip()
+def build_messages(req: ChatRequest) -> list:
+    """Assemble the chat-completion message list for one request.
+
+    The list starts with the system prompt, continues with the usable part
+    of the trailing history, and always ends with the new user prompt.
+
+    Raises:
+        HTTPException: 400 when the cleaned prompt is empty or longer than
+            8000 characters.
+    """
+    conversation = [
+        {
+            "role": "system",
+            "content": cleanup_text(req.system_prompt or DEFAULT_SYSTEM),
+        }
+    ]
+    for turn in req.history[-(MAX_HISTORY * 2):]:
+        speaker = turn.role.strip().lower()
+        text = cleanup_text(turn.content)
+        if speaker not in ("user", "assistant") or not text:
+            continue
+        # avoid consecutive turns from the same role
+        if conversation[-1]["role"] == speaker:
+            continue
+        conversation.append({"role": speaker, "content": text})
+    question = cleanup_text(req.prompt)
+    if not question:
+        raise HTTPException(400, "Prompt trống")
+    if len(question) > 8000:
+        raise HTTPException(400, "Prompt quá dài")
+    # A user turn left dangling at the end of history is replaced by the
+    # new prompt so two user messages never follow each other.
+    if conversation[-1]["role"] == "user":
+        conversation.pop()
+    conversation.append({"role": "user", "content": question})
+    return conversation
+def sse(data):
+    """Serialise *data* as JSON and wrap it in one SSE ``data:`` frame."""
+    payload = json.dumps(data, ensure_ascii=False)
+    return "data: " + payload + "\n\n"
+# =============================================================================
+# STARTUP
+# =============================================================================
 @app.on_event("startup")
 async def startup_event():
+    """Fetch the GGUF file if needed, load it into ``llm`` and warm it up."""
     global llm
+    # Remove a corrupt file (anything smaller than 1,000,000 bytes)
+    is_stub = (
+        os.path.exists(MODEL_FILE)
+        and os.path.getsize(MODEL_FILE) < 1_000_000
+    )
+    if is_stub:
         os.remove(MODEL_FILE)
+    # Download if not present yet
     if not os.path.exists(MODEL_FILE):
         print(f"Downloading {MODEL_FILE}...")
+        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
+        print("Download complete!")
+    print("Loading model...")
+    llama_options = {
+        "model_path": MODEL_FILE,
+        # Context
+        "n_ctx": MAX_CTX,
+        # Batch sizes (kept unchanged)
+        "n_batch": N_BATCH,
+        "n_ubatch": N_UBATCH,
+        # CPU only: no layers are offloaded
+        "n_threads": THREADS,
+        "n_threads_batch": THREADS,
+        "n_gpu_layers": 0,
+        # RAM
+        "use_mmap": False,
+        "use_mlock": True,
+        # KV cache
+        # NOTE(review): llama-cpp-python documents type_k / type_v for the
+        # KV-cache type; these two names may be swallowed by **kwargs and
+        # have no effect -- confirm against the installed version.
+        "cache_type_k": "q4_0",
+        "cache_type_v": "q4_0",
+        # Recent-token window size (original note: "prefix detection" --
+        # TODO confirm what this setting actually influences)
+        "last_n_tokens_size": 64,
+        # Performance
+        "flash_attn": True,
+        # Cleaner logs
+        "verbose": False,
+    }
     llm = Llama(
+        **llama_options,
+    )
+    print("Warmup model...")
+    warmup_messages = [
+        {"role": "system", "content": DEFAULT_SYSTEM},
+        {"role": "user", "content": "hi"},
+    ]
+    try:
+        # One-token completion; a failure here is logged and not fatal.
+        llm.create_chat_completion(
+            messages=warmup_messages,
+            max_tokens=1,
+            stream=False,
+        )
+    except Exception as e:
+        print(f"Warmup failed: {e}")
+    gc.collect()
+    print("Model ready!")
+# =============================================================================
+# CHAT
+# =============================================================================
 @app.post("/chat")
 async def chat(req: ChatRequest):
+    """Stream a chat completion for *req* as server-sent events.
+
+    Each generated fragment is sent as ``data: {"delta": ...}``; a failure
+    during generation is sent as ``data: {"error": ...}``. The stream ends
+    with ``data: [DONE]`` unless the generator was closed early (for
+    example because the client went away).
+
+    Raises:
+        HTTPException: 503 while the model is not loaded; 400 (raised by
+            build_messages) when the prompt is empty or too long.
+    """
     if llm is None:
+        raise HTTPException(
+            503,
+            "Model chưa sẵn sàng",
+        )
     messages = build_messages(req)
+    # Clamp so a client cannot request absurd values (e.g. 999999 tokens)
+    max_tokens = min(max(1, req.max_tokens), MAX_TOKENS)
+    temperature = min(max(0.0, req.temperature), 2.0)
+    top_p = min(max(0.1, req.top_p), 1.0)
+    async def event_stream():
         full = ""
+        # One generation at a time: the semaphore serialises requests.
+        async with inference_lock:
+            try:
+                stream = llm.create_chat_completion(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    stop=STOP_TOKENS,
+                    stream=True,
+                )
+                # NOTE(review): this loop is synchronous, so the event loop
+                # only gets to run between yielded chunks; consider moving
+                # generation to a worker thread.
+                for chunk in stream:
+                    try:
+                        delta = (
+                            chunk["choices"][0]
+                            .get("delta", {})
+                            .get("content", "")
+                        )
+                    except (KeyError, IndexError, AttributeError, TypeError):
+                        # Malformed chunk: skip it and keep streaming.
+                        continue
+                    if not delta:
+                        continue
+                    full += delta
+                    yield sse({"delta": delta})
+            except Exception as e:
+                yield sse({"error": str(e)})
+            finally:
+                # No yield in here: when the generator is being closed,
+                # yielding from ``finally`` raises
+                # "async generator ignored GeneratorExit".
+                print(f"[DONE] {len(full)} chars")
+                gc.collect()
+        # Reached only on normal completion or after a reported error; the
+        # lock is already released while this final frame is sent.
+        yield "data: [DONE]\n\n"
     return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+        },
     )
+# =============================================================================
+# HEALTH
+# =============================================================================
 @app.get("/")
 async def root():
+    """Report load status together with the main model settings."""
+    status = "ok" if llm else "loading"
     return {
+        "status": status,
+        "model": MODEL_FILE,
+        "ctx": MAX_CTX,
+        "batch": N_BATCH,
+        "threads": THREADS,
     }
 @app.get("/health")
 async def health():
+    """Health probe: ``healthy`` is true once ``llm`` has been assigned."""
+    model_loaded = llm is not None
+    return {"healthy": model_loaded}
+# =============================================================================
+# MAIN
+# =============================================================================
 if __name__ == "__main__":
+    # Direct execution only: serve the app on all interfaces, port 7860.
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=7860,
+        # production-ish: access log and Server header switched off
+        access_log=False,
+        server_header=False,
+    )