Small_llm

Running

App Files Files Community

everydaytok committed 18 days ago

Commit

c44a1f7

verified ·

1 Parent(s): b88168f

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -58

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 from llama_cpp import Llama
 from threading import Thread
-from queue import Queue, Empty
 import time
 import psutil
 import os
@@ -17,47 +16,39 @@ from huggingface_hub import hf_hub_download
 MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
 GGUF_FILE  = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
-# This is the actual RAM budget — Q4_K_M 0.5B should be ~350MB
-EXPECTED_MB = 350
 model       = None
 load_status = "🔄 Initializing..."
 load_start  = time.time()
 # ─────────────────────────────────────────────────────────────
-# ACCURATE RAM READING — process RSS only, not whole container
 # ─────────────────────────────────────────────────────────────
 def get_process_ram_mb() -> float:
-    """Returns only THIS process's RAM in MB."""
-    proc = psutil.Process(os.getpid())
-    return proc.memory_info().rss / 1024**2
 def get_stats_md() -> str:
-    used_mb   = get_process_ram_mb()
-    used_gb   = used_mb / 1024
-    pct       = min((used_mb / (EXPECTED_MB * 4)) * 100, 100)  # scale bar to 4x expected
-    filled    = int(pct / 10)
-    bar       = "█" * filled + "░" * (10 - filled)
     return (
         f"**Status:** {load_status}  \n"
-        f"**Process RAM:** `[{bar}]` "
-        f"**{used_mb:.0f} MB** ({used_gb:.2f} GB)"
     )
 # ─────────────────────────────────────────────────────────────
-# MODEL LOADING — llama-cpp-python runs GGUF natively
-# stays at ~350MB instead of dequantizing to float32
 # ─────────────────────────────────────────────────────────────
 def load_model():
     global model, load_status
     try:
-        load_status = "🔄 Downloading GGUF (~350MB)..."
         print(load_status)
         model_path = hf_hub_download(
             repo_id=MODEL_REPO,
-            filename=GGUF_FILE,
         )
         load_status = "🔄 Loading into llama.cpp..."
@@ -65,15 +56,14 @@ def load_model():
         model = Llama(
             model_path=model_path,
-            n_ctx=2048,        # context window
-            n_threads=4,       # CPU threads
-            n_gpu_layers=0,    # CPU only
             verbose=False
         )
         elapsed = time.time() - load_start
-        ram_mb  = get_process_ram_mb()
-        load_status = f"✅ Ready in {elapsed:.0f}s · {ram_mb:.0f} MB used"
         print(load_status)
     except Exception as e:
@@ -84,7 +74,7 @@ Thread(target=load_model, daemon=True).start()
 # ─────────────────────────────────────────────────────────────
-# PROMPT FORMAT — Qwen2.5 ChatML
 # ─────────────────────────────────────────────────────────────
 def build_prompt(system: str, history: list, user: str) -> str:
     parts = []
@@ -105,15 +95,14 @@ def build_prompt(system: str, history: list, user: str) -> str:
 # ─────────────────────────────────────────────────────────────
 def chat(message: str, history: list, system_prompt: str):
     if model is None:
-        yield "⏳ Model still loading...", get_stats_md()
         return
-    prompt = build_prompt(system_prompt, history, message)
-    t0     = time.time()
-    output = ""
-    count  = 0
-    # llama-cpp-python native streaming
     stream = model(
         prompt,
         max_tokens=512,
@@ -125,7 +114,7 @@ def chat(message: str, history: list, system_prompt: str):
     )
     for chunk in stream:
-        token  = chunk["choices"][0]["text"]
         output += token
         count  += 1
         elapsed = time.time() - t0
@@ -141,46 +130,47 @@ def chat(message: str, history: list, system_prompt: str):
 # ─────────────────────────────────────────────────────────────
-# GRADIO UI — mobile-first, minimal, no broken SVG
 # ─────────────────────────────────────────────────────────────
 CSS = """
-/* ── reset Gradio's giant empty-chatbot SVG placeholder ── */
 .empty.svelte-byatnx { display: none !important; }
 .wrap.svelte-byatnx  { min-height: 20px !important; }
-/* ── stats panel ── */
 #stats {
     background: #0f172a;
     color: #94a3b8;
     border-radius: 8px;
     padding: 10px 14px;
     font-size: 0.82rem;
-    line-height: 1.6;
-    margin-bottom: 6px;
 }
-/* ── make textbox taller on mobile ── */
-#msg textarea { font-size: 1rem; }
-/* ── send button full width on small screens ── */
 @media (max-width: 600px) {
-    #send-btn { width: 100% !important; }
 }
 footer { display: none !important; }
 """
-with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
-    gr.Markdown("## 🧠 Qwen2.5-0.5B Q4_K_M")
-    # Live stats — always at top
     stats_md = gr.Markdown(
         value=get_stats_md(),
         elem_id="stats"
     )
-    # System prompt — hidden by default
     with gr.Accordion("⚙️ System Prompt", open=False):
         system_box = gr.Textbox(
             value="You are a helpful assistant.",
@@ -188,37 +178,36 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
             show_label=False
         )
-    # Chat
     chatbot = gr.Chatbot(
         value=[],
-        label="",
         show_label=False,
-        height=380,
-        # No placeholder icon
-        placeholder=None
     )
-    # Input row
     with gr.Row(equal_height=True):
         msg = gr.Textbox(
-            placeholder="Message…",
             show_label=False,
-            scale=8,
             lines=1,
-            max_lines=4,
             elem_id="msg"
         )
         send_btn = gr.Button(
             "➤",
             variant="primary",
             scale=1,
-            elem_id="send-btn",
-            min_width=48
         )
     clear = gr.Button("🗑️ Clear", size="sm")
-    # ── event wiring ────────────────────────────────────────
     def user_turn(message, history):
         return "", history + [[message, ""]]
@@ -250,7 +239,7 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
 # ─────────────────────────────────────────────────────────────
 # FASTAPI
 # ─────────────────────────────────────────────────────────────
-app = FastAPI(title="Qwen2.5-0.5B")
 class ChatRequest(BaseModel):
     message: str
@@ -283,13 +272,22 @@ def api_chat(req: ChatRequest):
     )
     text = result["choices"][0]["text"].strip()
     return {
         "response": text,
         "tokens": result["usage"]["completion_tokens"],
         "process_ram_mb": round(get_process_ram_mb(), 1)
     }
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import gradio as gr
 from llama_cpp import Llama
 from threading import Thread
 import time
 import psutil
 import os
 MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
 GGUF_FILE  = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
 model       = None
 load_status = "🔄 Initializing..."
 load_start  = time.time()
 # ─────────────────────────────────────────────────────────────
+# RAM — process-only, not container/host
 # ─────────────────────────────────────────────────────────────
 def get_process_ram_mb() -> float:
+    return psutil.Process(os.getpid()).memory_info().rss / 1024**2
 def get_stats_md() -> str:
+    mb     = get_process_ram_mb()
+    filled = min(int(mb / 100), 10)   # 1 block per 100MB, max 10
+    bar    = "█" * filled + "░" * (10 - filled)
     return (
         f"**Status:** {load_status}  \n"
+        f"**Process RAM:** `[{bar}]` **{mb:.0f} MB**"
     )
 # ─────────────────────────────────────────────────────────────
+# MODEL LOADING
 # ─────────────────────────────────────────────────────────────
 def load_model():
     global model, load_status
     try:
+        load_status = "🔄 Downloading GGUF (~350 MB)..."
         print(load_status)
         model_path = hf_hub_download(
             repo_id=MODEL_REPO,
+            filename=GGUF_FILE
         )
         load_status = "🔄 Loading into llama.cpp..."
         model = Llama(
             model_path=model_path,
+            n_ctx=2048,
+            n_threads=os.cpu_count() or 4,
+            n_gpu_layers=0,
             verbose=False
         )
         elapsed = time.time() - load_start
+        load_status = f"✅ Ready — {get_process_ram_mb():.0f} MB · {elapsed:.0f}s load time"
         print(load_status)
     except Exception as e:
 # ─────────────────────────────────────────────────────────────
+# PROMPT — Qwen2.5 ChatML format
 # ─────────────────────────────────────────────────────────────
 def build_prompt(system: str, history: list, user: str) -> str:
     parts = []
 # ─────────────────────────────────────────────────────────────
 def chat(message: str, history: list, system_prompt: str):
     if model is None:
+        yield "⏳ Model still loading — please wait.", get_stats_md()
         return
+    prompt  = build_prompt(system_prompt, history, message)
+    t0      = time.time()
+    output  = ""
+    count   = 0
     stream = model(
         prompt,
         max_tokens=512,
     )
     for chunk in stream:
+        token   = chunk["choices"][0]["text"]
         output += token
         count  += 1
         elapsed = time.time() - t0
 # ─────────────────────────────────────────────────────────────
+# GRADIO UI
 # ─────────────────────────────────────────────────────────────
 CSS = """
+/* hide empty chatbot SVG placeholder */
 .empty.svelte-byatnx { display: none !important; }
 .wrap.svelte-byatnx  { min-height: 20px !important; }
 #stats {
     background: #0f172a;
     color: #94a3b8;
     border-radius: 8px;
     padding: 10px 14px;
     font-size: 0.82rem;
+    line-height: 1.7;
+    margin-bottom: 8px;
 }
+#chatbot .message {
+    font-size: 0.95rem;
+    line-height: 1.5;
+}
+/* full-width send on mobile */
 @media (max-width: 600px) {
+    #send-btn { width: 100% !important; margin-top: 6px; }
 }
 footer { display: none !important; }
 """
+with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
+    gr.Markdown("## 🧠 Qwen2.5-0.5B · Q4_K_M · CPU")
+    # ── always-visible status bar ────────────────────────────
     stats_md = gr.Markdown(
         value=get_stats_md(),
         elem_id="stats"
     )
+    # ── optional system prompt ───────────────────────────────
     with gr.Accordion("⚙️ System Prompt", open=False):
         system_box = gr.Textbox(
             value="You are a helpful assistant.",
             show_label=False
         )
+    # ── conversation ─────────────────────────────────────────
     chatbot = gr.Chatbot(
         value=[],
         show_label=False,
+        height=400,
+        placeholder=None,
+        bubble_full_width=False
     )
+    # ── input row ────────────────────────────────────────────
     with gr.Row(equal_height=True):
         msg = gr.Textbox(
+            placeholder="Type a message…",
             show_label=False,
+            scale=9,
             lines=1,
+            max_lines=5,
             elem_id="msg"
         )
         send_btn = gr.Button(
             "➤",
             variant="primary",
             scale=1,
+            min_width=48,
+            elem_id="send-btn"
         )
     clear = gr.Button("🗑️ Clear", size="sm")
+    # ── wiring ───────────────────────────────────────────────
     def user_turn(message, history):
         return "", history + [[message, ""]]
 # ─────────────────────────────────────────────────────────────
 # FASTAPI
 # ─────────────────────────────────────────────────────────────
+app = FastAPI(title="Qwen2.5-0.5B API")
 class ChatRequest(BaseModel):
     message: str
     )
     text = result["choices"][0]["text"].strip()
     return {
         "response": text,
         "tokens": result["usage"]["completion_tokens"],
         "process_ram_mb": round(get_process_ram_mb(), 1)
     }
+# ─────────────────────────────────────────────────────────────
+# MOUNT + RUN
+# ─────────────────────────────────────────────────────────────
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
+    print("\n🌐 Starting on http://0.0.0.0:7860")
+    print("   UI     → http://0.0.0.0:7860/")
+    print("   API    → POST http://0.0.0.0:7860/chat")
+    print("   Health → GET  http://0.0.0.0:7860/health\n")
     uvicorn.run(app, host="0.0.0.0", port=7860)