Small_llm

Running

App Files Files Community

everydaytok committed 17 days ago

Commit

ff4ac49

verified ·

1 Parent(s): 8fbb150

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -92

app.py CHANGED Viewed

@@ -1,69 +1,89 @@
 import gradio as gr
-from llama_cpp import Llama
 from threading import Thread
 import time
 import psutil
 import os
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 import uvicorn
-from huggingface_hub import hf_hub_download
 # ─────────────────────────────────────────────────────────────
 # CONFIGURATION
 # ─────────────────────────────────────────────────────────────
 # Hub repo and GGUF filename passed to hf_hub_download() in load_model().
-MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
-GGUF_FILE  = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
 # Set by load_model(); chat() treats None as "still loading".
 model       = None
 # Human-readable loading state, shown in the stats panel and printed to stdout.
 load_status = "🔄 Initializing..."
 # Wall-clock reference used to report the total load time.
 load_start  = time.time()
 # ─────────────────────────────────────────────────────────────
-# RAM — process-only, not container/host
 # ─────────────────────────────────────────────────────────────
 def get_process_ram_mb() -> float:
     # Resident set size (RSS) of the current process only, converted from bytes to MiB.
     return psutil.Process(os.getpid()).memory_info().rss / 1024**2
-def get_stats_md() -> str:
     # Build the two-line Markdown status text: the current load_status plus this
     # process's RAM drawn as a 10-cell bar (filled cells capped at 10).
     mb     = get_process_ram_mb()
-    filled = min(int(mb / 100), 10)   # 1 block per 100MB, max 10
     bar    = "█" * filled + "░" * (10 - filled)
     # The two trailing spaces before "\n" force a Markdown line break.
-    return (
-        f"**Status:** {load_status}  \n"
-        f"**Process RAM:** `[{bar}]` **{mb:.0f} MB**"
-    )
 # ─────────────────────────────────────────────────────────────
-# MODEL LOADING
 # ─────────────────────────────────────────────────────────────
 def load_model():
-    global model, load_status
     try:
-        load_status = "🔄 Downloading GGUF (~350 MB)..."
         print(load_status)
-        model_path = hf_hub_download(
-            repo_id=MODEL_REPO,
-            filename=GGUF_FILE
         )
-        load_status = "🔄 Loading into llama.cpp..."
         print(load_status)
-        model = Llama(
-            model_path=model_path,
-            n_ctx=2048,
-            n_threads=os.cpu_count() or 4,
-            n_gpu_layers=0,
-            verbose=False
         )
         elapsed = time.time() - load_start
-        load_status = f"✅ Ready — {get_process_ram_mb():.0f} MB · {elapsed:.0f}s load time"
         print(load_status)
     except Exception as e:
@@ -74,7 +94,7 @@ Thread(target=load_model, daemon=True).start()
 # ─────────────────────────────────────────────────────────────
-# PROMPT — Qwen2.5 ChatML format
 # ─────────────────────────────────────────────────────────────
 def build_prompt(system: str, history: list, user: str) -> str:
     parts = []
@@ -94,49 +114,51 @@ def build_prompt(system: str, history: list, user: str) -> str:
 # STREAMING GENERATOR
 # ─────────────────────────────────────────────────────────────
 def chat(message: str, history: list, system_prompt: str):
     # Streaming generator: yields (reply text so far, stats Markdown) tuples.
     # Guard: load_model() has not assigned the global `model` yet.
-    if model is None:
         yield "⏳ Model still loading — please wait.", get_stats_md()
         return
-    prompt  = build_prompt(system_prompt, history, message)
-    t0      = time.time()
-    output  = ""
-    count   = 0
     # stream=True makes the model call return an iterator of chunks instead of
     # one final result; generation stops at the ChatML markers listed in `stop`.
-    stream = model(
-        prompt,
-        max_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        repeat_penalty=1.1,
-        stop=["<|im_end|>", "<|im_start|>"],
-        stream=True
     )
-    for chunk in stream:
-        token   = chunk["choices"][0]["text"]
-        output += token
         # count goes up once per streamed chunk and is what gets reported as "Tokens".
         count  += 1
         elapsed = time.time() - t0
         # Guard against division by zero on the very first chunk.
         tps     = count / elapsed if elapsed > 0 else 0
-        stats   = (
-            f"**Status:** {load_status}  \n"
-            f"**Process RAM:** {get_process_ram_mb():.0f} MB  \n"
-            f"**Speed:** {tps:.1f} t/s · "
-            f"**Tokens:** {count} · "
-            f"**Elapsed:** {elapsed:.1f}s"
-        )
         # Yield the accumulated text (not just the new piece) together with fresh stats.
-        yield output, stats
 # ─────────────────────────────────────────────────────────────
 # GRADIO UI
 # ─────────────────────────────────────────────────────────────
 CSS = """
-/* hide empty chatbot SVG placeholder */
-.empty.svelte-byatnx { display: none !important; }
-.wrap.svelte-byatnx  { min-height: 20px !important; }
 #stats {
     background: #0f172a;
     color: #94a3b8;
@@ -146,31 +168,18 @@ CSS = """
     line-height: 1.7;
     margin-bottom: 8px;
 }
-#chatbot .message {
-    font-size: 0.95rem;
-    line-height: 1.5;
-}
-/* full-width send on mobile */
-@media (max-width: 600px) {
-    #send-btn { width: 100% !important; margin-top: 6px; }
-}
 footer { display: none !important; }
 """
 with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
-    gr.Markdown("## 🧠 Qwen2.5-0.5B · Q4_K_M · CPU")
-    # ── always-visible status bar ────────────────────────────
     stats_md = gr.Markdown(
         value=get_stats_md(),
         elem_id="stats"
     )
-    # ── optional system prompt ───────────────────────────────
     with gr.Accordion("⚙️ System Prompt", open=False):
         system_box = gr.Textbox(
             value="You are a helpful assistant.",
@@ -178,7 +187,6 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
             show_label=False
         )
-    # ── conversation ─────────────────────────────────────────
     chatbot = gr.Chatbot(
         value=[],
         show_label=False,
@@ -187,27 +195,23 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
         bubble_full_width=False
     )
-    # ── input row ────────────────────────────────────────────
     with gr.Row(equal_height=True):
         msg = gr.Textbox(
             placeholder="Type a message…",
             show_label=False,
             scale=9,
             lines=1,
-            max_lines=5,
-            elem_id="msg"
         )
         send_btn = gr.Button(
             "➤",
             variant="primary",
             scale=1,
-            min_width=48,
-            elem_id="send-btn"
         )
     clear = gr.Button("🗑️ Clear", size="sm")
-    # ── wiring ───────────────────────────────────────────────
     def user_turn(message, history):
         return "", history + [[message, ""]]
@@ -261,33 +265,33 @@ def api_chat(req: ChatRequest):
         raise HTTPException(status_code=503, detail=load_status)
     prompt = build_prompt(req.system, [], req.message)
-    result = model(
-        prompt,
-        max_tokens=req.max_tokens,
-        temperature=req.temperature,
-        top_p=0.9,
-        repeat_penalty=1.1,
-        stop=["<|im_end|>", "<|im_start|>"]
     )
-    text = result["choices"][0]["text"].strip()
     return {
-        "response": text,
-        "tokens": result["usage"]["completion_tokens"],
         "process_ram_mb": round(get_process_ram_mb(), 1)
     }
-# ─────────────────────────────────────────────────────────────
-# MOUNT + RUN
-# ─────────────────────────────────────────────────────────────
 # Mount the Gradio Blocks UI (`demo`) on `app` at the root path.
 # NOTE(review): `app` is presumably the FastAPI instance created earlier; its
 # definition is not visible in this diff.
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     # When run as a script: print the endpoints, then serve on all interfaces, port 7860.
-    print("\n🌐 Starting on http://0.0.0.0:7860")
-    print("   UI     → http://0.0.0.0:7860/")
-    print("   API    → POST http://0.0.0.0:7860/chat")
-    print("   Health → GET  http://0.0.0.0:7860/health\n")
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import gradio as gr
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TextIteratorStreamer,
+    BitsAndBytesConfig
+)
 from threading import Thread
 import time
 import psutil
 import os
+import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 import uvicorn
 # ─────────────────────────────────────────────────────────────
 # CONFIGURATION
+# Load the standard Qwen2.5-0.5B-Instruct checkpoint from the HF Hub and
+# quantize it to int8 at load time via bitsandbytes (~500MB) — no GGUF file
+# and no llama.cpp compilation needed
 # ─────────────────────────────────────────────────────────────
 # Hub repo id passed to from_pretrained() for both the tokenizer and the model.
+MODEL_ID    = "Qwen/Qwen2.5-0.5B-Instruct"
 # Both are assigned by load_model(); chat() treats None as "still loading".
 model       = None
+tokenizer   = None
 # Human-readable loading state, shown in the stats panel and printed to stdout.
 load_status = "🔄 Initializing..."
 # Wall-clock reference used to report the total load time.
 load_start  = time.time()
 # ─────────────────────────────────────────────────────────────
+# RAM — process only
 # ─────────────────────────────────────────────────────────────
 def get_process_ram_mb() -> float:
     # Resident set size (RSS) of the current process only, converted from bytes to MiB.
     return psutil.Process(os.getpid()).memory_info().rss / 1024**2
+def get_stats_md(tps=None, tokens=None, elapsed=None) -> str:
     # Build the Markdown status text: the current load_status plus this process's
     # RAM drawn as a 10-cell bar (filled cells capped at 10). When `tps` is given,
     # a third line with speed / token count / elapsed time is appended.
     # NOTE(review): only `tps` is checked for None; calling with tps set but
     # elapsed left as None would make `{elapsed:.1f}` raise TypeError.
     mb     = get_process_ram_mb()
+    filled = min(int(mb / 150), 10)   # 1 block per 150MB
     bar    = "█" * filled + "░" * (10 - filled)
     # The two trailing spaces before "\n" force a Markdown line break.
+    line1  = f"**Status:** {load_status}  \n"
+    line2  = f"**RAM:** `[{bar}]` **{mb:.0f} MB**"
+    if tps is not None:
+        line2 += (
+            f"  \n**Speed:** {tps:.1f} t/s · "
+            f"**Tokens:** {tokens} · "
+            f"**Elapsed:** {elapsed:.1f}s"
+        )
+    return line1 + line2
 # ─────────────────────────────────────────────────────────────
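+# MODEL LOADING — int8 quantization via bitsandbytes
+# No compilation, installs in seconds, stays ~450-500MB RAM
+# NOTE(review): bitsandbytes 8-bit loading has historically required a CUDA GPU;
+# confirm the installed version supports device_map="cpu" before relying on this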
 # ─────────────────────────────────────────────────────────────
 def load_model():
+    global model, tokenizer, load_status
     try:
+        load_status = "🔄 Loading tokenizer..."
         print(load_status)
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_ID,
+            trust_remote_code=True
         )
+        load_status = "🔄 Loading model (int8 quantized)..."
         print(load_status)
+        quant_config = BitsAndBytesConfig(load_in_8bit=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            quantization_config=quant_config,
+            device_map="cpu",
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
         )
+        model.eval()
         elapsed = time.time() - load_start
+        load_status = (
+            f"✅ Ready — "
+            f"{get_process_ram_mb():.0f} MB · "
+            f"{elapsed:.0f}s"
+        )
         print(load_status)
     except Exception as e:
 # ─────────────────────────────────────────────────────────────
+# PROMPT — Qwen2.5 ChatML
 # ─────────────────────────────────────────────────────────────
 def build_prompt(system: str, history: list, user: str) -> str:
     parts = []
 # STREAMING GENERATOR
 # ─────────────────────────────────────────────────────────────
 def chat(message: str, history: list, system_prompt: str):
     # Streaming generator: yields (reply text so far, stats Markdown) tuples.
     # Guard: load_model() has not assigned both globals yet.
+    if model is None or tokenizer is None:
         yield "⏳ Model still loading — please wait.", get_stats_md()
         return
+    prompt = build_prompt(system_prompt, history, message)
+    inputs = tokenizer(prompt, return_tensors="pt")
     # skip_prompt=True keeps the echoed prompt out of the streamed text.
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
     # Keyword arguments for model.generate(); the tokenizer output is unpacked
     # into them and the EOS id is reused as the pad token id.
+    gen_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=512,
+        do_sample=True,
         temperature=0.7,
         top_p=0.9,
+        repetition_penalty=1.1,
+        pad_token_id=tokenizer.eos_token_id
     )
     # generate() blocks until it finishes, so it runs on a worker thread while
     # this generator consumes the streamer.
+    thread = Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+    t0     = time.time()
+    output = ""
+    count  = 0
+    for chunk in streamer:
+        output += chunk
         # NOTE(review): count goes up once per streamer chunk; the streamer emits
         # decoded text pieces that are not necessarily one token each, so the
         # reported "Tokens" and t/s figures are approximate — confirm.
         count  += 1
         elapsed = time.time() - t0
         # Guard against division by zero on the very first chunk.
         tps     = count / elapsed if elapsed > 0 else 0
+        yield output, get_stats_md(tps=tps, tokens=count, elapsed=elapsed)
     # NOTE(review): reached only after the streamer is fully consumed; if the
     # consumer stops iterating early, the worker thread is never joined.
+    thread.join()
 # ─────────────────────────────────────────────────────────────
 # GRADIO UI
 # ─────────────────────────────────────────────────────────────
 CSS = """
 #stats {
     background: #0f172a;
     color: #94a3b8;
     line-height: 1.7;
     margin-bottom: 8px;
 }
 footer { display: none !important; }
 """
 with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
+    gr.Markdown("## 🧠 Qwen2.5-0.5B · int8 · CPU")
     stats_md = gr.Markdown(
         value=get_stats_md(),
         elem_id="stats"
     )
     with gr.Accordion("⚙️ System Prompt", open=False):
         system_box = gr.Textbox(
             value="You are a helpful assistant.",
             show_label=False
         )
     chatbot = gr.Chatbot(
         value=[],
         show_label=False,
         bubble_full_width=False
     )
     with gr.Row(equal_height=True):
         msg = gr.Textbox(
             placeholder="Type a message…",
             show_label=False,
             scale=9,
             lines=1,
+            max_lines=5
         )
         send_btn = gr.Button(
             "➤",
             variant="primary",
             scale=1,
+            min_width=48
         )
     clear = gr.Button("🗑️ Clear", size="sm")
     def user_turn(message, history):
         return "", history + [[message, ""]]
         raise HTTPException(status_code=503, detail=load_status)
     prompt = build_prompt(req.system, [], req.message)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=req.max_tokens,
+            do_sample=req.temperature > 0,
+            temperature=max(req.temperature, 1e-4),
+            top_p=0.9,
+            repetition_penalty=1.1,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    input_length  = inputs.input_ids.shape[1]
+    response_text = tokenizer.decode(
+        outputs[0][input_length:],
+        skip_special_tokens=True
     )
     return {
+        "response": response_text,
+        "tokens": len(outputs[0]) - input_length,
         "process_ram_mb": round(get_process_ram_mb(), 1)
     }
 # Mount the Gradio Blocks UI (`demo`) on `app` at the root path.
 # NOTE(review): `app` is presumably the FastAPI instance created earlier; its
 # definition is not visible in this diff.
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     # When run as a script: print the address, then serve on all interfaces, port 7860.
+    print("\n🌐 http://0.0.0.0:7860")
     uvicorn.run(app, host="0.0.0.0", port=7860)