# Small_llm / app(LLM).py
# everydaytok's picture
# Rename app.py to app(LLM).py
# a5c7311 verified
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import time
import psutil
import os
import torch
# Hugging Face Hub identifier of the checkpoint this app serves.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Reference timestamp used by load_model() to report how long loading took.
load_start = time.time()

# Human-readable loading state; overwritten by load_model() and rendered by
# get_stats_md().
load_status = "🔄 Initializing..."

# Both are assigned by load_model(); chat() treats None as "not ready yet".
tokenizer = None
model = None
def get_ram_mb() -> float:
    """Return the resident set size (RSS) of this process in MiB."""
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    return rss_bytes / (1024 * 1024)
def get_stats_md(tps=None, tokens=None, elapsed=None) -> str:
    """Build the Markdown shown in the stats panel.

    The first line reports the current ``load_status`` and the process RAM,
    drawn as a 10-cell bar with one cell per 150 MB (capped at 10 cells).
    A second line with generation figures is added only when ``tps`` is given.

    Args:
        tps: Generation speed to display, or ``None`` to omit the speed line.
        tokens: Count shown next to the speed.
        elapsed: Seconds shown next to the speed.

    Returns:
        The Markdown string.
    """
    ram_mb = get_ram_mb()
    lit_cells = min(int(ram_mb / 150), 10)
    gauge = "█" * lit_cells + "░" * (10 - lit_cells)
    summary = f"**Status:** {load_status} \n**RAM:** `[{gauge}]` **{ram_mb:.0f} MB**"

    # Without a speed figure there is nothing more to report.
    if tps is None:
        return summary

    speed_line = f" \n**Speed:** {tps:.1f} t/s · **Tokens:** {tokens} · **Elapsed:** {elapsed:.1f}s"
    return summary + speed_line
def load_model():
    """Load the tokenizer and model for ``MODEL_ID`` into the module globals.

    Progress, success and failure are all reported by updating the global
    ``load_status`` and printing the same text. Any exception raised while
    loading is caught and recorded in ``load_status`` instead of propagating.
    """
    global model, tokenizer

    def announce(text):
        # Publish the status for the stats panel and mirror it to stdout.
        global load_status
        load_status = text
        print(load_status)

    try:
        announce("🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        announce("🔄 Loading model weights...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        model.eval()

        took = time.time() - load_start
        announce(f"✅ Ready — {get_ram_mb():.0f} MB · {took:.0f}s")
    except Exception as exc:
        announce(f"❌ {exc}")
# Load the model on a daemon thread so the rest of the module keeps executing
# while the weights are being read.
_loader_thread = Thread(target=load_model, daemon=True)
_loader_thread.start()
def chat(message: str, prior_messages: list, system_prompt: str):
    """Stream the assistant's reply to ``message`` together with live stats.

    Args:
        message: The newest user message.
        prior_messages: Earlier turns as ``{"role": ..., "content": ...}``
            dicts, placed between the system prompt and the new message.
        system_prompt: Optional system instruction; skipped when blank.

    Yields:
        ``(text_so_far, stats_markdown)`` tuples, one per chunk delivered by
        the streamer. If the model or tokenizer is not loaded yet, a single
        placeholder tuple is yielded. If generation fails, a final tuple whose
        text ends with the error message is yielded.
    """
    if model is None or tokenizer is None:
        yield "⏳ Still loading...", get_stats_md()
        return

    # History is already in OpenAI dict format: prepend system, append user.
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    messages.extend(prior_messages)
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )

    errors = []

    def run_generation():
        # generate() closes the streamer only when it finishes normally. If it
        # raises, close the streamer here so the consumer loop below cannot
        # block forever waiting for a chunk that will never arrive.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            print(f"❌ Generation failed: {exc}")
            streamer.end()

    # Daemon, like the loader thread, so a running generation does not keep
    # the interpreter alive on shutdown.
    Thread(target=run_generation, daemon=True).start()

    t0 = time.time()
    output = ""
    # NOTE(review): this counts chunks delivered by the streamer, which the
    # stats panel labels as tokens; a chunk may not be exactly one token.
    count = 0
    for chunk in streamer:
        output += chunk
        count += 1
        elapsed = time.time() - t0
        yield output, get_stats_md(
            tps=count / elapsed if elapsed > 0 else 0,
            tokens=count,
            elapsed=elapsed,
        )

    if errors:
        # Surface the failure in the chat instead of ending silently.
        failure_note = f"❌ {errors[0]}"
        final_text = f"{output}\n\n{failure_note}" if output else failure_note
        yield final_text, get_stats_md()
def user_turn(message, history):
    """Record the user's message in the chat history and clear the input box.

    Args:
        message: Text taken from the input textbox.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            it is extended in place.

    Returns:
        A ``(textbox_value, history)`` tuple where ``textbox_value`` is the
        empty string and ``history`` is the same list object that was passed.
    """
    user_entry = {"role": "user", "content": message}
    history.append(user_entry)
    cleared_textbox = ""
    return cleared_textbox, history
def bot_turn(history, system):
    """Stream the assistant's answer to the last message in ``history``.

    The last entry of ``history`` is taken as the pending user message and
    everything before it as context. An empty assistant entry is appended and
    then overwritten with each partial reply produced by ``chat``.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            it is modified in place.
        system: System prompt text forwarded to ``chat``.

    Yields:
        ``(history, stats)`` after every partial reply.
    """
    pending_text = history[-1]["content"]
    # Slice before appending so the context excludes both the pending user
    # message and the assistant placeholder.
    earlier_turns = history[:-1]

    # Placeholder entry that receives the streamed text.
    placeholder = {"role": "assistant", "content": ""}
    history.append(placeholder)

    for partial_reply, stats in chat(pending_text, earlier_turns, system):
        history[-1]["content"] = partial_reply
        yield history, stats
# Gradio interface: a title, a stats panel, a collapsible system-prompt box,
# the chat window, and an input row with a send button.
with gr.Blocks(title="Qwen 0.5B") as demo:
    gr.Markdown("## 🧠 Qwen2.5-0.5B · CPU")
    # Stats panel; also an output of bot_turn, so it updates while streaming.
    stats_md = gr.Markdown(value=get_stats_md())
    with gr.Accordion("⚙️ System Prompt", open=False):
        system_box = gr.Textbox(
            value="You are a helpful assistant.",
            lines=3,
            show_label=False
        )
    # type="messages" makes the history a list of {"role", "content"} dicts,
    # which is the format user_turn and bot_turn read and write.
    chatbot = gr.Chatbot(value=[], type="messages", show_label=False, height=400)
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Type a message…",
            show_label=False,
            scale=9,
            lines=1
        )
        send_btn = gr.Button("➤", variant="primary", scale=1)
    # NOTE(review): the button is assumed to sit below the input row; the
    # original nesting is not recoverable from this view — confirm.
    clear = gr.Button("🗑️ Clear")
    # Pressing Enter and clicking the send button run the same two steps:
    # user_turn records the message and clears the box, then bot_turn streams
    # the reply into the chat and the stats panel.
    for trigger in [msg.submit, send_btn.click]:
        trigger(
            user_turn, [msg, chatbot], [msg, chatbot], queue=False
        ).then(
            bot_turn, [chatbot, system_box], [chatbot, stats_md]
        )
    # Reset both the conversation and the input box.
    clear.click(lambda: ([], ""), outputs=[chatbot, msg], queue=False)
if __name__ == "__main__":
    # Listen on all network interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)