"""Gradio chat UI that streams replies from Qwen2.5-0.5B-Instruct on CPU.

The model is loaded on a background thread at import time so the web UI can
come up immediately; chat requests made before loading finishes get a
placeholder reply instead of blocking.
"""

import os
import time
from threading import Thread

import gradio as gr
import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# RAM gauge geometry: the bar has BAR_WIDTH cells, each standing for
# MB_PER_CELL megabytes of resident memory (the bar saturates at 1500 MB).
BAR_WIDTH = 10
MB_PER_CELL = 150

# Shared state written by the loader thread and read by the request handlers.
model = None
tokenizer = None
load_status = "🔄 Initializing..."
load_failed = False
load_start = time.time()


def get_ram_mb() -> float:
    """Return the resident set size of the current process in megabytes."""
    return psutil.Process(os.getpid()).memory_info().rss / 1024**2


def get_stats_md(tps=None, tokens=None, elapsed=None) -> str:
    """Build the Markdown status panel.

    Always reports the loader status and a RAM gauge. A speed line is
    appended only when ``tps``, ``tokens`` and ``elapsed`` are all provided,
    so a partial call cannot fail on formatting ``None``.
    """
    mb = get_ram_mb()
    filled = min(int(mb / MB_PER_CELL), BAR_WIDTH)
    bar = "█" * filled + "░" * (BAR_WIDTH - filled)
    # Two trailing spaces before "\n" are a Markdown hard line break.
    s = f"**Status:** {load_status}  \n**RAM:** `[{bar}]` **{mb:.0f} MB**"
    if tps is not None and tokens is not None and elapsed is not None:
        s += (
            f"  \n**Speed:** {tps:.1f} t/s · **Tokens:** {tokens}"
            f" · **Elapsed:** {elapsed:.1f}s"
        )
    return s


def load_model():
    """Load tokenizer and model weights, publishing progress in ``load_status``.

    Runs on a background thread. On failure the error text is stored in
    ``load_status`` and ``load_failed`` is set; ``model`` / ``tokenizer`` stay
    ``None``. The globals are only assigned after both objects loaded, so
    request handlers never observe a half-initialised pair.
    """
    global model, tokenizer, load_status, load_failed
    try:
        load_status = "🔄 Loading tokenizer..."
        print(load_status)
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        load_status = "🔄 Loading model weights..."
        print(load_status)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        loaded_model.eval()
    except Exception as e:  # top-level boundary of the loader thread
        load_failed = True
        load_status = f"❌ {e}"
        print(load_status)
        return

    tokenizer = loaded_tokenizer
    model = loaded_model
    elapsed = time.time() - load_start
    load_status = f"✅ Ready — {get_ram_mb():.0f} MB · {elapsed:.0f}s"
    print(load_status)


Thread(target=load_model, daemon=True).start()


def chat(message: str, prior_messages: list, system_prompt: str):
    """Stream a reply to ``message``.

    Args:
        message: The new user message.
        prior_messages: Earlier turns as ``{"role": ..., "content": ...}``
            dicts (OpenAI-style), oldest first.
        system_prompt: Optional system prompt; ignored when blank.

    Yields:
        ``(text_so_far, stats_markdown)`` tuples, one per streamed chunk.

    Raises:
        gr.Error: If generation fails in the worker thread.
    """
    if model is None or tokenizer is None:
        if load_failed:
            # Loading will never finish; surface the stored error instead of
            # telling the user to keep waiting.
            yield f"Model failed to load: {load_status}", get_stats_md()
        else:
            yield "⏳ Still loading...", get_stats_md()
        return

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    # Keep only the fields the chat template needs; the UI layer may attach
    # extra keys to each entry (presumably metadata - harmless to drop).
    messages.extend(
        {"role": m["role"], "content": m["content"]} for m in prior_messages
    )
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    errors = []

    def _generate():
        """Run generation; on failure, record the error and end the stream."""
        try:
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        except Exception as exc:
            errors.append(exc)
            # Without this the consumer loop below would block forever,
            # because the stream's end signal is never queued.
            streamer.end()

    Thread(target=_generate, daemon=True).start()

    t0 = time.time()
    output = ""
    # NOTE(review): this counts streamed text chunks, which may not map 1:1
    # to model tokens - confirm before relying on the "t/s" figure.
    count = 0
    for chunk in streamer:
        output += chunk
        count += 1
        elapsed = time.time() - t0
        yield output, get_stats_md(
            tps=count / elapsed if elapsed > 0 else 0,
            tokens=count,
            elapsed=elapsed,
        )

    if errors:
        raise gr.Error(f"Generation failed: {errors[0]}") from errors[0]


def user_turn(message, history):
    """Append the user's message to the history and clear the input box.

    Blank messages are ignored so they do not trigger a generation.
    Returns ``("", history)``: the new textbox value and the updated history.
    """
    history = history or []
    if not (message or "").strip():
        return "", history
    history.append({"role": "user", "content": message})
    return "", history


def bot_turn(history, system):
    """Stream the assistant reply for the last user message in ``history``.

    Yields ``(history, stats_markdown)`` after each chunk, with the reply
    accumulated in a trailing assistant entry.
    """
    # Nothing to answer when the preceding step added no user message
    # (e.g. a blank submission).
    if not history or history[-1]["role"] != "user":
        yield history, get_stats_md()
        return

    user_msg = history[-1]["content"]
    prior_history = history[:-1]  # everything except the just-added user message

    # Pre-allocate the assistant entry so the UI has a slot to stream into.
    history.append({"role": "assistant", "content": ""})

    for text, stats in chat(user_msg, prior_history, system):
        history[-1]["content"] = text
        yield history, stats


with gr.Blocks(title="Qwen 0.5B") as demo:
    gr.Markdown("## 🧠 Qwen2.5-0.5B · CPU")
    stats_md = gr.Markdown(value=get_stats_md())

    with gr.Accordion("⚙️ System Prompt", open=False):
        system_box = gr.Textbox(
            value="You are a helpful assistant.",
            lines=3,
            show_label=False,
        )

    # type="messages" makes the chatbot exchange role/content dicts, which is
    # the shape user_turn / bot_turn read and write.
    chatbot = gr.Chatbot(value=[], type="messages", show_label=False, height=400)

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Type a message…",
            show_label=False,
            scale=9,
            lines=1,
        )
        send_btn = gr.Button("➀", variant="primary", scale=1)

    clear = gr.Button("🗑️ Clear")

    # Enter in the textbox and the send button share the same two-step flow:
    # record the user message immediately, then stream the reply.
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_turn, [msg, chatbot], [msg, chatbot], queue=False
        ).then(
            bot_turn, [chatbot, system_box], [chatbot, stats_md]
        )

    clear.click(lambda: ([], ""), outputs=[chatbot, msg], queue=False)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)