Spaces:
Running
Running
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer | |
| from threading import Thread | |
| import time | |
| import psutil | |
| import os | |
| import torch | |
# Hugging Face model identifier passed to the tokenizer/model loaders.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Filled in by load_model(); both stay None until loading has progressed.
model = tokenizer = None

# Human-readable loading state shown in the stats panel.
load_status = "🔄 Initializing..."

# Wall-clock reference used to report how long loading took.
load_start = time.time()
def get_ram_mb() -> float:
    """Return the resident set size (RSS) of this process in mebibytes."""
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024**2
def get_stats_md(tps=None, tokens=None, elapsed=None) -> str:
    """Build the Markdown text for the status panel.

    The first part always shows the current ``load_status`` and a 10-cell
    RAM bar (one cell per 150 MB of process RSS, capped at 10 cells)
    followed by the RSS in MB. When ``tps`` is given, a second part with
    generation metrics is appended.

    Args:
        tps: Generation speed in tokens per second, or None to omit the
            metrics part entirely.
        tokens: Number of tokens produced so far; omitted when None.
        elapsed: Seconds since generation started; omitted when None.

    Returns:
        The Markdown string to display.
    """
    mb = get_ram_mb()
    filled = min(int(mb / 150), 10)
    bar = "█" * filled + "░" * (10 - filled)
    # NOTE(review): a Markdown hard line break needs two trailing spaces
    # before the newline; confirm the separator below was not collapsed.
    s = f"**Status:** {load_status} \n**RAM:** `[{bar}]` **{mb:.0f} MB**"
    if tps is not None:
        # Render only the metrics that were actually supplied so a partial
        # call cannot fail on formatting None with ".1f".
        metrics = [f"**Speed:** {tps:.1f} t/s"]
        if tokens is not None:
            metrics.append(f"**Tokens:** {tokens}")
        if elapsed is not None:
            metrics.append(f"**Elapsed:** {elapsed:.1f}s")
        s += " \n" + " · ".join(metrics)
    return s
def load_model():
    """Load the tokenizer and model for ``MODEL_ID`` into the module globals.

    Progress, success and failure are all reported by updating the
    module-level ``load_status`` string and printing it. Any exception
    raised while loading is caught and its message becomes the status.
    """
    global model, tokenizer, load_status

    def _announce(text):
        # Publish the new status for the UI and echo it to stdout.
        global load_status
        load_status = text
        print(load_status)

    try:
        _announce("🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        _announce("🔄 Loading model weights...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        model.eval()

        took = time.time() - load_start
        _announce(f"✅ Ready — {get_ram_mb():.0f} MB · {took:.0f}s")
    except Exception as e:
        _announce(f"❌ {e}")
# Start loading in a daemon thread as soon as the module is executed;
# chat() answers "Still loading" until model and tokenizer are set.
_loader_thread = Thread(target=load_model, daemon=True)
_loader_thread.start()
def chat(message: str, prior_messages: list, system_prompt: str):
    """Stream an assistant reply for ``message``.

    Args:
        message: The new user message.
        prior_messages: Earlier conversation turns as a list of
            ``{"role": ..., "content": ...}`` dicts.
        system_prompt: Optional system prompt; ignored when blank.

    Yields:
        ``(text, stats_markdown)`` tuples, where ``text`` is the reply
        accumulated so far and ``stats_markdown`` comes from
        ``get_stats_md``. If the model is not loaded yet, a single
        "still loading" tuple is yielded. If generation fails, a final
        tuple carrying the error message is yielded.
    """
    if model is None or tokenizer is None:
        yield "⏳ Still loading...", get_stats_md()
        return

    # History is already a list of role/content dicts: prepend the system
    # prompt (if any) and append the new user message.
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    messages.extend(prior_messages)
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    failure = []

    def _generate():
        # Runs in a worker thread. If generate() raises, the streamer would
        # never receive its end signal and the consumer loop below would
        # block forever, so end the stream explicitly before re-raising.
        try:
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        except Exception as exc:
            failure.append(exc)
            streamer.end()
            raise  # keep the traceback visible via the thread's excepthook

    Thread(target=_generate).start()

    t0 = time.time()
    output = ""
    count = 0
    # NOTE(review): the streamer yields decoded text chunks, so `count` is a
    # chunk count reported as "tokens" — confirm whether exact counts matter.
    for chunk in streamer:
        output += chunk
        count += 1
        elapsed = time.time() - t0
        yield output, get_stats_md(
            tps=count / elapsed if elapsed > 0 else 0,
            tokens=count,
            elapsed=elapsed,
        )

    if failure:
        note = f"❌ Generation failed: {failure[0]}"
        yield (f"{output}\n\n{note}" if output else note), get_stats_md()
def user_turn(message, history):
    """Record the user's message and clear the input box.

    Appends a ``{"role": "user", ...}`` entry to ``history`` in place and
    returns an empty string (the new textbox value) with that same list.
    """
    user_entry = {"role": "user", "content": message}
    history.append(user_entry)
    cleared_input = ""
    return cleared_input, history
def bot_turn(history, system):
    """Stream the assistant's answer to the last message in ``history``.

    The last entry is taken as the pending user message and everything
    before it as context. An empty assistant entry is appended and its
    content is overwritten with each partial reply from ``chat``.

    Yields:
        ``(history, stats_markdown)`` after every streamed update.
    """
    question = history[-1]["content"]
    context = history[:-1]  # all turns before the pending user message

    # Placeholder the streamed text is written into.
    reply = {"role": "assistant", "content": ""}
    history.append(reply)

    for partial, stats_markdown in chat(question, context, system):
        reply["content"] = partial
        yield history, stats_markdown
# Build the UI: header, stats panel, collapsible system prompt, chat window,
# input row and a clear button, then wire up the events.
with gr.Blocks(title="Qwen 0.5B") as demo:
    gr.Markdown("## 🧠 Qwen2.5-0.5B · CPU")
    stats_md = gr.Markdown(value=get_stats_md())

    with gr.Accordion("⚙️ System Prompt", open=False):
        system_box = gr.Textbox(
            value="You are a helpful assistant.",
            lines=3,
            show_label=False,
        )

    # type="messages" makes the chat history a list of role/content dicts.
    chatbot = gr.Chatbot(
        value=[],
        type="messages",
        show_label=False,
        height=400,
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Type a message…",
            show_label=False,
            scale=9,
            lines=1,
        )
        send_btn = gr.Button("➤", variant="primary", scale=1)

    # NOTE(review): the source's indentation was lost; the Clear button is
    # assumed to sit below the input row rather than inside it — confirm.
    clear = gr.Button("🗑️ Clear")

    # Pressing Enter and clicking send do the same thing: record the user
    # message first, then stream the bot reply.
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_turn,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot],
            queue=False,
        ).then(
            bot_turn,
            inputs=[chatbot, system_box],
            outputs=[chatbot, stats_md],
        )

    # Reset both the conversation and the input box.
    clear.click(lambda: ([], ""), outputs=[chatbot, msg], queue=False)
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 when run as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)