import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Keep tokenizer workers quiet and leave one CPU core free for the server.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))

# System prompt prepended to every conversation; left empty here so the model
# falls back to its default behavior. Fill in instructions as needed.
SYSTEM_PROMPT = ""

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model loaded.")


def respond(message, history):
    # Rebuild the conversation in the chat-template message format.
    # `history` is a list of (user, assistant) pairs, Gradio's tuples format.
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply


demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B CPU Chat",
    description="Directly loads the model from the Hugging Face Hub. No custom model upload needed.",
    examples=[
        "Explain black holes in simple words.",
        "Write a cinematic image prompt for a medieval knight in a storm.",
        "Set a timer for 10 minutes because pizza is baking.",
    ],
)

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)