| import os |
| import torch |
| import gradio as gr |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
# Hub model identifier: a small instruct-tuned model that is practical for CPU inference.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Optional Hugging Face access token (None = anonymous access; only needed for gated repos).
HF_TOKEN = os.getenv("HF_TOKEN", None)


# Silence the fork-safety warning emitted by the Rust `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Leave one CPU core free so the web server stays responsive while generating.
torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))
|
|
| SYSTEM_PROMPT = () |
|
|
print("Loading tokenizer...")
# Fast (Rust-backed) tokenizer for MODEL_ID; the token is only required for gated repos.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
)


# Some causal-LM tokenizers ship without a pad token; reuse EOS so padding-related
# arguments (e.g. pad_token_id passed to generate) have a valid id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
|
print("Loading model...")
# Load weights onto CPU in full float32. low_cpu_mem_usage streams the checkpoint
# in to avoid transiently holding two full copies of the weights in RAM.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)


# Inference only: disables dropout and other train-time behavior.
model.eval()
print("Model loaded.")
|
|
def respond(message, history):
    """Generate one assistant reply for *message* given the chat *history*.

    Args:
        message: The latest user message as a plain string.
        history: Prior turns from gr.ChatInterface. Accepts both the legacy
            list-of-``(user_msg, assistant_msg)`` tuples and the newer
            "messages" format (list of ``{"role", "content"}`` dicts).

    Returns:
        The assistant's reply as a stripped string (special tokens removed).
    """
    messages = []
    # Only emit a system turn when a non-empty prompt is configured; an empty
    # system message would just waste context tokens.
    if SYSTEM_PROMPT:
        messages.append({"role": "system", "content": SYSTEM_PROMPT})

    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: already role/content dicts.
            if turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # Legacy tuple format: (user_msg, assistant_msg); either side may be
            # empty/None (e.g. the turn in progress), so skip blanks.
            user_msg, assistant_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template, ending with the
    # generation prompt so the model continues as the assistant.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Keep input tensors on the model's device (CPU here, but this stays
    # correct if device_map ever changes).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
|
|
# Chat UI: Gradio wires `respond` into a chatbox and manages the history for us.
demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B CPU Chat",
    description="Directly loads the model from Hugging Face Hub. No custom model upload needed.",
    examples=[
        "Explain black holes in simple words.",
        "Write a cinematic image prompt for a medieval knight in a storm.",
        "Set a timer for 10 minutes because pizza is baking.",
    ],
)
|
|
if __name__ == "__main__":
    # queue() serializes generation requests (one at a time on CPU); bind on all
    # interfaces with the conventional Gradio port for container/Space deployments.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)