import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
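# Public instruct model from the Hub; HF_TOKEN is only needed for gated or private repos.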
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN", None)
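# Quiet the tokenizers fork warning and leave one CPU core free for the web server.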
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))
SYSTEM_PROMPT = ""  # optionally set a system persona here; must be a string for the chat template
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
)
# Some tokenizers define no pad token; fall back to EOS so generate() can pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model loaded.")
def respond(message, history):
    # Rebuild the full conversation: system prompt, then alternating past turns.
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Render with the model's chat template, ending with the assistant header.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply
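# Wire the handler into a ready-made chat UI; examples seed the chat with one click.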
demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B CPU Chat",
    description="Directly loads the model from Hugging Face Hub. No custom model upload needed.",
    examples=[
        "Explain black holes in simple words.",
        "Write a cinematic image prompt for a medieval knight in a storm.",
        "Set a timer for 10 minutes because pizza is baking.",
    ],
)
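# queue() makes concurrent requests wait their turn rather than contend for the one CPU model.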
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)