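# Minimal Gradio chat app: runs Qwen2.5-1.5B-Instruct on CPU via transformers.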
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN", None)

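# Keep CPU inference predictable: silence tokenizer parallelism warnings and
# leave one core free for the web server.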
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))

SYSTEM_PROMPT = ""  # optional system instructions for the assistant; left empty here

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
)

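# Fall back to the EOS token if the tokenizer does not define a pad token.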
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)

model.eval()
print("Model loaded.")

def respond(message, history):
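    # Rebuild the full conversation: system prompt, prior turns, then the new message.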
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

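    # history arrives as (user, assistant) pairs (Gradio's tuple-style chat history).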
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

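    # Render the conversation with the model's chat template into a single prompt string.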
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(prompt, return_tensors="pt")

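    # Sample a reply; inference_mode skips autograd bookkeeping to save memory.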
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

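    # Decode only the newly generated tokens, not the echoed prompt.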
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply

demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B CPU Chat",
    description="Loads the model directly from the Hugging Face Hub; no custom model upload needed.",
    examples=[
        "Explain black holes in simple words.",
        "Write a cinematic image prompt for a medieval knight in a storm.",
        "Set a timer for 10 minutes because pizza is baking.",
    ],
)

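# queue() lets Gradio queue concurrent requests rather than running them all at once on the CPU-bound model.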
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)