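"""Gradio chat app serving Qwen/Qwen2.5-1.5B-Instruct on CPU, loaded
directly from the Hugging Face Hub."""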
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN", None)
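# Quiet the fast tokenizer's fork warnings and leave one core free so the
# Gradio server stays responsive while the model is generating.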
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))
# Optional system prompt; must be a string (not a tuple) for the chat template.
SYSTEM_PROMPT = ""
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model loaded.")
def respond(message, history):
    # Rebuild the full conversation for the chat template; skip the system
    # turn when SYSTEM_PROMPT is empty so Qwen's built-in template default applies.
    messages = []
    if SYSTEM_PROMPT:
        messages.append({"role": "system", "content": SYSTEM_PROMPT})
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply
demo = gr.ChatInterface(
    fn=respond,
    title="Qwen2.5-1.5B CPU Chat",
    description="Loads the model directly from the Hugging Face Hub. No custom model upload needed.",
    examples=[
        "Explain black holes in simple words.",
        "Write a cinematic image prompt for a medieval knight in a storm.",
        "Set a timer for 10 minutes because pizza is baking.",
    ],
)
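# queue() serializes requests under Gradio's default concurrency limit, so the
# single CPU-bound model isn't hit in parallel; 7860 is the port Hugging Face
# Spaces expects.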
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)