import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc
import os
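
# CPU-only tuning for the free Spaces tier: silence tokenizer fork warnings,
# match the thread count to the 2 available vCPUs, and run inference-only
# (no autograd).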
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(2)
torch.set_grad_enabled(False)
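
# Qwen2.5-0.5B-Instruct is small enough to run in float32 on a CPU-only
# Space while staying well inside 16 GB of RAM.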
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_INPUT_TOKENS = 2000 # hard cap on input context
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
print("Loading model…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.float32,  # NOTE: called `torch_dtype` on older transformers releases
    device_map="cpu",
    low_cpu_mem_usage=True,
)
model.eval()
gc.collect()
print("Model ready ✅")
def parse_history(history: list) -> list:
    """Normalise Gradio chat history into a flat list of role/content dicts."""
    messages = []
    for item in history:
        if isinstance(item, dict):
            # Messages format: {"role": ..., "content": ...}
            role = item.get("role", "")
            content = item.get("content", "")
            if role and content:
                messages.append({"role": role, "content": str(content)})
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            # Legacy tuple format: (user_message, assistant_message)
            human, assistant = item
            if human:
                messages.append({"role": "user", "content": str(human)})
            if assistant:
                messages.append({"role": "assistant", "content": str(assistant)})
    return messages
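
# Note: each trim iteration below re-applies the chat template and
# re-tokenises the prompt; that extra work is negligible at a 2000-token cap.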
def truncate_to_limit(messages: list, user_msg: str, max_tokens: int) -> list:
    """
    Trim the oldest history turns until the full prompt fits within max_tokens.
    Always keeps the latest user message, even if it alone exceeds the limit.
    """
    while True:
        full_messages = messages + [{"role": "user", "content": user_msg}]
        prompt = tokenizer.apply_chat_template(
            full_messages, tokenize=False, add_generation_prompt=True
        )
        token_count = len(tokenizer(prompt, return_tensors="pt")["input_ids"][0])
        if token_count <= max_tokens or len(messages) == 0:
            return full_messages  # fits, or nothing left to trim
        # Drop the oldest turn: history is normalised to dicts by
        # parse_history(), so one turn is a user/assistant pair (2 items).
        messages = messages[2:]
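
# ChatInterface calls fn(message, history, *additional_inputs), so the two
# sliders defined below arrive here as max_new_tokens and temperature.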
def generate(
    user_msg: str,
    history: list,
    max_new_tokens: int,
    temperature: float,
):
    if not user_msg.strip():
        return ""

    messages = parse_history(history)
    full_messages = truncate_to_limit(messages, user_msg.strip(), MAX_INPUT_TOKENS)
    prompt = tokenizer.apply_chat_template(
        full_messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    inp_len = inputs["input_ids"].shape[-1]
    print(f"[gen] input_tokens={inp_len} max_new_tokens={max_new_tokens} temp={temperature}")

    # Pass sampling knobs only when sampling is enabled; transformers warns
    # when temperature/top_p are set alongside greedy decoding.
    do_sample = temperature > 0
    sampling_kwargs = {"temperature": temperature, "top_p": 0.9} if do_sample else {}

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            **sampling_kwargs,
            repetition_penalty=1.15,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens, skipping the prompt.
    reply = tokenizer.decode(
        output_ids[0][inp_len:], skip_special_tokens=True
    ).strip()
    return reply
demo = gr.ChatInterface(
    fn=generate,
    title="🤖 Qwen2.5-0.5B · CPU Chat",
    description=(
        "Optimised for HF Spaces free tier — 16 GB RAM / 2 vCPU\n"
        f"📥 Max input: **{MAX_INPUT_TOKENS} tokens** (oldest history auto-trimmed)"
    ),
    additional_inputs=[
        gr.Slider(
            minimum=64,
            maximum=2048,  # ✅ user picks how long the reply can be
            value=512,
            step=64,
            label="Max output tokens",
            info="Higher = longer replies but slower on CPU",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.7,
            step=0.05,
            label="Temperature",
            info="0 = focused/deterministic | 1.5 = creative/random",
        ),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Generation settings", open=False),
)
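
# On HF Spaces, Gradio picks up the server host/port from the environment,
# so launch() needs no extra arguments.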
demo.launch()