Qwen3-0.6B / app.py
aal-hawa
edit
9a439fc
import gradio as gr
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# ============================================================
# Qwen3-0.6B – Fast Chat
# ============================================================
# Hugging Face Hub identifier of the checkpoint served by this app.
MODEL_ID = "Qwen/Qwen3-0.6B"

# Lazily-populated singletons: both stay None until load_model() runs.
model = tokenizer = None
def load_model():
    """Populate the module-level ``tokenizer`` and ``model`` on first use.

    Calls made after ``model`` has been set are no-ops. The access token is
    read from the ``HF_TOKEN`` environment variable (``None`` when unset).
    The model is loaded in float32, moved to the CPU and put in eval mode.
    """
    global model, tokenizer

    if model is None:
        # Function-scope import: ``os`` is only needed for the token lookup.
        import os

        hf_token = os.environ.get("HF_TOKEN")

        print("Loading Qwen3-0.6B ...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)

        loaded = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=torch.float32,
            low_cpu_mem_usage=True,
            token=hf_token,
        )
        model = loaded.to("cpu")
        model.eval()
        print("Model loaded.")
def strip_thinking(text):
    """Remove ``<think ...>...</think ...>`` reasoning blocks from model output.

    Two cases are handled:

    * complete blocks, which are removed wherever they occur;
    * an unterminated block (an opening ``<think ...>`` tag with no closing
      tag, e.g. when generation stops at the token limit mid-reasoning),
      which is removed from the opening tag to the end of the text.

    Args:
        text: Decoded model output.

    Returns:
        ``text`` without the reasoning blocks, with surrounding whitespace
        stripped.
    """
    # Closed blocks first, non-greedy, so text between two separate blocks
    # is preserved. DOTALL lets '.' span the newlines inside a block.
    without_closed = re.sub(
        r'<think[^>]*>.*?</think[^>]*>', '', text, flags=re.DOTALL
    )
    # Any opener still present has no matching closer: everything after it
    # is partial reasoning, so drop it through to the end of the text.
    without_open = re.sub(
        r'<think[^>]*>.*', '', without_closed, flags=re.DOTALL
    )
    return without_open.strip()
def normalize_content(msg):
    """Return ``msg`` with list-valued ``content`` flattened to one string.

    When ``msg["content"]`` is a list, the ``"text"`` value of each dict
    item that has one and each plain string item are joined with single
    spaces; any other item is skipped. The result is a new dict holding only
    ``role`` and the joined ``content``. A message whose content is not a
    list is returned as the same object, unchanged.
    """
    content = msg.get("content")
    if not isinstance(content, list):
        return msg

    fragments = [
        piece["text"] if isinstance(piece, dict) else piece
        for piece in content
        if (isinstance(piece, dict) and "text" in piece) or isinstance(piece, str)
    ]
    return {"role": msg["role"], "content": " ".join(fragments)}
def chat_response(message, history):
    """Generate the assistant's reply and return the updated conversation.

    Args:
        message: Text the user just submitted.
        history: Prior conversation as a list of message dicts with
            ``role`` and ``content`` keys. ``None`` is treated as an empty
            conversation.

    Returns:
        The normalized history followed by the new user message and the
        assistant's reply. When ``message`` is ``None``, empty or
        whitespace-only, the history is returned unchanged and the model is
        neither loaded nor run.
    """
    history = history or []

    # A blank submission (e.g. Enter on an empty textbox) has nothing to
    # answer; skip the model load and the 512-token CPU generation.
    if message is None or not message.strip():
        return history

    load_model()

    # Normalize history: convert any list content to plain strings
    clean_history = [normalize_content(m) for m in history]
    messages = clean_history + [{"role": "user", "content": message}]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # disable thinking mode
    )
    inputs = tokenizer(text, return_tensors="pt").to("cpu")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Drop the leading prompt tokens so only the newly generated ones are decoded.
    output_ids = generated_ids[0][len(inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Fallback: strip any <think...> blocks if they still appear
    response = strip_thinking(response)

    return messages + [{"role": "assistant", "content": response}]
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="Qwen3-0.6B Fast Chat") as demo:
    # Page header.
    gr.Markdown("""
# ⚡ Qwen3-0.6B – Fast Chat
Small and fast model. Great for quick answers on CPU.
""")

    # Widgets, created in display order.
    chatbot = gr.Chatbot(label="Conversation")
    msg = gr.Textbox(label="Your Message", placeholder="Type your message and press Enter...")
    clear = gr.Button("Clear Conversation")

    # Submitting the textbox runs the model and refreshes the chat window.
    reply_event = msg.submit(
        fn=chat_response,
        inputs=[msg, chatbot],
        outputs=chatbot,
        concurrency_limit=3,
    )
    # After the reply is shown, empty the textbox.
    reply_event.then(fn=lambda: "", inputs=None, outputs=msg)

    # The clear button resets the conversation to an empty list.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)
if __name__ == "__main__":
    # Enable queuing (default of 3 concurrent runs per event), then serve
    # on all network interfaces.
    demo.queue(default_concurrency_limit=3).launch(server_name="0.0.0.0")