Spaces:

recentechstudio
/

Qwen3-0.6B

Sleeping

aal-hawa

edit

9a439fc about 1 month ago

3.23 kB

	import gradio as gr
	import torch
	import re
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# ============================================================
	# Qwen3-0.6B – Fast Chat
	# ============================================================
	MODEL_ID = "Qwen/Qwen3-0.6B"
	model = None
	tokenizer = None

	def load_model():
	    """Populate the module-level ``model`` and ``tokenizer`` on first use.

	    Does nothing when ``model`` has already been set. Otherwise reads the
	    optional ``HF_TOKEN`` environment variable, passes it to both
	    ``from_pretrained`` calls, loads the weights as float32, moves the model
	    to the CPU and switches it to eval mode.
	    """
	    global model, tokenizer

	    # Only the very first call does any work; later calls fall straight through.
	    if model is None:
	        import os

	        hf_token = os.getenv("HF_TOKEN", None)

	        print("Loading Qwen3-0.6B ...")
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
	        loaded = AutoModelForCausalLM.from_pretrained(
	            MODEL_ID,
	            dtype=torch.float32,
	            low_cpu_mem_usage=True,
	            token=hf_token,
	        )
	        # The global is only assigned once the model has been moved to the CPU.
	        model = loaded.to("cpu")
	        model.eval()
	        print("Model loaded.")

	def strip_thinking(text):
	    """Remove ``<think>...</think>`` blocks from Qwen3 output.

	    Every span from an opening ``<think...>`` tag to the nearest closing
	    ``</think...>`` tag is deleted, including spans that cover several lines.
	    The remaining text is returned with leading and trailing whitespace
	    stripped.

	    Args:
	        text: Decoded model output.

	    Returns:
	        ``text`` without any complete think blocks.
	    """
	    # ``[^>]*`` tolerates extra characters inside either tag; ``.*?`` is
	    # non-greedy so two separate blocks are not merged into one match, and
	    # DOTALL lets the body span newlines.
	    return re.sub(
	        r'<think[^>]*>.*?</think[^>]*>', '', text, flags=re.DOTALL
	    ).strip()

	def normalize_content(msg):
	    """Flatten a message whose ``content`` is a list into a plain string.

	    Messages whose ``content`` is not a list are returned unchanged (the very
	    same object). For list content, the ``"text"`` value of each dict item
	    that has one and every bare string item are joined with single spaces;
	    any other item is dropped. The result is a new dict holding only
	    ``role`` and ``content``.
	    """
	    content = msg.get("content")
	    if not isinstance(content, list):
	        return msg

	    texts = [
	        piece["text"] if isinstance(piece, dict) else piece
	        for piece in content
	        if (isinstance(piece, dict) and "text" in piece) or isinstance(piece, str)
	    ]
	    return {"role": msg["role"], "content": " ".join(texts)}

	def chat_response(message, history):
	    """Generate the assistant's reply to ``message`` and return the updated chat.

	    Args:
	        message: Text of the new user turn.
	        history: Earlier messages as dicts with ``role`` and ``content``;
	            list-valued content is flattened with ``normalize_content``.

	    Returns:
	        A new list: the normalized history, the new user message, and the
	        assistant message holding the generated reply.
	    """
	    load_model()

	    # Flatten list-style content in earlier turns, then add the new user turn.
	    messages = [normalize_content(turn) for turn in history]
	    messages.append({"role": "user", "content": message})

	    prompt = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	        enable_thinking=False,  # disable thinking mode
	    )
	    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
	    prompt_length = len(inputs.input_ids[0])

	    sampling = {
	        "max_new_tokens": 512,
	        "do_sample": True,
	        "temperature": 0.7,
	        "top_p": 0.9,
	    }
	    with torch.no_grad():
	        generated = model.generate(**inputs, **sampling)

	    # Keep only the tokens produced after the prompt.
	    new_tokens = generated[0][prompt_length:]
	    reply = tokenizer.decode(new_tokens, skip_special_tokens=True)

	    # Fallback: strip any think blocks if they still appear
	    reply = strip_thinking(reply)

	    return [*messages, {"role": "assistant", "content": reply}]

	# ============================================================
	# Gradio Interface
	# ============================================================
	with gr.Blocks(title="Qwen3-0.6B Fast Chat") as demo:
	    # Page header.
	    gr.Markdown("""
	# ⚡ Qwen3-0.6B – Fast Chat
	Small and fast model. Great for quick answers on CPU.
	""")

	    # Widgets: conversation view, input box, and a reset button.
	    chatbot = gr.Chatbot(label="Conversation")
	    msg = gr.Textbox(
	        label="Your Message",
	        placeholder="Type your message and press Enter...",
	    )
	    clear = gr.Button(value="Clear Conversation")

	    # Submitting the textbox updates the chat, then empties the textbox.
	    msg.submit(
	        fn=chat_response,
	        inputs=[msg, chatbot],
	        outputs=chatbot,
	        concurrency_limit=3,
	    ).then(
	        fn=lambda: "",
	        inputs=None,
	        outputs=msg,
	    )

	    # The button resets the conversation to an empty list.
	    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)

	if __name__ == "__main__":
	    # Enable the queue with a default concurrency limit of 3, then start the
	    # server listening on 0.0.0.0.
	    demo.queue(default_concurrency_limit=3).launch(server_name="0.0.0.0")