# Hugging Face Space (scraped page header said "Spaces: Running on Zero",
# i.e. this app is deployed on ZeroGPU hardware).
# NOTE(review): on ZeroGPU Spaces, `spaces` is conventionally imported
# before torch so GPU initialization is patched correctly — kept first.
import spaces

import re
import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model repository and token budgets.
MODEL_NAME = "rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI-Full"
MAX_INPUT_TOKENS = 4096   # prompt is truncated to this many tokens
MAX_NEW_TOKENS = 4096     # hard cap on generated tokens
# ---------------- Model ----------------
# Loads tokenizer and model once at import time (standard for a Space:
# the download happens during startup, not per request).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # fp16 on GPU; fp32 on CPU, where half precision is poorly supported.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

# Qwen tokenizers may ship without a pad token; generate() needs one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# BUG FIX: the status string contained mojibake ("β", a mis-encoded emoji).
print("Model loaded successfully")
# System prompt prepended to every request; steers tone and structure.
SYSTEM_PROMPT = """You are a professional AI Coding Assistant.
Your responses must be:
- Clear and concise
- Well-structured with headings and bullet points
- Technically accurate
- Written in a formal, professional tone
- Focused on best practices and production-quality code
"""
# ---------------- Helper ----------------
def strip_thinking(text: str) -> str:
    """Remove Qwen "thinking" traces from a decoded completion.

    The original version only removed complete ``<think>...</think>``
    pairs. This also handles two shapes that occur in practice:

    - a dangling ``...</think>`` with no opening tag (Qwen3-Thinking
      models frequently emit only the closing tag),
    - an unterminated ``<think>...`` tail when generation was cut off
      by the token limit.
    """
    # Complete, possibly multi-line reasoning blocks anywhere in the text.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # Closing tag with no opener: keep only what follows the last one.
    if "</think>" in cleaned:
        cleaned = cleaned.rsplit("</think>", 1)[-1]
    # Opener with no closer (truncated generation): drop the tail.
    cleaned = re.sub(r"<think>.*\Z", "", cleaned, flags=re.DOTALL)
    return cleaned.strip()
# ---------------- Inference ----------------
# BUG FIX: `spaces` was imported but the inference function was never
# decorated. On a ZeroGPU Space ("Running on Zero"), @spaces.GPU is what
# requests a GPU for the duration of the call — without it the model
# runs on CPU or fails to get hardware. TODO confirm against the Space's
# hardware settings.
@spaces.GPU
def generate_answer(question, max_tokens):
    """Generate a professional answer to a coding question.

    Args:
        question: the user's question (str); blank input short-circuits
            with a prompt to enter something valid.
        max_tokens: requested completion budget from the UI slider;
            clamped to the range [1, MAX_NEW_TOKENS].

    Returns:
        The model's answer with any <think> reasoning stripped, or a
        human-readable error string (the Gradio UI renders either).
    """
    print("\n================ GENERATE ANSWER START ================")

    if not question or not question.strip():
        return "Please enter a valid question."

    try:
        start_time = time.time()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question.strip()},
        ]

        # Render the conversation with the model's own chat template.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        ).to(model.device)

        input_token_count = inputs.input_ids.shape[-1]
        print(f"Input tokens: {input_token_count}")

        # Clamp the slider value: at least 1 token, at most the hard cap.
        max_tokens = max(1, min(int(max_tokens), MAX_NEW_TOKENS))
        print(f"Final max_new_tokens: {max_tokens}")

        print("Starting generation...")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,  # deterministic greedy decoding
                # BUG FIX: temperature=0.0 removed — transformers warns
                # (and ignores / rejects) sampling parameters when
                # do_sample=False.
                repetition_penalty=1.05,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("Generation finished")

        # Decode only the newly generated tokens, not the echoed prompt.
        generated_tokens = output[0][input_token_count:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Drop the model's <think> reasoning before display.
        response = strip_thinking(response)

        print(f"Generated tokens: {generated_tokens.shape[-1]}")
        print(f"Total time: {time.time() - start_time:.2f} sec")
        print("================ GENERATE ANSWER END ==================\n")

        return response if response else "No output generated."

    except Exception as e:
        # Top-level UI boundary: log the traceback, release cached GPU
        # memory, and surface a readable message instead of crashing.
        import traceback
        traceback.print_exc()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error occurred: {str(e)}"
# ---------------- UI ----------------
# BUG FIX: `theme` and `css` are constructor arguments of gr.Blocks, not
# keyword arguments of demo.launch() — passing them to launch() raises a
# TypeError (or is silently ignored, depending on the Gradio version).
# They are now supplied where Gradio actually accepts them. Mojibake
# emoji ("π€", "β‘", "π", ...) removed from user-facing strings.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { max-width: 900px !important; margin: auto; }
    textarea { font-size: 14px !important; }
    """,
) as demo:
    gr.Markdown(
        """
# Professional Coding Assistant
**Powered by Qwen3-4B Thinking**

Optimized for:
- Stable GPU inference
- Deterministic responses
- Production-quality code
"""
    )

    question = gr.Textbox(
        label="Your Question",
        placeholder="Explain Quick Sort with complexity and a Python example",
        value="write a python code using pytorch for a simple neural network demo",
        lines=4,
    )

    # elem_id is targeted by the copy button's JS below.
    answer = gr.Markdown(label="AI Response", elem_id="answer_box")

    max_tokens = gr.Slider(
        64, 4096, value=2048, step=32, label="Max New Tokens"
    )

    with gr.Row():
        submit = gr.Button("Generate Answer", variant="primary")
        copy_btn = gr.Button("Copy Response")
        clear = gr.Button("Clear")

    submit.click(
        fn=generate_answer,
        inputs=[question, max_tokens],
        outputs=answer,
    )

    # Reset both the question box and the rendered answer.
    clear.click(
        fn=lambda: ("", ""),
        outputs=[question, answer],
    )

    # Pure-JS handler (fn=None): copies the rendered answer text to the
    # clipboard entirely client-side.
    copy_btn.click(
        fn=None,
        js="""
        () => {
            const el = document.querySelector('#answer_box');
            navigator.clipboard.writeText(el.innerText);
        }
        """,
    )

demo.launch()