"""Hugging Face Space: professional coding assistant backed by Qwen3-4B Thinking.

Loads the tokenizer/model once at import time (standard Spaces pattern) and
exposes a single-question Gradio UI with deterministic (greedy) generation.
"""

import re
import time

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI-Full"
MAX_INPUT_TOKENS = 4096   # truncation limit applied to the tokenized prompt
MAX_NEW_TOKENS = 4096     # hard upper bound on generated tokens

# ---------------- Model ----------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # fp16 on GPU, fp32 on CPU (fp16 is slow/unsupported on most CPUs)
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# Important for Qwen: some checkpoints ship without a pad token, and
# generate() needs pad_token_id to be defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model loaded successfully")

SYSTEM_PROMPT = """You are a professional AI Coding Assistant. Your responses must be:
- Clear and concise
- Well-structured with headings and bullet points
- Technically accurate
- Written in a formal, professional tone
- Focused on best practices and production-quality code
"""

# ---------------- Helper ----------------

# Compiled once; DOTALL so the reasoning block may span multiple lines.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", flags=re.DOTALL)


def strip_thinking(text: str) -> str:
    """Remove Qwen ``<think>...</think>`` reasoning blocks from *text*.

    BUG FIX: the previous pattern was ``r".*?"`` — the tag literals had been
    lost — which matches only empty strings, so nothing was ever stripped.
    """
    return _THINK_BLOCK_RE.sub("", text).strip()


# ---------------- Inference ----------------
@spaces.GPU()
def generate_answer(question, max_tokens):
    """Generate a deterministic answer to *question*.

    Parameters
    ----------
    question : str
        User question; blank input short-circuits with a prompt message.
    max_tokens : int | float
        Requested generation budget, clamped to ``MAX_NEW_TOKENS``.

    Returns
    -------
    str
        Model response with any <think> block removed, or an error message.
    """
    print("\n================ GENERATE ANSWER START ================")

    if not question or not question.strip():
        return "Please enter a valid question."

    try:
        start_time = time.time()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question.strip()},
        ]

        # ✅ New Qwen chat template
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        ).to(model.device)

        input_token_count = inputs.input_ids.shape[-1]
        print(f"Input tokens: {input_token_count}")

        max_tokens = min(int(max_tokens), MAX_NEW_TOKENS)
        print(f"Final max_new_tokens: {max_tokens}")

        print("🚀 Starting generation...")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,  # Deterministic greedy decoding.
                # NOTE: temperature is intentionally not passed — sampling
                # parameters are ignored (and warned about) when
                # do_sample=False in recent transformers versions.
                repetition_penalty=1.05,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("✅ Generation finished")

        # Keep only the newly generated continuation, not the echoed prompt.
        generated_tokens = output[0][input_token_count:]
        response = tokenizer.decode(
            generated_tokens,
            skip_special_tokens=True,
        )

        # 🔥 Remove thinking block
        response = strip_thinking(response)

        print(f"Generated tokens: {generated_tokens.shape[-1]}")
        print(f"⏱ Total time: {time.time() - start_time:.2f} sec")
        print("================ GENERATE ANSWER END ==================\n")

        return response if response else "No output generated."

    except Exception as e:
        import traceback
        traceback.print_exc()
        # Best effort: free GPU memory after a failed generation so the
        # next request is not starved.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error occurred: {str(e)}"


# ---------------- UI ----------------
APP_CSS = """
.gradio-container { max-width: 900px !important; margin: auto; }
textarea { font-size: 14px !important; }
"""

# BUG FIX: theme/css were previously passed to demo.launch(), which accepts
# neither keyword (TypeError at startup) — they belong on gr.Blocks().
with gr.Blocks(theme=gr.themes.Soft(), css=APP_CSS) as demo:
    gr.Markdown(
        """
# 🤖 Professional Coding Assistant
**Powered by Qwen3-4B Thinking**

Optimized for:
- ⚡ Stable GPU inference
- 🧠 Deterministic responses
- 💻 Production-quality code
"""
    )

    question = gr.Textbox(
        label="Your Question",
        placeholder="Explain Quick Sort with complexity and a Python example",
        value="write a python code using pytorch for a simple neural network demo",
        lines=4,
    )

    answer = gr.Markdown(label="AI Response", elem_id="answer_box")

    max_tokens = gr.Slider(
        64, 4096, value=2048, step=32, label="Max New Tokens"
    )

    with gr.Row():
        submit = gr.Button("Generate Answer", variant="primary")
        copy_btn = gr.Button("📋 Copy Response")
        clear = gr.Button("Clear")

    submit.click(
        fn=generate_answer,
        inputs=[question, max_tokens],
        outputs=answer,
    )

    clear.click(
        fn=lambda: ("", ""),
        outputs=[question, answer],
    )

    # Pure client-side copy: no server round-trip needed.
    copy_btn.click(
        fn=None,
        js="""
        () => {
            const el = document.querySelector('#answer_box');
            navigator.clipboard.writeText(el.innerText);
        }
        """,
    )

demo.launch()